In [10]:
from openai import OpenAI

# Read the OpenAI API key from token.txt.
with open('token.txt', 'r') as file:
    # Remove *all* whitespace, not just '\n': a file saved with Windows line
    # endings or a trailing space would otherwise leave '\r' / ' ' inside the
    # key. split() + join drops every whitespace run, which also covers the
    # newline removal done previously.
    token = "".join(file.read().split())

# NOTE: `openai` is the client instance (only the OpenAI class is imported, not
# the package); get_first_response looks the client up under this name.
openai = OpenAI(api_key=token)

def get_first_response(prompt):
    """
    Send a chat conversation to the model and return the text of its first reply.

    `prompt` is passed through unchanged as the `messages` argument of the
    chat-completions call made on the module-level `openai` client. The request
    always uses model "gpt-4o" with temperature=0 and top_p=0.1.

    Returns the `content` of the first entry in the response's `choices`.
    """
    completion = openai.chat.completions.create(
        messages=prompt,
        model="gpt-4o",
        temperature=0,
        top_p=0.1,
    )
    # Only the first choice is ever used; any further choices are ignored.
    first_choice = completion.choices[0]
    return first_choice.message.content

# Smoke test: one system turn plus one user question, to confirm that the API
# key and the client work before running the real classification.
get_first_response([
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of the United States?"},
])

'The capital of the United States is Washington, D.C.'

In [12]:
from typing import List

def is_commit_message_good(
    commit_message: str
) -> bool:
    """
    Guide GPT to determine whether commit messages are 'good' based on Yuxia's taxonomy
    of good commit messages (https://arxiv.org/pdf/2202.02974). In short, a commit message
    is considered good if it describes 'what' and 'why'.

    Args:
        commit_message: The commit message text; it is sent unchanged as the user turn.

    Returns:
        True only when the model's reply, after normalization, is "yes".
        A "no" reply and any reply that is not recognizably yes/no both give False.

    Raises:
        Anything raised by get_first_response propagates to the caller.
        NOTE(review): if the reply is None rather than a string, `.strip()` raises
        AttributeError — confirm whether callers want that or a False verdict.
    """
    prompt = [
        {
            "role": "system",
            # Adjacent string literals are concatenated as-is, so every sentence
            # needs its own trailing space or newline to avoid running into the next.
            "content": (
                "You will be provided with a commit message. "
                "From the provided data, determine if the commit message is of high quality and relevant to the project. "
                "A good commit message describes 'what' (summarize the change and the design decisions) "
                "and 'why' (what is the problem, what is the goal of the change, and why the change is necessary). "
                "\nExamples of good commit messages: "
                "\n- Remove outdated key. `aggregate-key-pattern` is no longer defined but was still referenced in the documentation."
                "\n- Fix concurrent problem of zookeeper configcenter, wait to start until cache being fully populated."
                "\n- Polish pom.xml. Apply consistent formatting, drop JDK 8 support and cleanup repo."
                "\nExamples of bad commit messages: "
                "\n- Update README.md"
                "\n- Fix bug"
                "\n- A lot of changes"
                "\n\nYour response must be a 'yes' or 'no' answer."
                "\n\nDo not format your response in anyway."
            ),
        },
        {"role": "user", "content": commit_message},
    ]
    _response = get_first_response(prompt)
    # The model is asked for a bare yes/no, but replies such as "Yes.", "'yes'"
    # or "No\n" still carry an unambiguous verdict. Drop surrounding whitespace,
    # quotes and end punctuation before comparing so they are not all counted
    # as "not good".
    answer = _response.strip().strip("'\"`.!").strip().lower()
    # Anything other than an explicit "yes" (including "no") is treated as not good.
    return answer == "yes"

# Spot-check the classifier on a single terse message before the full run.
is_commit_message_good("log message")

False

In [15]:
import pandas as pd

# Load the sampled commit messages; the first CSV column becomes the row index.
df = pd.read_csv('sampled messages.csv', index_col=0)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0_level_0,repo_id,label,url,if_mulit_commit,message,new_message1,authorName,authorEmail,commitDate,committerName,committerEmail,committerDate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1,0,https://github.com/junit-team/junit4/commit/12...,,Mark ThreadGroups created by FailOnTimeout as ...,Mark ThreadGroups created by <file_name> as ...,Kevin Cooney,kcooney@users.noreply.github.com,2021-01-02T22:09:07Z,GitHub,noreply@github.com,2021-01-02T22:09:07Z
27,1,0,https://github.com/junit-team/junit4/commit/43...,,Use Google's Maven Central mirror <enter> <en...,Use Google's <iden> mirror <enter> We are usi...,Stefan Birkner,mail@stefan-birkner.de,2020-02-07T20:35:25Z,Marc Philipp,marc@gradle.com,2020-02-19T10:13:01Z
38,1,3,https://github.com/junit-team/junit4/commit/6c...,,Remove reference to obsolete mailing list,Remove reference to obsolete mailing list,Marc Philipp,mail@marcphilipp.de,2019-11-02T13:05:11Z,Marc Philipp,mail@marcphilipp.de,2019-11-02T13:05:11Z
56,1,0,https://github.com/junit-team/junit4/commit/46...,,Deprecate Assert#assertThat <enter> <enter> T...,Deprecate <method_name> <enter> The method <i...,Stefan Birkner,mail@stefan-birkner.de,2019-09-26T20:24:47Z,Stefan Birkner,mail@stefan-birkner.de,2019-10-13T18:24:00Z
66,1,0,https://github.com/junit-team/junit4/commit/8a...,,Revert 'ExternalResource: declare after() to t...,Revert ' <file_name> : declare after() to thro...,Alex Panchenko,alex.panchenko@gmail.com,2019-04-03T15:20:00Z,Marc Philipp,marc@gradle.com,2019-04-04T09:22:28Z
...,...,...,...,...,...,...,...,...,...,...,...,...
29726,5,2,https://github.com/spring-projects/spring-boot...,,Fix checkstyle errors,Fix checkstyle errors,Phillip Webb,pwebb@pivotal.io,2017-03-07T01:37:09Z,Phillip Webb,pwebb@pivotal.io,2017-03-07T01:37:09Z
29742,5,0,https://github.com/spring-projects/spring-boot...,,Remove dependency management for `hibernate-en...,Remove dependency management for `hibernate-en...,Vedran Pavic,vedran.pavic@gmail.com,2017-02-28T07:17:55Z,Stephane Nicoll,snicoll@pivotal.io,2017-03-02T17:31:53Z
29756,5,0,https://github.com/spring-projects/spring-boot...,,Support reactive web servers with LocalServerP...,Support reactive web servers with LocalServerP...,Brian Clozel,bclozel@pivotal.io,2017-02-22T13:29:59Z,Brian Clozel,bclozel@pivotal.io,2017-02-24T12:03:11Z
29769,5,0,https://github.com/spring-projects/spring-boot...,,Upgrade to Rxjava2 2.0.6 <enter> <enter> Clos...,Upgrade to Rxjava2 <version> <enter> Closes <...,Andy Wilkinson,awilkinson@pivotal.io,2017-02-22T15:06:16Z,Andy Wilkinson,awilkinson@pivotal.io,2017-02-22T15:06:16Z


In [16]:
from tqdm import tqdm

# Classify every commit message with the LLM judge.
# Verdicts are gathered in a list and attached to the frame in one assignment
# afterwards, so `df` is never modified while it is being iterated over (pandas
# advises against that) and only the 'message' column is read instead of
# materializing every full row.
verdicts = []
for message in tqdm(df['message'], total=df.shape[0]):
    try:
        verdicts.append(is_commit_message_good(message))
    except Exception as e:
        # Best effort: one failed request must not abort the whole run.
        # The row is recorded as None (undecided) rather than True/False.
        print(f"Error: {e}")
        verdicts.append(None)

# Exactly one verdict was appended per row, in row order, so positional
# assignment lines up with df.
df['is_good'] = verdicts

100%|██████████| 1649/1649 [13:25<00:00,  2.05it/s] 


In [17]:
# Persist the frame, including the 'is_good' column added by the classification
# loop, to is_good.csv.
df.to_csv('is_good.csv')

In [18]:
# Rows the model judged good. The explicit `== True` comparison builds a strict
# boolean mask even though 'is_good' can hold None for rows whose API call failed.
df[df['is_good'] == True]

Unnamed: 0_level_0,repo_id,label,url,if_mulit_commit,message,new_message1,authorName,authorEmail,commitDate,committerName,committerEmail,committerDate,is_good
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,1,0,https://github.com/junit-team/junit4/commit/12...,,Mark ThreadGroups created by FailOnTimeout as ...,Mark ThreadGroups created by <file_name> as ...,Kevin Cooney,kcooney@users.noreply.github.com,2021-01-02T22:09:07Z,GitHub,noreply@github.com,2021-01-02T22:09:07Z,True
27,1,0,https://github.com/junit-team/junit4/commit/43...,,Use Google's Maven Central mirror <enter> <en...,Use Google's <iden> mirror <enter> We are usi...,Stefan Birkner,mail@stefan-birkner.de,2020-02-07T20:35:25Z,Marc Philipp,marc@gradle.com,2020-02-19T10:13:01Z,True
38,1,3,https://github.com/junit-team/junit4/commit/6c...,,Remove reference to obsolete mailing list,Remove reference to obsolete mailing list,Marc Philipp,mail@marcphilipp.de,2019-11-02T13:05:11Z,Marc Philipp,mail@marcphilipp.de,2019-11-02T13:05:11Z,True
56,1,0,https://github.com/junit-team/junit4/commit/46...,,Deprecate Assert#assertThat <enter> <enter> T...,Deprecate <method_name> <enter> The method <i...,Stefan Birkner,mail@stefan-birkner.de,2019-09-26T20:24:47Z,Stefan Birkner,mail@stefan-birkner.de,2019-10-13T18:24:00Z,True
66,1,0,https://github.com/junit-team/junit4/commit/8a...,,Revert 'ExternalResource: declare after() to t...,Revert ' <file_name> : declare after() to thro...,Alex Panchenko,alex.panchenko@gmail.com,2019-04-03T15:20:00Z,Marc Philipp,marc@gradle.com,2019-04-04T09:22:28Z,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29594,5,0,https://github.com/spring-projects/spring-boot...,,Align with Spring Framework 5's new Commons Lo...,Align with <iden> Framework 5's new Commons <i...,Andy Wilkinson,awilkinson@pivotal.io,2017-04-06T12:22:53Z,Andy Wilkinson,awilkinson@pivotal.io,2017-04-06T12:22:53Z,True
29674,5,0,https://github.com/spring-projects/spring-boot...,,Align with API changes in latest Spring Data K...,Align with API changes in latest Spring <iden>...,Andy Wilkinson,awilkinson@pivotal.io,2017-03-24T11:26:27Z,Andy Wilkinson,awilkinson@pivotal.io,2017-03-24T11:26:27Z,True
29742,5,0,https://github.com/spring-projects/spring-boot...,,Remove dependency management for `hibernate-en...,Remove dependency management for `hibernate-en...,Vedran Pavic,vedran.pavic@gmail.com,2017-02-28T07:17:55Z,Stephane Nicoll,snicoll@pivotal.io,2017-03-02T17:31:53Z,True
29756,5,0,https://github.com/spring-projects/spring-boot...,,Support reactive web servers with LocalServerP...,Support reactive web servers with LocalServerP...,Brian Clozel,bclozel@pivotal.io,2017-02-22T13:29:59Z,Brian Clozel,bclozel@pivotal.io,2017-02-24T12:03:11Z,True


In [19]:
# Rows whose `label` is 3 — the value the precision/recall cell below uses as
# the positive ground-truth class.
df[df['label'] == 3]

Unnamed: 0_level_0,repo_id,label,url,if_mulit_commit,message,new_message1,authorName,authorEmail,commitDate,committerName,committerEmail,committerDate,is_good
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
38,1,3,https://github.com/junit-team/junit4/commit/6c...,,Remove reference to obsolete mailing list,Remove reference to obsolete mailing list,Marc Philipp,mail@marcphilipp.de,2019-11-02T13:05:11Z,Marc Philipp,mail@marcphilipp.de,2019-11-02T13:05:11Z,True
97,1,3,https://github.com/junit-team/junit4/commit/e2...,,Change links to point directly to junit4/5 repos,Change links to point directly to junit4/5 repos,Marc Philipp,mail@marcphilipp.de,2018-11-25T18:48:18Z,Marc Philipp,mail@marcphilipp.de,2018-11-25T18:48:18Z,True
112,1,3,https://github.com/junit-team/junit4/commit/3b...,,Use new logo and link to new website,Use new logo and link to new website,Marc Philipp,mail@marcphilipp.de,2016-04-18T19:15:59Z,Marc Philipp,mail@marcphilipp.de,2016-04-18T19:15:59Z,False
131,1,3,https://github.com/junit-team/junit4/commit/78...,,Publish from Travis <enter> <enter> Remove 11...,Publish from Travis <enter> Remove 11 <enter>...,Marc Philipp,mail@marcphilipp.de,2018-09-30T18:47:01Z,Marc Philipp,mail@marcphilipp.de,2018-09-30T19:29:05Z,False
171,1,3,https://github.com/junit-team/junit4/commit/8e...,,Add siili,Add <file_name>,Marc Philipp,mail@marcphilipp.de,2015-10-07T17:32:12Z,Marc Philipp,mail@marcphilipp.de,2015-10-07T17:32:12Z,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25494,5,3,https://github.com/spring-projects/spring-boot...,,Make fields private where possible,Make fields private where possible,Phillip Webb,pwebb@gopivotal.com,2014-01-22T00:21:00Z,Phillip Webb,pwebb@gopivotal.com,2014-01-22T00:29:07Z,False
25719,5,3,https://github.com/spring-projects/spring-boot...,,Reduce logging in Travis,Reduce logging in Travis,Phillip Webb,pwebb@gopivotal.com,2014-01-16T21:37:19Z,Phillip Webb,pwebb@gopivotal.com,2014-01-16T21:37:19Z,False
27076,5,3,https://github.com/spring-projects/spring-boot...,,Add SpringApplicationErrorHandler <enter> <en...,Add <file_name> <enter> [#48055575] [bs-31] A...,Dave Syer,dsyer@gopivotal.com,2013-11-04T16:00:48Z,Dave Syer,dsyer@gopivotal.com,2013-11-04T17:07:04Z,True
27983,5,3,https://github.com/spring-projects/spring-boot...,,Polish spring-boot-loader-tools <enter> <ente...,Polish spring-boot-loader-tools <enter> Issue...,Phillip Webb,pwebb@vmware.com,2013-07-30T07:04:20Z,Phillip Webb,pwebb@vmware.com,2013-07-30T07:06:35Z,False


In [21]:
# Precision and recall of the model's verdict ('is_good') against the dataset
# label, with label == 3 taken as the positive class.
# Rows where 'is_good' is None (failed API calls) match neither `== True` nor
# `== False`, so they are left out of every count.
# NOTE(review): in the rows displayed earlier, the long messages that explain
# both what and why carry label 0, while label 3 rows are terse one-liners —
# confirm against the dataset's label legend that 3 is really the "good" class.
predicted_good = df['is_good'] == True
predicted_bad = df['is_good'] == False
labelled_good = df['label'] == 3

# int() keeps these as plain Python ints, so the ratios below are plain floats.
true_positives = int((predicted_good & labelled_good).sum())
false_positives = int((predicted_good & ~labelled_good).sum())
false_negatives = int((predicted_bad & labelled_good).sum())

predicted_positive_total = true_positives + false_positives
actual_positive_total = true_positives + false_negatives

precision = true_positives / predicted_positive_total
recall = true_positives / actual_positive_total

precision, recall

(0.23575129533678757, 0.4252336448598131)