In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
# Discover every attribute name used by the direct children of the root, then
# load the file. Some rows have an incomplete attribute list, so all rows need
# to be scanned before the columns are known.
def read_large_xml(filename):
    """Load a row-oriented XML file into a DataFrame.

    The file is streamed twice: a first pass collects the union of attribute
    names found on the direct children of the root element, and a second pass
    hands that list to ``pd.read_xml`` through its ``iterparse`` option so the
    ``row`` elements are read without building the whole tree.

    Parameters
    ----------
    filename : str or path-like
        Path of the XML file on disk.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_xml`` builds for the ``row`` elements, with the
        discovered attribute names requested in sorted order.
    """
    attr = set()
    depth = 0
    root = None
    # Stream the document instead of ET.parse() so the full tree never sits in memory.
    for event, elem in ET.iterparse(filename, events=("start", "end")):
        if event == "start":
            if root is None:
                root = elem
            depth += 1
            continue
        depth -= 1
        # depth == 1 after closing an element means it was a direct child of the root
        if depth == 1:
            attr.update(elem.attrib)
            # drop the finished child so memory use stays flat on large files
            root.clear()
    # sorted() gives a reproducible column order; plain set iteration order can vary per run
    return pd.read_xml(filename, iterparse={"row": sorted(attr)})

# Load the two XML files used by the rest of the notebook: posts first, then comments.
posts_df = read_large_xml(filename="Posts.xml")
comments_df = read_large_xml(filename="Comments.xml")

Separating Parent Posts (Questions) and Answers (Programs)

In [None]:
# extract parent posts (rows without a ParentId)
has_parent = posts_df["ParentId"].notnull()
parent_posts_df = posts_df[~has_parent]
# extract answers (rows that point at a parent post)
answer_posts_df = posts_df[has_parent]
# need only python answers so filtering for Python in the answer body, Most usual format for Code Golf
# na=False treats a missing Body as "no match" instead of putting NaN in the boolean mask
# (which boolean indexing rejects); regex=False because "Python" is a plain substring.
mentions_python = answer_posts_df["Body"].str.contains("Python", regex=False, na=False)
# reset_index() keeps the old index as an "index" column
python_posts_df = answer_posts_df[mentions_python].reset_index()

In [None]:
import cleantext
from bs4 import BeautifulSoup
# using cleantext and beautiful soup to remove HTML tags, links and other special characters from answer body to use it as a prompt for LLMs
p_pattern = r'(?s)<p>(.*?)</p>'
code_pattern = r'(?s)<code>(.*?)</code>'


def _html_to_clean_text(fragment):
    """Strip HTML from one fragment and normalise it with cleantext (missing values become "")."""
    # pd.isna covers None / NaN / pd.NA, unlike an identity check against np.nan
    html = "" if pd.isna(fragment) else fragment
    return cleantext.clean(BeautifulSoup(html, 'html.parser').get_text(), no_urls=True)


# Take the first <p>...</p> of each body and remove inline <code> snippets from it.
# regex=True is required: without it newer pandas treats the pattern as literal text
# and the <code> blocks are left in place.
python_posts_df["Text"] = (
    python_posts_df["Body"]
    .str.extract(p_pattern)[0]
    .str.replace(code_pattern, '', regex=True)
    .apply(_html_to_clean_text)
)

In [None]:
def concat_long_strings(series):
    """Join every run of 11+ word characters found in `series` with commas.

    Returns np.nan when no element of the series contains such a run.
    """
    matches_per_row = series.str.findall(r'\w{11,}')
    # explode() flattens the per-row lists; rows without matches become NaN and are dropped
    flattened = matches_per_row.explode().dropna()
    return np.nan if flattened.empty else ','.join(flattened)
    
# extract the python code
# a body can hold several <code></code> tags; snippets of 10 characters or fewer are
# skipped (most likely explaining the following code) and the rest are joined with commas.
code_pattern = r'(?s)<code>(.*?)</code>'


def _join_code_snippets(snippets):
    """Comma-join the snippets longer than 10 characters; np.nan if every snippet is null."""
    if snippets.isnull().all():
        return np.nan
    return ','.join(snippets[snippets.str.len() > 10])


python_posts_df["Code"] = (
    python_posts_df["Body"]
    .str.extractall(code_pattern)
    .groupby(level=0)[0]
    .apply(_join_code_snippets)
)
# bodies without any <code> tag have no entry above and end up as NaN: drop those rows (around 200)
python_posts_df = python_posts_df.dropna(subset=["Code"])

In [None]:
# calculate the Bytes, we could extract bytes from body, but some didnt have it
# the usage of calculating bytes is so that we could produce git diff between a code that is large and code that is small (essence of Code Golf) for the same problem
# Vectorised length instead of a Python loop, and no module-level list named `bytes`
# shadowing the builtin type.
# NOTE: this is the number of characters in the extracted code, which equals the
# byte count only for single-byte (ASCII) programs.
python_posts_df["ByteCount"] = python_posts_df["Code"].str.len()

In [None]:
# Keep programs whose ByteCount is strictly between 10 and 1000: anything above 1000
# is usually text written inside the <code></code>
python_posts_df = python_posts_df.loc[
    lambda frame: frame["ByteCount"].gt(10) & frame["ByteCount"].lt(1000)
]

In [None]:
# removing username mentioned in comments
def clean_text(string):
    """Normalise one comment with cleantext, with URL replacement enabled (no_urls=True).

    Missing values (None / NaN) are cleaned as an empty string. If cleantext
    raises, the offending value is printed and an empty string is returned so
    that every result is a str and can be joined later.
    """
    try:
        return cleantext.clean("" if pd.isna(string) else string, no_urls=True)
    except Exception:
        # best effort: report the text that failed, but never hand back None
        print(string)
        return ""
# regex=True is required for the @mention pattern: without it newer pandas treats the
# pattern as literal text and no mention is ever removed.
comments_df["Cleaned_Text"] = comments_df["Text"].str.replace(r'@\w+\s?', '', regex=True).apply(clean_text)
        

In [None]:
# creating PostId:Comment mapping to later integrate with the main DataFrame and query a LLM to generate the commit message
# Group by the plain column name: grouping by the one-element list ["PostId"] makes newer
# pandas yield 1-tuples as keys, which would never match the scalar post ids looked up later.
postid_mapping = {
    postid: "\n".join(group["Cleaned_Text"].values)
    for postid, group in comments_df.groupby("PostId")
}

In [None]:
# Attach the joined comments of each answer by looking its Id up in postid_mapping;
# answers without an entry in the mapping get NaN.
python_posts_df.loc[:, "Comments"] = python_posts_df["Id"].map(postid_mapping)

In [None]:
# Clean the dataframe: drop the columns that are not used in the remaining cells
python_posts_df = python_posts_df.drop(
    columns=[
        'index', 'LastActivityDate', 'LastEditDate', 'ContentLicense',
        'OwnerUserId', 'ViewCount', 'CreationDate', 'AnswerCount', 'PostTypeId',
        'CommentCount', 'LastEditorDisplayName', 'ClosedDate', 'Score',
        'AcceptedAnswerId', 'LastEditorUserId', 'Title', 'Tags',
        'OwnerDisplayName', 'CommunityOwnedDate', 'FavoriteCount',
    ]
)

In [None]:
import traceback
# Generate diffs using the unified diff format
# Marker line emitted after a final line that has no trailing newline.
# Raw string: keeps the literal backslash without relying on the invalid "\ " escape.
_no_eol = r"\ No newline at end of file"
def generate_diffs(row):
    """Return the unified diff (no context lines) from row["Code"] to row["Code_shifted"].

    Parameters
    ----------
    row : mapping
        Must provide string values under the keys "Code" and "Code_shifted".

    Returns
    -------
    str
        The hunk headers and +/- lines joined together, without the two
        "---" / "+++" file header lines. Every diff line that lacks a trailing
        newline is followed by the "\\ No newline at end of file" marker.
        An empty string when both texts are identical.
    """
    import difflib
    from itertools import islice

    diff_lines = difflib.unified_diff(
        row["Code"].splitlines(True),
        row["Code_shifted"].splitlines(True),
        n=0,
    )
    # Skip the "---" / "+++" header pair; islice is a no-op when the diff is empty.
    hunk_lines = islice(diff_lines, 2, None)
    # adding the _no_eol marker after any line that didnt end with a \n character
    return ''.join(
        line if line.endswith('\n') else line + '\n' + _no_eol + '\n'
        for line in hunk_lines
    )

# One frame of diffs per question; they are concatenated once after the loop
# instead of growing diff_df inside it (repeated concat is quadratic).
diff_frames = []

for parentId, group in python_posts_df.groupby("ParentId"):
    # For every question: sort its answers by ByteCount (largest first), shift a copy one
    # row down and put it next to the original, so each row pairs an answer (plain columns)
    # with the next larger answer (`*_shifted` columns); then diff every pair.
    # NOTE(review): generate_diffs diffs from `Code` (the smaller program of a pair) to
    # `Code_shifted` (the larger one), while the intent stated in this notebook is
    # "big program to small program" - confirm the intended direction.
    try:
        # sort_values returns a new frame, so the groupby slice is never mutated in place
        group = group.sort_values(["ByteCount"], ascending=False)
        if len(group) == 1:
            # only one python answer to the question: no shift, diff it against an empty string
            df_concat = group.rename(columns=lambda x: f'{x}_shifted')
            df_concat["Code"] = ""
            df_concat["Id"] = ""
        else:
            shifted = group.shift().rename(columns=lambda x: f'{x}_shifted')
            # the first row has nothing shifted into it, so it is skipped
            df_concat = pd.concat([shifted, group], axis=1).iloc[1:]
        diffs = df_concat.apply(generate_diffs, axis=1)
        # extracting useful columns alone to create a new Diff dataframe
        no_of_rows = len(diffs)
        diff_frames.append(
            pd.DataFrame(
                {
                    "Diff": diffs.values,
                    "Code": df_concat["Code_shifted"].values,
                    "ParentId": np.full(no_of_rows, parentId),
                    "Id": df_concat["Id_shifted"].values,
                    "Text": df_concat["Text_shifted"].values,
                    "Comments": df_concat["Comments_shifted"].values,
                },
                # keep the row labels of the paired frame as the index
                index=df_concat.index,
            )
        )
    except Exception:
        traceback.print_exc()
        # Report only names that are guaranteed to exist at this point.
        print(f"Failed while generating diffs for ParentId {parentId}")
        break

# pd.concat rejects an empty list, so fall back to an empty frame when nothing was produced
diff_df = pd.concat(diff_frames) if diff_frames else pd.DataFrame()

In [None]:
# generate a filename mapping from Title string (removed special characters, whitespace and any starting number) limited to 15 characters.
# The patterns are raw strings (no invalid "\d" / "\w" / "\s" escapes) and regex=True is
# explicit: without it newer pandas treats them as literal text and nothing is removed.
filenames = dict(
    zip(
        parent_posts_df["Id"],
        parent_posts_df["Title"]
        .str.replace(r'^\d+|[^\w\s]+', '', regex=True)
        .str.replace(r'\s+', '', regex=True)
        .str.strip()
        .str.slice(stop=15)
        + '.py',
    )
)

In [None]:
# generating a tag mapping to be tested with LLMs for generating Git commit messages
# Each '<' / '>' of the Tags string becomes a space and the result is cut to 40 characters.
# regex=True is required for the character class: without it newer pandas looks for the
# literal text "[<>]" and leaves the angle brackets in place.
tags = dict(
    zip(
        parent_posts_df["Id"],
        parent_posts_df["Tags"].str.replace(r'[<>]', ' ', regex=True).str.slice(stop=40),
    )
)

In [None]:
# Give every diff the filename of its question by looking ParentId up in the filenames dict
diff_df = diff_df.assign(filename=diff_df["ParentId"].map(filenames))

In [None]:
# Preprocessing to generate prompts to query an LLM
# reset_index() keeps the previous index as an "index" column and numbers the rows 0..n-1
diff_df = diff_df.reset_index()
# Assign the filled columns back instead of calling fillna(inplace=True) on a column
# selection, which does not reliably update the parent frame (pandas Copy-on-Write).
diff_df[["Text", "Comments"]] = diff_df[["Text", "Comments"]].fillna(" ")

In [None]:
# Build one prompt per diff: fixed instruction, the answer text, the diff itself,
# then the cue after which the LLM writes the commit message.
diff_df = diff_df.assign(
    prompt=(
        "Generate a git commit message explaining, within 25 tokens, what the following git diff does with the help of description below \n"
        + diff_df["Text"]
        + " \n "
        + diff_df["Diff"]
        + "\n Commit Message:"
    )
)

Using Cohere's generate API to generate relevant Git commit messages (Code Golf problems) for fine-tuning the Diff-Codegen model

In [None]:
import os

import cohere

# Read the API key from the COHERE_API_KEY environment variable instead of hardcoding
# it in the notebook; a missing variable fails immediately with a KeyError.
co = cohere.Client(os.environ["COHERE_API_KEY"])
# p collects one generated commit message per prompt, in prompt order
p = []
for index, prompt in enumerate(diff_df["prompt"].values):
    response = co.generate(
        model='xlarge',
        prompt=prompt,
        max_tokens=25,
        temperature=0.7,
        stop_sequences=["~--~"])
    # keep only the text of the first generation
    p.append(response.generations[0].text)
    print(f"Completed: {index}")

In [None]:
# Store the LLM-generated commit messages (the list p, one entry per row) as a new column
diff_df = diff_df.assign(commitMessage=p)

In [None]:
# file format for finetuning the Diff-Codegen Model:
# one record per diff with the filename, the stored code, the commit message and the diff
final_format = [
    f'<NME> {name}\n<BEF> {before}\n<MSG> {message}\n<DFF> {diff}'
    for name, before, message, diff in zip(
        diff_df['filename'], diff_df['Code'], diff_df['commitMessage'], diff_df['Diff']
    )
]