In [106]:
import pandas as pd
import numpy as np
import re
# Load the commit dataset from CSV (path is relative to the notebook's working directory).
df = pd.read_csv("data/commits.csv")

# Text Preprocessing

In [107]:
def text_preprocessing(text):
    """Normalise a commit message before tokenisation.

    Steps, in order:
      1. Lower-case the whole message.
      2. Delete every line that contains an http:// or https:// URL.
      3. Replace each remaining newline with a single space.

    Args:
        text: The commit message as a ``str``.

    Returns:
        The cleaned message. Removed lines leave their newline behind, so the
        result may contain runs of spaces and trailing whitespace.
    """
    text = text.lower() # Make all words lower case
    # Raw string avoids the invalid "\s" escape. With re.MULTILINE, ^ and $ anchor
    # to individual lines, and the greedy ".*" on both sides removes the WHOLE line
    # holding the URL, not just the part up to the end of the URL.
    text = re.sub(r'^.*https?://\S+.*$', '', text, flags=re.MULTILINE)
    text = text.replace('\n', ' ') # Remove newlines, replace with space
    return text

# Discard every row that has a missing value in any column.
df = df.dropna(how='any')

# Coerce each message to str, then normalise it with text_preprocessing.
cleaned_messages = df['message'].apply(str).apply(text_preprocessing)
df['message'] = cleaned_messages

# Keep only commits of at most 512 characters (change to your model's max context length).
# Note: .str.len() counts characters, not tokens.
message_lengths = df['message'].str.len()
df = df[message_lengths <= 512]
df.head(200)

Unnamed: 0,message,sha,remote_url
0,"fixup! if -s & -p specified, mention 'sftp -p'...",2709809fd616a0991dc18e3a58dea10fb383c3f0,https://github.com/openssh/openssh-portable
1,make ssh-copy-id(1) consistent with openssh. ...,204e0bf05161b7641500d7ab266c21217412379f,https://github.com/openssh/openssh-portable
2,"if -s & -p specified, mention 'sftp -p' on suc...",9de79df66d1430d290fab670bb4b18612875e518,https://github.com/openssh/openssh-portable
3,drop whitespace ssh-copy-id-upstream: e604fae...,801cda54c00e0f4e7d89345a90874c8d05dc233a,https://github.com/openssh/openssh-portable
4,make -x also apply to the target script ssh-c...,288482f53613f3e74544eb92deeb24f7c7f1f371,https://github.com/openssh/openssh-portable
...,...,...,...
199,remove skipping test when scp not in path. an...,8a5e99a70fcf9b022a8aa175ebf6a71f58511da3,https://github.com/openssh/openssh-portable
200,"upstream: add a ""host"" line to the output of s...",41f36dd896c8fb8337d403fcf476762986976e9d,https://github.com/openssh/openssh-portable
201,"upstream: avoid printf(""%s"", null) if using ss...",f673b49f3be3eb51074fbb8a405beb6cd0f7d93e,https://github.com/openssh/openssh-portable
202,upstream: clamp the minimum buffer lengths and...,93fc7c576563e3d88a1dc019dd213f65607784cc,https://github.com/openssh/openssh-portable


# Stratified Random Sampling (disproportionate)
An imbalanced dataset is one in which the class labels of the target variable are not represented in equal proportions. In other words, some classes have many more examples than others, resulting in an imbalance. Here the classes are the source repositories, whose commit counts differ widely.

In machine learning, stratified sampling is also used to obtain the same sample proportion for a train and test set if there is an imbalance in the dataset.  

Benefits
- Ensures an adequate amount of data for the smallest group (nginx)
- Evenly divides the total sample size between subgroups




In [5]:
# Number of entries per repo
# Tally all repositories in one pass, then report them in sorted order.
repo_types = np.unique(df['remote_url'].tolist())
commits_per_repo = df['remote_url'].value_counts()
for repo in repo_types:
    n_commits = commits_per_repo.get(repo, 0)
    print(f"{repo}: {n_commits}")

https://github.com/apache/httpd: 32270
https://github.com/nginx/nginx: 7621
https://github.com/openssh/openssh-portable: 11964
https://github.com/openssl/openssl: 31082


# Generating sample dataset with 5000 commits per repo

In [108]:
# Draw the same number of commits from every repository.
SAMPLES_PER_REPO = 5000
SAMPLE_SEED = 6  # random_state is a seed; fixed so the draw is repeatable


def _sample_repo(repo_commits):
    """Return SAMPLES_PER_REPO randomly chosen rows from one repository's commits."""
    return repo_commits.sample(n=SAMPLES_PER_REPO, random_state=SAMPLE_SEED)


commits_by_repo = df.groupby("remote_url", group_keys=False)
sample_df = commits_by_repo.apply(_sample_repo)
# To inspect a single repo's sample:
# sample_df[sample_df['remote_url'] == "https://github.com/openssl/openssl"]
sample_df.to_csv("test.csv")
len(sample_df)

20000

In [None]:
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
import torch
from datasets import Dataset

# Load the dataset

In [None]:
# Convert the pandas frame into a Hugging Face Dataset. `df` has had rows filtered
# out, so its index is no longer a plain RangeIndex; preserve_index=False stops that
# index from being stored as an extra `__index_level_0__` column.
dataset = Dataset.from_pandas(df, preserve_index=False)
# Drop the `sha` and `remote_url` columns.
dataset = dataset.remove_columns(['sha','remote_url'])
dataset

Dataset({
    features: ['message'],
    num_rows: 120637
})

# Tokenizer

In [None]:
model_name = "microsoft/codebert-base" # Change model name to your model
context_length = 512 # Change to your model max context length
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_pad_and_truncate(texts):
    """Tokenize a batch of commit messages to a fixed length.

    Args:
        texts: A batch passed in by ``Dataset.map(..., batched=True)``; only its
            ``'message'`` entry is read (presumably a list of strings — confirm).

    Returns:
        The tokenizer's output for the batch, with every sequence truncated
        and padded to ``context_length``.
    """
    encoding_options = dict(
        truncation=True,
        padding="max_length",
        max_length=context_length,
    )
    return tokenizer(texts['message'], **encoding_options)


tokenized_dataset = dataset.map(tokenize_pad_and_truncate, batched=True)
tokenized_dataset[0]


# inputs = tokenizer('Fix: bug racing event', return_tensors='pt', max_length=5, truncation=True, padding='max_length')
# print(inputs)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'message': 'log an error if bio_write(3) fails\n\n\ngit-svn-id: https://svn.apache.org/repos/asf/httpd/httpd/trunk@1910269 13f79535-47bb-0310-9956-ffa450edef68\n',
 'input_ids': [0,
  12376,
  41,
  5849,
  114,
  10709,
  1215,
  29631,
  1640,
  246,
  43,
  10578,
  50140,
  50118,
  20901,
  12,
  36245,
  282,
  12,
  808,
  35,
  1205,
  640,
  36245,
  282,
  4,
  48530,
  4,
  1957,
  73,
  241,
  11474,
  73,
  281,
  506,
  73,
  8166,
  417,
  73,
  8166,
  417,
  73,
  4328,
  6435,
  1039,
  1646,
  698,
  31416,
  508,
  506,
  36346,
  2022,
  12,
  3706,
  14141,
  12,
  3933,
  698,
  12,
  2831,
  4419,
  12,
  3145,
  102,
  13872,
  196,
  4550,
  4671,
  50118,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'attention_

In [None]:
# NOTE(review): this rebinds `dataset` to a fresh conversion of the full `df`,
# without the `sha`/`remote_url` column removal — confirm that is intended.
dataset = Dataset.from_pandas(df)
dataset[0]  # display the first record

{'message': 'log an error if BIO_write(3) fails\n\n\ngit-svn-id: https://svn.apache.org/repos/asf/httpd/httpd/trunk@1910269 13f79535-47bb-0310-9956-ffa450edef68\n'}

In [None]:
# NOTE(review): loads the tokenizer for the same checkpoint name that the earlier
# "Tokenizer" cell uses — looks like a duplicate; confirm whether both are needed.
model_name = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)