In [11]:
import pandas as pd
from spacy.cli import download
from sklearn.feature_extraction.text import CountVectorizer
import spacy

In [12]:
data = pd.read_json('../../Data/Preprocessed Data/kind:bug/merged_data_with_comments.json')

In [13]:
data.head()

Unnamed: 0,comments_url,id,title,body,issue_url,pr_url,labels,pr_number,filename,status,additions,deletions,changes,all_comments
0,https://api.github.com/repos/kubernetes/kubern...,2639668210,kubelet crash: fatal error: concurrent map writes,### What happened?\n\nWhile looking into three...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/kubelet],128657,"['pkg/kubelet/cm/container_manager_linux.go', ...","['modified', 'modified', 'modified', 'modified']","[3, 4, 18, 75]","[3, 3, 6, 0]","[6, 7, 24, 75]",/sig node Thought to search for similar errors...
1,https://api.github.com/repos/kubernetes/kubern...,2617512099,[FG:InPlacePodVerticalScaling] failed to verif...,### What happened?\n\nOne line bug description...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/kubelet],126620,"['pkg/kubelet/status/state/checkpoint.go', 'pk...","['modified', 'modified', 'added']","[42, 23, 166]","[26, 17, 0]","[68, 40, 166]",This issue is currently awaiting triage.\n\nIf...
2,https://api.github.com/repos/kubernetes/kubern...,2604613192,Restore build-tag flag for code-generator,### What happened?\n\nThe `build-tag` flag is ...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/code-generation],128259,['staging/src/k8s.io/code-generator/cmd/conver...,"['modified', 'modified', 'modified', 'modified...","[12, 1, 1, 12, 1, 1]","[2, 1, 1, 1, 1, 1]","[14, 2, 2, 13, 2, 2]",@p0lyn0mial FYI /sig api-machinery /triage acc...
3,https://api.github.com/repos/kubernetes/kubern...,2596132738,[Failing Tests] ci-crio-cgroupv1-node-e2e-conf...,### Which jobs are failing?\n* master-blocking...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,"[area/kubeadm, area/dependency]",128175,"['go.mod', 'go.sum', 'hack/unwanted-dependenci...","['modified', 'modified', 'modified', 'modified...","[1, 3, 1, 21, 9, 9, 4, 0, 10, 2, 2, 5, 8, 1]","[1, 2, 0, 98, 8, 7, 6, 62, 11, 1, 4, 8, 11, 1]","[2, 5, 1, 119, 17, 16, 10, 62, 21, 3, 6, 13, 1...",@drewhagen: The provided milestone is not vali...
4,https://api.github.com/repos/kubernetes/kubern...,2591358936,Crash on kube manager's service-lb-controller ...,### What happened?\n\nIf kube manager is start...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/cloudprovider],128182,['cmd/kube-controller-manager/app/controllerma...,"['modified', 'modified', 'modified']","[26, 12, 5]","[0, 1, 4]","[26, 13, 9]",This issue is currently awaiting triage.\n\nIf...


# Text Preprocessing

Merge Title, Description, and Comments

In [14]:
# make new column with title, body, and comments all together
data['all_text'] = data['title'] + ' ' + data['body'] 

Lowercasing

In [15]:
# lowercase everything
data['all_text'] = data['all_text'].str.lower()
data['all_comments'] = data['all_comments'].str.lower()

Line break removal

In [16]:
# remove line breaks (\r, \n)
data['all_text'] = data['all_text'].str.replace('\r', ' ')
data['all_text'] = data['all_text'].str.replace('\n', ' ')
data['all_comments'] = data['all_comments'].str.replace('\r', ' ')
data['all_comments'] = data['all_comments'].str.replace('\n', ' ')

Remove Non-alphanumeric character

In [17]:
# remove non-alphanumeric characters such as punctuation, symbols, emojis, etc.
data['all_text'] = data['all_text'].str.replace(r'[^a-zA-Z0-9 ]', '')
data['all_comments'] = data['all_comments'].str.replace(r'[^a-zA-Z0-9 ]', '')

Change the datatype to string

In [18]:
# change datatype to string
data['all_text'] = data['all_text'].astype(str)
data['all_comments'] = data['all_comments'].astype(str)

Stopwords Removal

In [19]:
# remove stopwords using spaCy
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    download('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')

data['all_text'] = data['all_text'].apply(lambda x: ' '.join([word.text for word in nlp(x) if not word.is_stop]))
data['all_comments'] = data['all_comments'].apply(lambda x: ' '.join([word.text for word in nlp(x) if not word.is_stop]))

KeyboardInterrupt: 

Data Lemmatization

In [None]:
# lematize the text using spaCy
data['all_text'] = data['all_text'].apply(lambda x: ' '.join([word.lemma_ for word in nlp(x)]))
data['all_comments'] = data['all_comments'].apply(lambda x: ' '.join([word.lemma_ for word in nlp(x)]))

Remove High Frequency Words

In [None]:
# Remove high frequency words with different thresholds
thresholds = [0.5, 0.6, 0.7, 0.8, 0.9]

for threshold in thresholds:
    print(f"Processing with threshold: {threshold}")
    
    # Create and fit CountVectorizer
    cv = CountVectorizer(max_df=threshold)
    cv.fit(data['all_text'])
    
    # Get vocabulary from fitted vectorizer
    vocabulary = cv.vocabulary_
    
    # Filter each document to only keep words in the vocabulary
    filtered_texts = []
    for text in data['all_text']:
        words = text.split()
        filtered_words = [word for word in words if word in vocabulary]
        filtered_texts.append(' '.join(filtered_words))
    
    # Store filtered texts in new column
    data['all_text_' + str(threshold)] = filtered_texts
    
    # Print statistics
    print(f"  Vocabulary size: {len(vocabulary)}")
    print(f"  Average words per document: {sum(len(text.split()) for text in filtered_texts) / len(filtered_texts):.2f}")

Processing with threshold: 0.5
  Vocabulary size: 26632
  Average words per document: 160.55
Processing with threshold: 0.6
  Vocabulary size: 26641
  Average words per document: 164.91
Processing with threshold: 0.7
  Vocabulary size: 26652
  Average words per document: 174.07
Processing with threshold: 0.8
  Vocabulary size: 26659
  Average words per document: 178.25
Processing with threshold: 0.9
  Vocabulary size: 26664
  Average words per document: 188.45


In [None]:
threshold = 0.3

# Create and fit CountVectorizer
cv = CountVectorizer(max_df=threshold)
cv.fit(data['all_comments'])

# Get vocabulary from fitted vectorizer
vocabulary = cv.vocabulary_

# Filter each document to only keep words in the vocabulary
filtered_texts = []
for text in data['all_comments']:
    words = text.split()
    filtered_words = [word for word in words if word in vocabulary]
    filtered_texts.append(' '.join(filtered_words))

# Store filtered texts in new column
data['all_comments_' + str(threshold)] = filtered_texts

# Print statistics
print(f"  Vocabulary size: {len(vocabulary)}")
print(f"  Average words per document: {sum(len(text.split()) for text in filtered_texts) / len(filtered_texts):.2f}")



  Vocabulary size: 28002
  Average words per document: 191.64


Remove comments_url, id, title, body, issue_url, pr_url, and all_comments

In [None]:
data.head(5)

Unnamed: 0,comments_url,id,title,body,issue_url,pr_url,labels,pr_number,filename,status,additions,deletions,changes,all_comments,all_text,all_text_0.5,all_text_0.6,all_text_0.7,all_text_0.8,all_text_0.9
0,https://api.github.com/repos/kubernetes/kubern...,2639668210,kubelet crash: fatal error: concurrent map writes,### What happened?\n\nWhile looking into three...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/kubelet],128657,"['pkg/kubelet/cm/container_manager_linux.go', ...","['modified', 'modified', 'modified', 'modified']","[3, 4, 18, 75]","[3, 3, 6, 0]","[6, 7, 24, 75]",/sig node thought search similar error example...,kubelet crash : fatal error : concurrent map w...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...
1,https://api.github.com/repos/kubernetes/kubern...,2617512099,[FG:InPlacePodVerticalScaling] failed to verif...,### What happened?\n\nOne line bug description...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/kubelet],126620,"['pkg/kubelet/status/state/checkpoint.go', 'pk...","['modified', 'modified', 'added']","[42, 23, 166]","[26, 17, 0]","[68, 40, 166]",issue currently await triage . sig subproje...,[ fg : inplacepodverticalscale ] fail verify p...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...
2,https://api.github.com/repos/kubernetes/kubern...,2604613192,Restore build-tag flag for code-generator,### What happened?\n\nThe `build-tag` flag is ...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/code-generation],128259,['staging/src/k8s.io/code-generator/cmd/conver...,"['modified', 'modified', 'modified', 'modified...","[12, 1, 1, 12, 1, 1]","[2, 1, 1, 1, 1, 1]","[14, 2, 2, 13, 2, 2]",@p0lyn0mial fyi /sig api - machinery /triage a...,restore build - tag flag code - generator # # ...,restore build tag flag code generator build ta...,restore build tag flag code generator build ta...,restore build tag flag code generator build ta...,restore build tag flag code generator build ta...,restore build tag flag code generator happen b...
3,https://api.github.com/repos/kubernetes/kubern...,2596132738,[Failing Tests] ci-crio-cgroupv1-node-e2e-conf...,### Which jobs are failing?\n* master-blocking...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,"[area/kubeadm, area/dependency]",128175,"['go.mod', 'go.sum', 'hack/unwanted-dependenci...","['modified', 'modified', 'modified', 'modified...","[1, 3, 1, 21, 9, 9, 4, 0, 10, 2, 2, 5, 8, 1]","[1, 2, 0, 98, 8, 7, 6, 62, 11, 1, 4, 8, 11, 1]","[2, 5, 1, 119, 17, 16, 10, 62, 21, 3, 6, 13, 1...",@drewhagen : provide milestone valid repositor...,[ fail test ] ci-crio-cgroupv1-node-e2e-confor...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...
4,https://api.github.com/repos/kubernetes/kubern...,2591358936,Crash on kube manager's service-lb-controller ...,### What happened?\n\nIf kube manager is start...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/cloudprovider],128182,['cmd/kube-controller-manager/app/controllerma...,"['modified', 'modified', 'modified']","[26, 12, 5]","[0, 1, 4]","[26, 13, 9]",issue currently await triage . sig subproje...,crash kube manager service - lb - controller v...,crash kube manager service lb controller kube ...,crash kube manager service lb controller kube ...,crash kube manager service lb controller kube ...,crash kube manager service lb controller kube ...,crash kube manager service lb controller happe...


In [None]:
# remove columns that are not needed
data = data.drop(columns=['comments_url','id','title', 'body', 'issue_url','pr_url','all_comments','pr_number'])

Add filename to the text

In [None]:
# add filename to the data
for column_name in data.columns:
    if 'all_text' in column_name:
        data[column_name] = data[column_name] + ' ' + data['all_comments_0.3']
        data[column_name] = data[column_name] + ' ' + data['filename']

# Export the Data

In [None]:
data.head()

Unnamed: 0,labels,filename,status,additions,deletions,changes,all_text,all_text_0.5,all_text_0.6,all_text_0.7,all_text_0.8,all_text_0.9,all_comments_0.3
0,[area/kubelet],"['pkg/kubelet/cm/container_manager_linux.go', ...","['modified', 'modified', 'modified', 'modified']","[3, 4, 18, 75]","[3, 3, 6, 0]","[6, 7, 24, 75]",kubelet crash : fatal error : concurrent map w...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...,thought search similar error example crash fin...
1,[area/kubelet],"['pkg/kubelet/status/state/checkpoint.go', 'pk...","['modified', 'modified', 'added']","[42, 23, 166]","[26, 17, 0]","[68, 40, 166]",[ fg : inplacepodverticalscale ] fail verify p...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...,currently await subproject determine relevant ...
2,[area/code-generation],['staging/src/k8s.io/code-generator/cmd/conver...,"['modified', 'modified', 'modified', 'modified...","[12, 1, 1, 12, 1, 1]","[2, 1, 1, 1, 1, 1]","[14, 2, 2, 13, 2, 2]",restore build - tag flag code - generator # # ...,restore build tag flag code generator build ta...,restore build tag flag code generator build ta...,restore build tag flag code generator build ta...,restore build tag flag code generator build ta...,restore build tag flag code generator happen b...,fyi machinery
3,"[area/kubeadm, area/dependency]","['go.mod', 'go.sum', 'hack/unwanted-dependenci...","['modified', 'modified', 'modified', 'modified...","[1, 3, 1, 21, 9, 9, 4, 0, 10, 2, 2, 5, 8, 1]","[1, 2, 0, 98, 8, 7, 6, 62, 11, 1, 4, 8, 11, 1]","[2, 5, 1, 119, 17, 16, 10, 62, 21, 3, 6, 13, 1...",[ fail test ] ci-crio-cgroupv1-node-e2e-confor...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...,provide milestone valid milestone candidate us...
4,[area/cloudprovider],['cmd/kube-controller-manager/app/controllerma...,"['modified', 'modified', 'modified']","[26, 12, 5]","[0, 1, 4]","[26, 13, 9]",crash kube manager service - lb - controller v...,crash kube manager service lb controller kube ...,crash kube manager service lb controller kube ...,crash kube manager service lb controller kube ...,crash kube manager service lb controller kube ...,crash kube manager service lb controller happe...,currently await subproject determine relevant ...


In [None]:
# swap the order of the columns
data = data[['all_text','all_text_0.5','all_text_0.6','all_text_0.7','all_text_0.8','all_text_0.9','labels','filename','status','additions','deletions','changes', 'all_comments_0.3']]


In [None]:
data.head()

Unnamed: 0,all_text,all_text_0.5,all_text_0.6,all_text_0.7,all_text_0.8,all_text_0.9,labels,filename,status,additions,deletions,changes,all_comments_0.3
0,kubelet crash : fatal error : concurrent map w...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...,[area/kubelet],"['pkg/kubelet/cm/container_manager_linux.go', ...","['modified', 'modified', 'modified', 'modified']","[3, 4, 18, 75]","[3, 3, 6, 0]","[6, 7, 24, 75]",thought search similar error example crash fin...
1,[ fg : inplacepodverticalscale ] fail verify p...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...,[area/kubelet],"['pkg/kubelet/status/state/checkpoint.go', 'pk...","['modified', 'modified', 'added']","[42, 23, 166]","[26, 17, 0]","[68, 40, 166]",currently await subproject determine relevant ...
2,restore build - tag flag code - generator # # ...,restore build tag flag code generator build ta...,restore build tag flag code generator build ta...,restore build tag flag code generator build ta...,restore build tag flag code generator build ta...,restore build tag flag code generator happen b...,[area/code-generation],['staging/src/k8s.io/code-generator/cmd/conver...,"['modified', 'modified', 'modified', 'modified...","[12, 1, 1, 12, 1, 1]","[2, 1, 1, 1, 1, 1]","[14, 2, 2, 13, 2, 2]",fyi machinery
3,[ fail test ] ci-crio-cgroupv1-node-e2e-confor...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...,"[area/kubeadm, area/dependency]","['go.mod', 'go.sum', 'hack/unwanted-dependenci...","['modified', 'modified', 'modified', 'modified...","[1, 3, 1, 21, 9, 9, 4, 0, 10, 2, 2, 5, 8, 1]","[1, 2, 0, 98, 8, 7, 6, 62, 11, 1, 4, 8, 11, 1]","[2, 5, 1, 119, 17, 16, 10, 62, 21, 3, 6, 13, 1...",provide milestone valid milestone candidate us...
4,crash kube manager service - lb - controller v...,crash kube manager service lb controller kube ...,crash kube manager service lb controller kube ...,crash kube manager service lb controller kube ...,crash kube manager service lb controller kube ...,crash kube manager service lb controller happe...,[area/cloudprovider],['cmd/kube-controller-manager/app/controllerma...,"['modified', 'modified', 'modified']","[26, 12, 5]","[0, 1, 4]","[26, 13, 9]",currently await subproject determine relevant ...


In [None]:
# export cleaned data 
data.to_json('../../Data/Preprocessed Data/kind:bug/cleaned_data_with_changed_files_reduced_comments.json')

In [None]:
data.head()

Unnamed: 0,all_text,all_text_0.5,all_text_0.6,all_text_0.7,all_text_0.8,all_text_0.9,labels,filename,status,additions,deletions,changes,all_comments_0.3
0,kubelet crash : fatal error : concurrent map w...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...,[area/kubelet],"['pkg/kubelet/cm/container_manager_linux.go', ...","['modified', 'modified', 'modified', 'modified']","[3, 4, 18, 75]","[3, 3, 6, 0]","[6, 7, 24, 75]",thought search similar error example crash fin...
1,[ fg : inplacepodverticalscale ] fail verify p...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...,[area/kubelet],"['pkg/kubelet/status/state/checkpoint.go', 'pk...","['modified', 'modified', 'added']","[42, 23, 166]","[26, 17, 0]","[68, 40, 166]",currently await subproject determine relevant ...
2,restore build - tag flag code - generator # # ...,restore build tag flag code generator build ta...,restore build tag flag code generator build ta...,restore build tag flag code generator build ta...,restore build tag flag code generator build ta...,restore build tag flag code generator happen b...,[area/code-generation],['staging/src/k8s.io/code-generator/cmd/conver...,"['modified', 'modified', 'modified', 'modified...","[12, 1, 1, 12, 1, 1]","[2, 1, 1, 1, 1, 1]","[14, 2, 2, 13, 2, 2]",fyi machinery
3,[ fail test ] ci-crio-cgroupv1-node-e2e-confor...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...,"[area/kubeadm, area/dependency]","['go.mod', 'go.sum', 'hack/unwanted-dependenci...","['modified', 'modified', 'modified', 'modified...","[1, 3, 1, 21, 9, 9, 4, 0, 10, 2, 2, 5, 8, 1]","[1, 2, 0, 98, 8, 7, 6, 62, 11, 1, 4, 8, 11, 1]","[2, 5, 1, 119, 17, 16, 10, 62, 21, 3, 6, 13, 1...",provide milestone valid milestone candidate us...
4,crash kube manager service - lb - controller v...,crash kube manager service lb controller kube ...,crash kube manager service lb controller kube ...,crash kube manager service lb controller kube ...,crash kube manager service lb controller kube ...,crash kube manager service lb controller happe...,[area/cloudprovider],['cmd/kube-controller-manager/app/controllerma...,"['modified', 'modified', 'modified']","[26, 12, 5]","[0, 1, 4]","[26, 13, 9]",currently await subproject determine relevant ...
