In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from spacy.cli import download

In [2]:
data = pd.read_json('../../Data/Preprocessed Data/kind:feature/merged_data_with_comments.json')

In [3]:
data.head()

Unnamed: 0,comments_url,id,title,body,issue_url,pr_url,labels,pr_number,filename,status,additions,deletions,changes,all_comments
0,https://api.github.com/repos/kubernetes/kubern...,275859420,Kubelet flag precedence order vs files/ConfigM...,See https://docs.google.com/document/d/18-MsCh...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,"[area/kubelet, area/kubelet-api]",56097,"['cmd/kubelet/kubelet.go', 'hack/make-rules/te...","['modified', 'modified', 'modified', 'modified...","[7, 1, 21, 1, 1, 1, 1, 1, 1, 15, 139, 146]","[1, 0, 4, 0, 0, 1, 1, 1, 1, 0, 9, 0]","[8, 1, 25, 1, 1, 2, 2, 2, 2, 15, 148, 146]",
1,https://api.github.com/repos/kubernetes/kubead...,262492428,Individual control of preflight checks,Many times users know better than kubeadm arou...,https://github.com/kubernetes/kubeadm/issues/480,https://github.com/kubernetes/kubernetes/pull/...,[area/kubeadm],56072,['cmd/kubeadm/app/apis/kubeadm/validation/BUIL...,"['modified', 'modified', 'modified', 'modified...","[1, 26, 29, 20, 21, 1, 3, 17, 4, 2, 6, 3, 5, 6...","[0, 1, 0, 14, 15, 0, 2, 9, 0, 0, 1, 7, 1, 0, 0...","[1, 27, 29, 34, 36, 1, 5, 26, 4, 2, 7, 10, 6, ...",New example in 1.8.0:\r\nsystemctl start kubel...
2,https://api.github.com/repos/kubernetes/kubern...,275470204,seccomp is an alpha feature and not feature gated,see #55983,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,"[area/kubelet, area/kubelet-api]",55983,"['cmd/kubelet/app/options/options.go', 'cmd/ku...","['modified', 'modified', 'modified', 'modified...","[5, 6, 0, 0, 0, 0, 0, 0, 5]","[1, 3, 1, 1, 2, 4, 2, 2, 3]","[6, 9, 1, 1, 2, 4, 2, 2, 8]",
3,https://api.github.com/repos/kubernetes/kubead...,272308417,Use ComponentConfig for the kube-proxy,Important feature for v1.9; dependency for IPv...,https://github.com/kubernetes/kubeadm/issues/527,https://github.com/kubernetes/kubernetes/pull/...,[area/ipv6],55972,"['cmd/kubeadm/app/apis/kubeadm/BUILD', 'cmd/ku...","['modified', 'modified', 'modified', 'modified...","[1, 1, 36, 7, 3, 25, 7, 29, 27, 4, 6, 18, 15, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, ...","[1, 1, 36, 7, 3, 25, 7, 29, 27, 4, 6, 18, 15, ...",
4,https://api.github.com/repos/kubernetes/kubern...,251361039,Add kubeadm config for setting kube-proxy Bind...,<!-- This form is for bug reports and feature ...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/ipv6],55972,"['cmd/kubeadm/app/apis/kubeadm/BUILD', 'cmd/ku...","['modified', 'modified', 'modified', 'modified...","[1, 1, 36, 7, 3, 25, 7, 29, 27, 4, 6, 18, 15, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, ...","[1, 1, 36, 7, 3, 25, 7, 29, 27, 4, 6, 18, 15, ...",/sig cluster-lifecycle\r\n/area ipv6


# Text Preprocessing

Merge Title, Description, and Comments

In [4]:
# make new column with title, and body all together
data['all_text'] = data['title'] + ' ' + data['body']

Lowercasing

In [5]:
# lowercase everything
data['all_text'] = data['all_text'].str.lower()

Line break removal

In [6]:
# remove line breaks (\r, \n)
data['all_text'] = data['all_text'].str.replace('\r', ' ')
data['all_text'] = data['all_text'].str.replace('\n', ' ')

Remove Non-alphanumeric character

In [7]:
# remove non-alphanumeric characters such as punctuation, symbols, emojis, etc.
data['all_text'] = data['all_text'].str.replace(r'[^a-zA-Z0-9 ]', '')

Change the datatype to string

In [8]:
# change datatype to string
data['all_text'] = data['all_text'].astype(str)

Stopwords Removal

In [12]:
import spacy

In [13]:
# remove stopwords using spaCy
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    download('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')

data['all_text'] = data['all_text'].apply(lambda x: ' '.join([word.text for word in nlp(x) if not word.is_stop]))

Remove High Frequency Words

In [14]:
# remove high frequency words with a threshold of 0.5
threshold = 0.5

cv = CountVectorizer(max_df=threshold)
cv.fit(data['all_text'])
data['all_text'] = data['all_text'].apply(lambda x: ' '.join([word for word in x.split() if word in cv.vocabulary_]))

Remove comments_url, id, title, body, issue_url, pr_url, and all_comments

In [15]:
data.head(5)

Unnamed: 0,comments_url,id,title,body,issue_url,pr_url,labels,pr_number,filename,status,additions,deletions,changes,all_comments,all_text
0,https://api.github.com/repos/kubernetes/kubern...,275859420,Kubelet flag precedence order vs files/ConfigM...,See https://docs.google.com/document/d/18-MsCh...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,"[area/kubelet, area/kubelet-api]",56097,"['cmd/kubelet/kubelet.go', 'hack/make-rules/te...","['modified', 'modified', 'modified', 'modified...","[7, 1, 21, 1, 1, 1, 1, 1, 1, 15, 139, 146]","[1, 0, 4, 0, 0, 1, 1, 1, 1, 0, 9, 0]","[8, 1, 25, 1, 1, 2, 2, 2, 2, 15, 148, 146]",,kubelet flag precedence order vs files configm...
1,https://api.github.com/repos/kubernetes/kubead...,262492428,Individual control of preflight checks,Many times users know better than kubeadm arou...,https://github.com/kubernetes/kubeadm/issues/480,https://github.com/kubernetes/kubernetes/pull/...,[area/kubeadm],56072,['cmd/kubeadm/app/apis/kubeadm/validation/BUIL...,"['modified', 'modified', 'modified', 'modified...","[1, 26, 29, 20, 21, 1, 3, 17, 4, 2, 6, 3, 5, 6...","[0, 1, 0, 14, 15, 0, 2, 9, 0, 0, 1, 7, 1, 0, 0...","[1, 27, 29, 34, 36, 1, 5, 26, 4, 2, 7, 10, 6, ...",New example in 1.8.0:\r\nsystemctl start kubel...,individual control preflight checks times user...
2,https://api.github.com/repos/kubernetes/kubern...,275470204,seccomp is an alpha feature and not feature gated,see #55983,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,"[area/kubelet, area/kubelet-api]",55983,"['cmd/kubelet/app/options/options.go', 'cmd/ku...","['modified', 'modified', 'modified', 'modified...","[5, 6, 0, 0, 0, 0, 0, 0, 5]","[1, 3, 1, 1, 2, 4, 2, 2, 3]","[6, 9, 1, 1, 2, 4, 2, 2, 8]",,seccomp alpha feature feature gated 55983
3,https://api.github.com/repos/kubernetes/kubead...,272308417,Use ComponentConfig for the kube-proxy,Important feature for v1.9; dependency for IPv...,https://github.com/kubernetes/kubeadm/issues/527,https://github.com/kubernetes/kubernetes/pull/...,[area/ipv6],55972,"['cmd/kubeadm/app/apis/kubeadm/BUILD', 'cmd/ku...","['modified', 'modified', 'modified', 'modified...","[1, 1, 36, 7, 3, 25, 7, 29, 27, 4, 6, 18, 15, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, ...","[1, 1, 36, 7, 3, 25, 7, 29, 27, 4, 6, 18, 15, ...",,use componentconfig kube proxy important featu...
4,https://api.github.com/repos/kubernetes/kubern...,251361039,Add kubeadm config for setting kube-proxy Bind...,<!-- This form is for bug reports and feature ...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/ipv6],55972,"['cmd/kubeadm/app/apis/kubeadm/BUILD', 'cmd/ku...","['modified', 'modified', 'modified', 'modified...","[1, 1, 36, 7, 3, 25, 7, 29, 27, 4, 6, 18, 15, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, ...","[1, 1, 36, 7, 3, 25, 7, 29, 27, 4, 6, 18, 15, ...",/sig cluster-lifecycle\r\n/area ipv6,add kubeadm config setting kube proxy bindaddr...


In [16]:
# remove columns that are not needed
data = data.drop(columns=['comments_url','id','title', 'body', 'issue_url','pr_url','all_comments','pr_number'])

Data Lemmatization

In [17]:
# lematize the text using spaCy
data['all_text'] = data['all_text'].apply(lambda x: ' '.join([word.lemma_ for word in nlp(x)]))

Add filename to the text

In [18]:
# add filename to the data
data['all_text'] = data['all_text'] + ' ' + data['filename']

# Export the Data

In [19]:
# swap the order of the columns
data = data[['all_text', 'labels','filename','status','additions','deletions','changes']]

In [20]:
data.head()

Unnamed: 0,all_text,labels,filename,status,additions,deletions,changes
0,kubelet flag precedence order vs file configma...,"[area/kubelet, area/kubelet-api]","['cmd/kubelet/kubelet.go', 'hack/make-rules/te...","['modified', 'modified', 'modified', 'modified...","[7, 1, 21, 1, 1, 1, 1, 1, 1, 15, 139, 146]","[1, 0, 4, 0, 0, 1, 1, 1, 1, 0, 9, 0]","[8, 1, 25, 1, 1, 2, 2, 2, 2, 15, 148, 146]"
1,individual control preflight check time user k...,[area/kubeadm],['cmd/kubeadm/app/apis/kubeadm/validation/BUIL...,"['modified', 'modified', 'modified', 'modified...","[1, 26, 29, 20, 21, 1, 3, 17, 4, 2, 6, 3, 5, 6...","[0, 1, 0, 14, 15, 0, 2, 9, 0, 0, 1, 7, 1, 0, 0...","[1, 27, 29, 34, 36, 1, 5, 26, 4, 2, 7, 10, 6, ..."
2,seccomp alpha feature feature gate 55983 ['cmd...,"[area/kubelet, area/kubelet-api]","['cmd/kubelet/app/options/options.go', 'cmd/ku...","['modified', 'modified', 'modified', 'modified...","[5, 6, 0, 0, 0, 0, 0, 0, 5]","[1, 3, 1, 1, 2, 4, 2, 2, 3]","[6, 9, 1, 1, 2, 4, 2, 2, 8]"
3,use componentconfig kube proxy important featu...,[area/ipv6],"['cmd/kubeadm/app/apis/kubeadm/BUILD', 'cmd/ku...","['modified', 'modified', 'modified', 'modified...","[1, 1, 36, 7, 3, 25, 7, 29, 27, 4, 6, 18, 15, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, ...","[1, 1, 36, 7, 3, 25, 7, 29, 27, 4, 6, 18, 15, ..."
4,add kubeadm config set kube proxy bindaddress ...,[area/ipv6],"['cmd/kubeadm/app/apis/kubeadm/BUILD', 'cmd/ku...","['modified', 'modified', 'modified', 'modified...","[1, 1, 36, 7, 3, 25, 7, 29, 27, 4, 6, 18, 15, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, ...","[1, 1, 36, 7, 3, 25, 7, 29, 27, 4, 6, 18, 15, ..."


In [None]:
# export cleaned data 
data.to_json('../../Data/Preprocessed Data/kind:feature/cleaned_data_with_changed_files.json')