### From chapter 6

In [6]:
# pandas drives all tabular work in this notebook.
import pandas as pd
# Truncate long text cells to 50 characters when DataFrames are displayed.
pd.set_option('display.max_colwidth', 50)
from pathlib import Path

import matplotlib.pyplot as plt

# Relative directory holding the dataset files; the CSV loaded below is read from here.
BASE_PATH = Path("../DATASETS/6")

In [3]:
import html 
import re
# All patterns are raw strings so that regex escapes such as \[ or \s are not
# interpreted as (invalid) Python string escapes, which triggers a
# DeprecationWarning / SyntaxWarning on recent Python versions.

# markup tags like <b> or </p>
RE_TAG = re.compile(r'<[^<>]*>')
# text or code in square brackets like [0]
RE_BRACKET = re.compile(r'\[[^\[\]]*\]')
# text or code in parentheses like (0)
RE_BRACKET_1 = re.compile(r'\([^)]*\)')
# specials that are not part of words; matches # but not #cool
RE_SPECIAL = re.compile(r'(?:^|\s)[&#<>{}\[\]+]+(?:\s|$)')
# standalone sequences of hyphens like --- or ==
RE_HYPHEN_SEQ = re.compile(r'(?:^|\s)[\-=\+]{2,}(?:\s|$)')
# sequences of white spaces
RE_MULTI_SPACE = re.compile(r'\s+')

def clean(text):
    """Normalise a raw text string by stripping markup and noise.

    Steps, applied in this order:
      1. decode HTML entities (e.g. ``&amp;`` -> ``&``),
      2. replace tags, square-bracketed and parenthesised spans with a space,
      3. replace standalone special-character runs and hyphen/equals/plus
         runs with a space,
      4. collapse all whitespace runs to a single space and strip the ends.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        The cleaned text with no leading or trailing whitespace.
    """
    text = html.unescape(text)
    # Entities are decoded first so that escaped tags (&lt;b&gt;) are removed as well.
    text = RE_TAG.sub(' ', text)
    text = RE_BRACKET.sub(' ', text)
    text = RE_BRACKET_1.sub(' ', text)
    text = RE_SPECIAL.sub(' ', text)
    text = RE_HYPHEN_SEQ.sub(' ', text)
    # Every removal above leaves a space behind; squeeze them at the end.
    text = RE_MULTI_SPACE.sub(' ', text)
    return text.strip()

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

In [7]:
# Load the bug reports, keep a 20% sample of every component, and build a single
# free-text column out of title and description.
df = (
    pd.read_csv(BASE_PATH / "eclipse_jdt.csv.gz")
    .groupby('Component', as_index=False)
    .apply(pd.DataFrame.sample, random_state=42, frac=.2)
    .loc[:, ['Title', 'Description', 'Component']]
    .dropna()
    .assign(text=lambda frame: frame['Title'] + " " + frame['Description'])
    .drop(columns=['Title', 'Description'])
)

# Step 1 - Data Preparation: strip markup and noise from every document.
df = df.assign(text=df['text'].apply(clean))

# Step 2 - Train-Test Split: 80/20, stratified so each component keeps its share.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['text'],
    df['Component'],
    test_size=0.2,
    random_state=42,
    stratify=df['Component'],
)
print('Size of Training Data ', len(X_train))
print('Size of Test Data ', len(X_test))

# Step 3 - Training the Machine Learning model: TF-IDF features over uni- and
# bigrams feed a linear SVM; probability=True enables predict_proba later on.
tfidf = TfidfVectorizer(min_df=10, ngram_range=(1, 2), stop_words="english")
X_train_tf = tfidf.fit_transform(X_train)

svc = SVC(kernel="linear", C=1, probability=True, random_state=42)
svc.fit(X_train_tf, Y_train)

Size of Training Data  7240
Size of Test Data  1811


SVC(C=1, kernel='linear', probability=True, random_state=42)

In [8]:
# Vectorise the held-out texts with the already-fitted TF-IDF model and predict.
X_test_tf = tfidf.transform(X_test)
Y_pred = svc.predict(X_test_tf)

# Collect text, true label and prediction side by side; using the raw values
# gives the frame a fresh 0..n-1 index, so row k lines up with X_test_tf[k].
result = pd.DataFrame(
    dict(
        text=X_test.values,
        actual=Y_test.values,
        predicted=Y_pred,
    )
)

In [9]:
## Explainable AI

In [10]:
# First five test documents whose predicted component differs from the true one.
result.loc[result["actual"].ne(result["predicted"])].head(5)

Unnamed: 0,text,actual,predicted
2,NPE in Delta processor while executing JDT/UI ...,Core,UI
15,Inserting a block of text in editor badly alig...,UI,Text
16,Differences when debugging identical objects W...,Debug,Core
20,Foreach template doesnt work for class members...,Core,UI
21,exchange left and right operands for compariso...,UI,Core


In [11]:
# Pull out the document at position 21, one of the misclassified rows listed above.
text = result["text"].iloc[21]
print(text)

exchange left and right operands for comparison operators changes semantics Fix for Bug 149803 was not good.; ; The right fix should do the following; if --> if --> if ; if ; if


In [12]:
# Class probabilities for the same document (row 21 of the test matrix),
# passed as a one-row slice.
svc.predict_proba(X_test_tf[21:22])

array([[0.002669  , 0.46736578, 0.07725225, 0.00319434, 0.06874877,
        0.38076986]])

In [13]:
# predict_proba returns one column per class in the order of svc.classes_.
# Taking the labels from the fitted model (instead of hard-coding them) keeps
# names and probability columns aligned even if the set of components changes.
# For this dataset the result is ["APT", "Core", "Debug", "Doc", "Text", "UI"].
class_names = [str(label) for label in svc.classes_]
prob = svc.predict_proba(X_test_tf)

In [14]:
# Work on an independent copy of the results; the previous index is kept as an
# explicit 'index' column.
er = result.copy(deep=True).reset_index(drop=False)

In [15]:
# Add one probability column per class: column k of `prob` is stored under class_names[k].
for position in range(len(class_names)):
    er[class_names[position]] = prob[:, position]

In [16]:
# Reproducible random peek at five rows: labels next to the per-class probabilities.
er.loc[:, ["actual", "predicted", *class_names]].sample(n=5, random_state=99)

Unnamed: 0,actual,predicted,APT,Core,Debug,Doc,Text,UI
266,UI,UI,0.000598,0.000929,0.000476,0.001377,0.224473,0.772148
835,Text,Text,0.002083,0.032109,0.001481,0.002085,0.696666,0.265577
998,Text,Text,0.000356,0.026525,0.003425,0.000673,0.942136,0.026884
754,Core,Text,0.003862,0.334308,0.011312,0.015478,0.492112,0.142927
686,UI,UI,0.019319,0.099088,0.143744,0.082969,0.053174,0.601705
