In [9]:
import pandas as pd
from pathlib import Path

BASE_PATH = Path("../DATASETS/6")

In [13]:
import html
import re

def clean(text):
    text = html.unescape(text)
    # tags like <tab>
    text = re.sub(r'<[^<>]*>', ' ', text)
    # markdown URLs like [Some text](https://....)
    text = re.sub(r'\[([^\[\]]*)\]\([^\(\)]*\)', r'\1', text)
    # text or code in brackets like [0]
    text = re.sub(r'\[[^\[\]]*\]', ' ', text)
    # standalone sequences of specials, matches &# but not #cool
    text = re.sub(r'(?:^|\s)[&#<>{}\[\]+|\\:-]{1,}(?:\s|$)', ' ', text)
    # standalone sequences of hyphens like --- or ==
    text = re.sub(r'(?:^|\s)[\-=\+]{2,}(?:\s|$)', ' ', text)
    # sequences of white spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [10]:
runSVC = True

In [19]:
# Loading the DataFrame
df = pd.read_csv(BASE_PATH/'eclipse_jdt.csv.gz')
df = df[['Title', 'Description', 'Component']]
df = df.dropna()
df['text'] = df['Title'] + df['Description']
df = df.drop(columns=['Title', 'Description'])

In [15]:
# Step 1 - Data Preparation
df['text'] = df['text'].apply(clean)
df = df[df['text'].str.len() > 50]


In [16]:
if (runSVC):
 # Sample the data when running SVC to ensure reasonable run-times
 df = df.groupby('Component', as_index=False).apply(pd.DataFrame.sample,
                                                     random_state=21,
                                                     frac=.2)

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
# Step 2 - Train-Test Split
X_train, X_test, Y_train, Y_test = train_test_split(df['text'],
                                     df['Component'],
                                     test_size=0.2,
                                     random_state=42,
                                     stratify=df['Component'])

print('Size of Training Data ', X_train.shape[0])
print('Size of Test Data ', X_test.shape[0])

Size of Training Data  36202
Size of Test Data  9051


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Step 3 - Training the Machine Learning model
tfidf = TfidfVectorizer(stop_words="english")

In [43]:
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [38]:
if (runSVC):
    model = SVC(random_state=42, probability=True)
    grid_param = [{
         'tfidf__min_df': [5, 10],
         'tfidf__ngram_range': [(1, 3), (1, 6)],
         'model__C': [1, 100],
         'model__kernel': ['linear']
     }]
else:
    model = LinearSVC(random_state=42, tol=1e-5)
    grid_param = {
         'tfidf__min_df': [5, 10],
         'tfidf__ngram_range': [(1, 3), (1, 6)],
         'model__C': [1, 100],
         'model__loss': ['hinge']
     }

In [41]:
training_pipeline = Pipeline(
                     steps=[
                              ('tfidf', TfidfVectorizer(stop_words="english")), 
                              ('model', model)
                     ]
)

In [None]:
gridSearchProcessor = GridSearchCV(
                        estimator=training_pipeline,
                        param_grid=grid_param,
                        cv=5)
gridSearchProcessor.fit(X_train, Y_train)

In [None]:
best_params = gridSearchProcessor.best_params_
print("Best alpha parameter identified by grid search ", best_params)
best_result = gridSearchProcessor.best_score_
print("Best result identified by grid search ", best_result)
best_model = gridSearchProcessor.best_estimator_


In [None]:
# Step 4 - Model Evaluation
Y_pred = best_model.predict(X_test)
print('Accuracy Score - ', accuracy_score(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))

In [None]:
frame = { 'text': X_test, 'actual': Y_test, 'predicted': Y_pred }
result = pd.DataFrame(frame)
result[result['actual'] == result['predicted']].sample(2)

In [None]:
result[result['actual'] != result['predicted']].sample(2)