In [None]:
import pandas as pd
import numpy as np
import warnings
# Install a blanket 'ignore' filter: every warning category is silenced
# for the rest of the session.
warnings.simplefilter('ignore')



In [None]:
# Read the dataset into a DataFrame; the path is relative to the working directory.
data = pd.read_csv(filepath_or_buffer='msproj/clean_compl_data2.csv')

In [None]:
# Number of leading rows used for training; every remaining row is held out
# for testing. Kept in one place so both halves of the split stay in sync.
TRAIN_SIZE = 450000

# Slice the underlying arrays positionally: unlike label-based, end-inclusive
# .loc slices, this does not depend on the frame having a default 0..n-1 index.
X_train = data['text'].values[:TRAIN_SIZE]
y_train = data['label'].values[:TRAIN_SIZE]
X_test = data['text'].values[TRAIN_SIZE:]
y_test = data['label'].values[TRAIN_SIZE:]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score

# TF-IDF vectorizer reused by every pipeline in this notebook:
# unigrams through trigrams, no stop-word removal, vocabulary capped at
# 100,000 features, with min_df / max_df pruning of rare and very common terms.
tvec = TfidfVectorizer(
    stop_words=None,
    max_features=100000,
    ngram_range=(1, 3),
    min_df=10,
    max_df=.80,
)

def cv_train(splits, X, Y, pipeline, average_method):
    """Cross-validate ``pipeline`` with stratified K-fold and print the scores.

    For every fold the pipeline is refit on the training part and evaluated on
    the held-out part. Per-class precision, recall and F1 are printed for each
    fold; after the last fold the mean and standard deviation (in percent) of
    accuracy and of the ``average_method``-averaged precision, recall and F1
    are printed.

    Parameters
    ----------
    splits : int
        Number of folds, forwarded to ``StratifiedKFold(n_splits=...)``.
    X : array-like
        Samples. Must support selection with an integer index array
        (e.g. a NumPy array).
    Y : array-like
        Target labels aligned with ``X``; same indexing requirement.
    pipeline : estimator
        Object exposing ``fit`` (returning the fitted estimator) and
        ``predict``. It is refit in place on every fold.
    average_method : str
        Forwarded as ``average=`` to ``precision_score``, ``recall_score``
        and ``f1_score`` for the cross-fold summary figures.

    Returns
    -------
    None
        All results are printed rather than returned.
    """
    # Fixed seed so the fold assignment is reproducible between runs.
    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
    accuracy = []
    precision = []
    recall = []
    f1 = []
    for train, test in kfold.split(X, Y):
        clf_fit = pipeline.fit(X[train], Y[train])
        y_true = Y[test]
        prediction = clf_fit.predict(X[test])

        # Accuracy is derived from the predictions already computed above.
        # Calling ``clf_fit.score(X[test], Y[test])`` would push the held-out
        # fold through the whole pipeline (transform + predict) a second time
        # only to obtain the same fraction of correct labels.
        fold_accuracy = np.mean(np.asarray(prediction) == np.asarray(y_true))
        accuracy.append(fold_accuracy * 100)

        precision.append(precision_score(y_true, prediction, average=average_method)*100)
        recall.append(recall_score(y_true, prediction, average=average_method)*100)
        f1.append(f1_score(y_true, prediction, average=average_method)*100)

        # NOTE(review): the header is hard-coded and assumes exactly three
        # classes whose sorted label order is Bug, Feature, Question --
        # confirm against the label encoding of the dataset.
        print('                Bug       Feature    Question')
        print('precision:',precision_score(y_true, prediction, average=None))
        print('recall:   ',recall_score(y_true, prediction, average=None))
        print('f1 score: ',f1_score(y_true, prediction, average=None))
        print('-'*50)

    print("accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(accuracy), np.std(accuracy)))
    print("precision: %.2f%% (+/- %.2f%%)" % (np.mean(precision), np.std(precision)))
    print("recall: %.2f%% (+/- %.2f%%)" % (np.mean(recall), np.std(recall)))
    print("f1 score: %.2f%% (+/- %.2f%%)" % (np.mean(f1), np.std(f1)))

# Applying data balancing techniques

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline

In [None]:
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler

Logistic Regression

In [None]:
# Logistic-regression classifier with library-default settings.
# NOTE(review): the default iteration limit may be too low for high-dimensional
# TF-IDF features, and warnings are suppressed in this notebook -- confirm the
# solver actually converges.
lr = LogisticRegression()

In [None]:
# Vectorize -> random over-sampling -> logistic regression.
ROS_pipeline = make_pipeline(
    tvec,
    RandomOverSampler(random_state=777),
    lr,
)

In [None]:
# Vectorize -> SMOTE over-sampling -> logistic regression.
SMOTE_pipeline = make_pipeline(
    tvec,
    SMOTE(random_state=777),
    lr,
)

In [None]:
# Vectorize -> ADASYN over-sampling -> logistic regression.
# NOTE(review): ADASYN is given sampling_strategy='minority' while the other
# samplers in this notebook keep their default strategy -- confirm that the
# difference is intended when comparing results.
ADASYN_pipeline = make_pipeline(
    tvec,
    ADASYN(sampling_strategy='minority', random_state=777),
    lr,
)

In [None]:
# 5-fold stratified CV of the random-over-sampling pipeline, macro-averaged summary.
cv_train(
    splits=5,
    X=X_train,
    Y=y_train,
    pipeline=ROS_pipeline,
    average_method='macro',
)

                Bug       Feature    Question
precision: [0.80128492 0.80776771 0.34150613]
recall:    [0.75897845 0.73533668 0.57760664]
f1 score:  [0.77955812 0.76985229 0.42923179]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.80034953 0.81181304 0.33845828]
recall:    [0.76523843 0.73388964 0.5686019 ]
f1 score:  [0.78240027 0.77088717 0.42433352]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.79917678 0.812204   0.32770094]
recall:    [0.7602504  0.71553637 0.59514218]
f1 score:  [0.77922775 0.76081187 0.42266913]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.80867545 0.81059101 0.33898191]
recall:    [0.75506896 0.73798326 0.59490521]
f1 score:  [0.78095336 0.77258496 0.43187683]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.

In [None]:
%%time
# 5-fold stratified CV of the SMOTE pipeline, macro-averaged summary.
cv_train(
    splits=5,
    X=X_train,
    Y=y_train,
    pipeline=SMOTE_pipeline,
    average_method='macro',
)

                Bug       Feature    Question
precision: [0.79849405 0.81200453 0.32828735]
recall:    [0.76434058 0.72626857 0.56528436]
f1 score:  [0.78104413 0.76674729 0.41535716]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.80595899 0.80863641 0.33137652]
recall:    [0.75289306 0.73569844 0.58187204]
f1 score:  [0.7785228  0.77044502 0.42226999]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.80578316 0.81153118 0.32958801]
recall:    [0.76032522 0.73357611 0.57345972]
f1 score:  [0.78239446 0.77058712 0.41859459]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.8065328  0.81204787 0.32707241]
recall:    [0.75928374 0.72826375 0.58388626]
f1 score:  [0.78219539 0.76787712 0.41927936]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.

In [None]:
%%time
# 5-fold stratified CV of the ADASYN pipeline, macro-averaged summary.
cv_train(
    splits=5,
    X=X_train,
    Y=y_train,
    pipeline=ADASYN_pipeline,
    average_method='macro',
)

                Bug       Feature    Question
precision: [0.80200857 0.81008555 0.33082024]
recall:    [0.76082402 0.73533668 0.56149289]
f1 score:  [0.78087364 0.77090339 0.41634087]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.80766255 0.8049786  0.33201718]
recall:    [0.75498803 0.73934015 0.567891  ]
f1 score:  [0.7804375  0.77076445 0.41904179]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.80851572 0.80984127 0.33101979]
recall:    [0.75915303 0.73827899 0.5707346 ]
f1 score:  [0.78305721 0.77240614 0.41901531]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.80505568 0.81441747 0.32841526]
recall:    [0.76090481 0.73050672 0.58021327]
f1 score:  [0.78235784 0.77018333 0.41942529]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.

In [None]:
%%time
from imblearn.combine import SMOTETomek
# Vectorize -> SMOTETomek resampling (imblearn.combine) -> logistic regression.
# NOTE(review): this sampler is seeded with 42 whereas the other samplers in
# this notebook use 777 -- confirm the difference is intended.
SMOTETomek_pipeline = make_pipeline(
    tvec,
    SMOTETomek(random_state=42),
    lr,
)
# 5-fold stratified CV, macro-averaged summary.
cv_train(
    splits=5,
    X=X_train,
    Y=y_train,
    pipeline=SMOTETomek_pipeline,
    average_method='macro',
)

                Bug       Feature    Question
precision: [0.7987733  0.80810804 0.33011127]
recall:    [0.76002594 0.73024793 0.56244076]
f1 score:  [0.77891804 0.76720763 0.41603856]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.79992177 0.80914476 0.34034403]
recall:    [0.76508879 0.73791723 0.55793839]
f1 score:  [0.78211764 0.77189132 0.42278686]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.79887907 0.81733882 0.3276699 ]
recall:    [0.77142358 0.7185028  0.57582938]
f1 score:  [0.78491131 0.76474061 0.4176693 ]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.81272653 0.81101383 0.3096889 ]
recall:    [0.74377135 0.7238743  0.59798578]
f1 score:  [0.77672153 0.7649705  0.40805271]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.

MultinomialNB

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes classifier with library-default settings.
mnb = MultinomialNB()

In [None]:
# Vectorize -> random over-sampling -> multinomial naive Bayes.
# Rebinds ROS_pipeline, which earlier in the notebook wrapped the
# logistic-regression classifier.
ROS_pipeline = make_pipeline(
    tvec,
    RandomOverSampler(random_state=777),
    mnb,
)

In [None]:
%%time
# 5-fold stratified CV of the random-over-sampling pipeline, macro-averaged summary.
cv_train(
    splits=5,
    X=X_train,
    Y=y_train,
    pipeline=ROS_pipeline,
    average_method='macro',
)

                Bug       Feature    Question
precision: [0.81001782 0.7405426  0.27864883]
recall:    [0.63482642 0.75113351 0.54537915]
f1 score:  [0.71180089 0.74580046 0.36884491]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.80756501 0.73932847 0.27923628]
recall:    [0.63896648 0.74770886 0.54063981]
f1 score:  [0.71344036 0.74349505 0.36826601]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.81327008 0.74325003 0.27962566]
recall:    [0.63707103 0.74954177 0.55580569]
f1 score:  [0.71446752 0.74638264 0.37206536]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.81159603 0.74321768 0.27747269]
recall:    [0.64094571 0.74859513 0.54478673]
f1 score:  [0.71624653 0.74589671 0.36767822]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.

In [None]:
# Vectorize -> SMOTE over-sampling -> multinomial naive Bayes.
# Rebinds SMOTE_pipeline, which earlier in the notebook wrapped the
# logistic-regression classifier.
SMOTE_pipeline = make_pipeline(
    tvec,
    SMOTE(random_state=777),
    mnb,
)

In [None]:
%%time
# 5-fold stratified CV of the SMOTE pipeline, macro-averaged summary.
cv_train(
    splits=5,
    X=X_train,
    Y=y_train,
    pipeline=SMOTE_pipeline,
    average_method='macro',
)

                Bug       Feature    Question
precision: [0.80962932 0.73475171 0.28674699]
recall:    [0.63789405 0.76292688 0.52168246]
f1 score:  [0.71357429 0.74857427 0.37007775]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.80843286 0.73464712 0.28560362]
recall:    [0.63981445 0.75791048 0.52417062]
f1 score:  [0.71430759 0.74609751 0.36974509]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.81252382 0.73830266 0.28596735]
recall:    [0.6381684  0.76034632 0.53554502]
f1 score:  [0.71486841 0.74916237 0.37284501]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.81229753 0.73784182 0.28480849]
recall:    [0.64411303 0.76072643 0.52156398]
f1 score:  [0.71849441 0.74910939 0.36842986]
--------------------------------------------------
                Bug       Feature    Question
precision: [0.

In [None]:
# Vectorize -> ADASYN over-sampling -> multinomial naive Bayes.
# Rebinds ADASYN_pipeline, which earlier in the notebook wrapped the
# logistic-regression classifier.
# NOTE(review): sampling_strategy='minority' differs from the default strategy
# kept by the other samplers in this notebook -- confirm it is intended.
ADASYN_pipeline = make_pipeline(
    tvec,
    ADASYN(sampling_strategy='minority', random_state=777),
    mnb,
)

In [None]:
%%time
# 5-fold stratified CV of the ADASYN pipeline, macro-averaged summary.
cv_train(
    splits=5,
    X=X_train,
    Y=y_train,
    pipeline=ADASYN_pipeline,
    average_method='macro',
)

                Bug       Feature    Question
precision: [0.81282496 0.73141652 0.28907276]
recall:    [0.63163408 0.76934208 0.52156398]
f1 score:  [0.71086535 0.74990009 0.37197904]
--------------------------------------------------


KeyboardInterrupt: ignored

In [None]:
%%time
from imblearn.combine import SMOTETomek
# Vectorize -> SMOTETomek resampling (imblearn.combine) -> multinomial naive Bayes.
# NOTE(review): this sampler is seeded with 42 whereas the other samplers in
# this notebook use 777 -- confirm the difference is intended.
SMOTETomek_pipeline = make_pipeline(
    tvec,
    SMOTETomek(random_state=42),
    mnb,
)
# 5-fold stratified CV, macro-averaged summary.
cv_train(
    splits=5,
    X=X_train,
    Y=y_train,
    pipeline=SMOTETomek_pipeline,
    average_method='macro',
)

KeyboardInterrupt: ignored