# Nested 5-Fold Cross Validation For Logistic Regression On Textual Features

In [1]:
# pip install xlrd

In [2]:
# pip install numpy

In [3]:
# pip install pandas

In [4]:
# pip install scikit-learn

In [5]:
import numpy as np
import pandas as pd
import xlrd as xl
from pandas import ExcelWriter
from pandas import ExcelFile
import pprint
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
import re
import pickle
from operator import itemgetter
import time, datetime
from functools import partial, update_wrapper
from openpyxl import load_workbook

from sklearn.feature_extraction.text import TfidfVectorizer

from imblearn.over_sampling import SMOTE

from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as Imb_Pipeline

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from sklearn.metrics import precision_recall_fscore_support, classification_report, accuracy_score, make_scorer, confusion_matrix

pp = pprint.PrettyPrinter(indent=4)

## Ignore warnings
import warnings
warnings.filterwarnings('ignore')



#### Use spaCy parser for word tokenization of a sentence:

In [6]:
import spacy
from spacy.lang.en import English

# Load the English language model
nlp = spacy.load('en_core_web_sm')

# Create an instance of the English parser
parser = English()


#### Define stopwords as punctuation + common contractions:

In [7]:
from string import punctuation
from nltk.corpus import stopwords

stop_words = list(punctuation) + ["'s","'m","n't","'re","-","'ll",'...'] #+ stopwords.words('english')

#### Code to lemmatize and tokenize:

In [8]:
def get_lemma(item):
    return WordNetLemmatizer().lemmatize(item)

def tokenize(line):
    line_tokens = []
    tokens = parser(line)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            line_tokens.append('URL')
        elif token.orth_.startswith('@'):
            line_tokens.append('SCREEN_NAME')
        elif str(token) not in stop_words:
            line_tokens.append(get_lemma(token.lower_))
    return line_tokens

In [9]:
### Read from the pickled file
all_data = pd.read_csv('../data/combined_data_oversampled.csv')

print("Size of corpus: "+str(len(all_data)))

Size of corpus: 38350


In [10]:
all_data = all_data.dropna(subset=['Text Content', 'Code'])

In [11]:
labels_to_remove = [ "Testing",'Future Plan','Issue Content Management']
all_data = all_data[~all_data['Code'].isin(labels_to_remove)]

In [12]:
X = all_data['Text Content'].values
y = all_data['Code'].values

print("Number of unique labels: "+str(len(set(y))))

labels = list(set(y))
labels.sort()

pp.pprint(labels)

Number of unique labels: 13
[   'Action on Issue',
    'Bug Reproduction',
    'Contribution and Commitment',
    'Expected Behaviour',
    'Investigation and Exploration',
    'Motivation',
    'Observed Bug Behaviour',
    'Potential New Issues and Requests',
    'Social Conversation',
    'Solution Discussion',
    'Solution Usage',
    'Task Progress',
    'Workarounds']


# Nested Cross-Validation on Logistic Regression:

In [13]:
# To be used within GridSearch
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# To be used in outer CV
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

## Define Pipeline tfidf + ngram_range + logreg C

In [14]:

pipeline = Imb_Pipeline([
    ('vect', TfidfVectorizer(tokenizer=tokenize)),
    ('clf', LogisticRegression())
])

### Hyperparameters to search
### We can change hyperparameter values here
parameters = {
    'vect__ngram_range': [(2, 3)],  # unigrams or bigrams
    'clf__C': [100],
    'vect__max_df': [1],
}

## Nested Cross Validation using GridSearch

In [15]:
### Define and create the scoring functions
import nltk
nltk.download('wordnet')
def score_func(y_true, y_pred, score_index, i):
    return(precision_recall_fscore_support(y_true,y_pred)[score_index][i])

def avg_score(y_true, y_pred, score_index):
    return precision_recall_fscore_support(y_true,y_pred,average='weighted')[score_index]

def sum_support(y_true, y_pred):
    return len(y_true)

### Create partials for each of the metrics returned
score_funcs = {v: partial(score_func, score_index=k) for k, v in {0:'precision',1:'recall',2:'fscore',3:'support'}.items()}
prec_score = partial(score_func, score_index=0)
update_wrapper(prec_score,score_func)
rec_score = partial(score_func, score_index=1)
update_wrapper(rec_score,score_func)
f_score = partial(score_func, score_index=2)
update_wrapper(f_score,score_func)
support_score = partial(score_func, score_index=3)
update_wrapper(support_score,score_func)

### Create a callable scoring function for each of the metrics for each classification label
scorer = {}
for label_id in range(0,13):
    scorer['label'+str(label_id)+'_precision'] = make_scorer(prec_score, i=label_id)
    scorer['label'+str(label_id)+'_recall'] = make_scorer(rec_score, i=label_id)
    scorer['label'+str(label_id)+'_fscore'] = make_scorer(f_score, i=label_id)
    scorer['label'+str(label_id)+'_support'] = make_scorer(support_score, i=label_id)

### Create a callable scoring function for avg/total of the metrics across classification labels
scorer['avg_precision'] = make_scorer(avg_score,score_index=0)
scorer['avg_recall'] = make_scorer(avg_score,score_index=1)
scorer['avg_fscore'] = make_scorer(avg_score,score_index=2)
scorer['total_support'] = make_scorer(sum_support)


### Perform Nested cross-validation on Pipeline
start = time.time()
clf = GridSearchCV(pipeline, parameters, cv=inner_cv, scoring='f1_weighted')
clf_results = cross_validate(clf, X=X, y=y, cv=outer_cv, scoring=scorer,error_score='raise')
print("Completed Pipeline2 scenario in "+ str(datetime.timedelta(seconds=(time.time()-start))))

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/mintymine/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Completed Pipeline2 scenario in 0:04:46.364975


# Display and Save Training and Testing Results for each Fold:

## Pipeline Results:

In [16]:

train_report = pd.DataFrame(columns=['Precision', 'Recall', 'F1-score', 'Support'])
test_report = pd.DataFrame(columns=['Precision', 'Recall', 'F1-score', 'Support'])

result_dict = {}

writer = pd.ExcelWriter('../results/hyperparameter_result5.xlsx')


datalength = 0

for i in range(0, 5):
    for label_id in range(0, 13):
        train_label_precision_key = 'train_label' + str(label_id) + '_precision'
        train_label_recall_key = 'train_label' + str(label_id) + '_recall'
        train_label_fscore_key = 'train_label' + str(label_id) + '_fscore'
        train_label_support_key = 'train_label' + str(label_id) + '_support'

        if train_label_precision_key in clf_results and train_label_recall_key in clf_results and \
            train_label_fscore_key in clf_results and train_label_support_key in clf_results:

            train_report.loc[labels[label_id], :] = [clf_results[train_label_precision_key][i],
                                                      clf_results[train_label_recall_key][i],
                                                      clf_results[train_label_fscore_key][i],
                                                      clf_results[train_label_support_key][i]]

        test_label_precision_key = 'test_label' + str(label_id) + '_precision'
        test_label_recall_key = 'test_label' + str(label_id) + '_recall'
        test_label_fscore_key = 'test_label' + str(label_id) + '_fscore'
        test_label_support_key = 'test_label' + str(label_id) + '_support'

        if test_label_precision_key in clf_results and test_label_recall_key in clf_results and \
            test_label_fscore_key in clf_results and test_label_support_key in clf_results:

            test_report.loc[labels[label_id], :] = [clf_results[test_label_precision_key][i],
                                                     clf_results[test_label_recall_key][i],
                                                     clf_results[test_label_fscore_key][i],
                                                     clf_results[test_label_support_key][i]]

    train_avg_precision_key = 'train_avg_precision'
    train_avg_recall_key = 'train_avg_recall'
    train_avg_fscore_key = 'train_avg_fscore'
    train_total_support_key = 'train_total_support'

    if train_avg_precision_key in clf_results and train_avg_recall_key in clf_results and \
        train_avg_fscore_key in clf_results and train_total_support_key in clf_results:

        train_report.loc['Avg/Total', :] = [clf_results[train_avg_precision_key][i],
                                             clf_results[train_avg_recall_key][i],
                                             clf_results[train_avg_fscore_key][i],
                                             clf_results[train_total_support_key][i]]

    test_avg_precision_key = 'test_avg_precision'
    test_avg_recall_key = 'test_avg_recall'
    test_avg_fscore_key = 'test_avg_fscore'
    test_total_support_key = 'test_total_support'

    if test_avg_precision_key in clf_results and test_avg_recall_key in clf_results and \
        test_avg_fscore_key in clf_results and test_total_support_key in clf_results:

        test_report.loc['Avg/Total', :] = [clf_results[test_avg_precision_key][i],
                                            clf_results[test_avg_recall_key][i],
                                            clf_results[test_avg_fscore_key][i],
                                            clf_results[test_total_support_key][i]]

    fold_index = pd.DataFrame(data=[{'Fold': 'Fold ' + str(i)}])
    fold_index.to_excel(writer, 'LTC', startrow=datalength, index=False)
    datalength += (len(fold_index) + 2)
    train_report.to_excel(writer, 'LTC', startrow=datalength)
    datalength += (len(train_report) + 2)
    test_report.to_excel(writer, 'LTC', startrow=datalength)
    datalength += (len(test_report) + 2)

    result_dict['LTC_train_' + str(i)] = train_report
    result_dict['LTC_test_' + str(i)] = test_report

    train_report = train_report.astype(float).round(2)
    test_report = test_report.astype(float).round(2)

    print("\n------------------------- FOLD " + str(i) + ": -------------------------")
    print("\nTraining Results:")
    print(train_report)
    print("\nTest Results:")
    print(test_report)

writer.close()



------------------------- FOLD 0: -------------------------

Training Results:
Empty DataFrame
Columns: [Precision, Recall, F1-score, Support]
Index: []

Test Results:
                                   Precision  Recall  F1-score  Support
Action on Issue                         0.08    1.00      0.14    590.0
Bug Reproduction                        0.00    0.00      0.00    590.0
Contribution and Commitment             0.00    0.00      0.00    590.0
Expected Behaviour                      0.00    0.00      0.00    590.0
Investigation and Exploration           0.00    0.00      0.00    590.0
Motivation                              0.00    0.00      0.00    590.0
Observed Bug Behaviour                  0.00    0.00      0.00    590.0
Potential New Issues and Requests       0.00    0.00      0.00    590.0
Social Conversation                     0.00    0.00      0.00    590.0
Solution Discussion                     0.00    0.00      0.00    590.0
Solution Usage                         

In [17]:
import pandas as pd
import numpy as np


avg_test_report = pd.DataFrame(columns=['Avg Precision', 'Avg Recall', 'Avg F1-score', 'Avg Support'], index=labels)

# Variables to calculate weighted averages
total_test_support = 0
weighted_test_precision = 0
weighted_test_recall = 0
weighted_test_f1 = 0

# Calculate averages across 5 folds for each label
for label_id in range(13):
    test_precisions = [result_dict['LTC_test_' + str(i)].loc[labels[label_id], 'Precision'] for i in range(5)]
    test_recalls = [result_dict['LTC_test_' + str(i)].loc[labels[label_id], 'Recall'] for i in range(5)]
    test_f1_scores = [result_dict['LTC_test_' + str(i)].loc[labels[label_id], 'F1-score'] for i in range(5)]
    test_supports = [result_dict['LTC_test_' + str(i)].loc[labels[label_id], 'Support'] for i in range(5)]

    # Calculate averages for each metric
    avg_test_precision = np.mean(test_precisions)
    avg_test_recall = np.mean(test_recalls)
    avg_test_f1 = np.mean(test_f1_scores)
    avg_test_support = np.mean(test_supports)  

    avg_test_report.loc[labels[label_id]] = [avg_test_precision, avg_test_recall, avg_test_f1, avg_test_support]

    weighted_test_precision += avg_test_precision * avg_test_support
    weighted_test_recall += avg_test_recall * avg_test_support
    weighted_test_f1 += avg_test_f1 * avg_test_support
    total_test_support += avg_test_support

if total_test_support > 0:
    avg_test_report.loc['Total/Avg', :] = [weighted_test_precision / total_test_support,
                                           weighted_test_recall / total_test_support,
                                           weighted_test_f1 / total_test_support,
                                           total_test_support]

avg_test_report = avg_test_report.astype(float).round(2)

existing_file_path = '../results/hyperparameter_result5.xlsx'  
try:
    existing_data = pd.read_excel(existing_file_path, sheet_name='LTC')  
    startrow = len(existing_data) + 2  
except FileNotFoundError:
    startrow = 0  

with pd.ExcelWriter(existing_file_path, mode='a', engine='openpyxl', if_sheet_exists='overlay') as writer:
    avg_test_report.to_excel(writer, sheet_name='LTC', startrow=startrow, index=True)  


print("\nAverage Test Report Across 5 Folds:")
print(avg_test_report)



Average Test Report Across 5 Folds:
                                   Avg Precision  Avg Recall  Avg F1-score  \
Action on Issue                             0.08        1.00          0.14   
Bug Reproduction                            0.00        0.00          0.00   
Contribution and Commitment                 0.00        0.00          0.00   
Expected Behaviour                          0.00        0.00          0.00   
Investigation and Exploration               0.00        0.00          0.00   
Motivation                                  0.00        0.00          0.00   
Observed Bug Behaviour                      0.00        0.00          0.00   
Potential New Issues and Requests           0.00        0.00          0.00   
Social Conversation                         0.00        0.00          0.00   
Solution Discussion                         0.00        0.00          0.00   
Solution Usage                              0.00        0.00          0.00   
Task Progress              