# Nested 5-Fold Cross Validation For Logistic Regression On Textual Features

In [1]:
import numpy as np
import pandas as pd
import pprint
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, classification_report, accuracy_score, make_scorer, confusion_matrix

# Shared pretty-printer (4-space indent) used later to display the label list
pp = pprint.PrettyPrinter(indent=4)

## Ignore warnings
# NOTE(review): called without a category or module, this filter silences every
# warning issued through the `warnings` module from this point on, including
# library warnings raised while fitting the model below.
import warnings
warnings.filterwarnings('ignore')

#### Use spaCy parser for word tokenization of a sentence:

In [2]:
import spacy
from spacy.lang.en import English

# Load the English language model
# NOTE(review): `nlp` is not referenced by any other code visible in this notebook — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')

# Create an instance of the English parser
# (this is the object `tokenize` calls to split a line into tokens)
parser = English()


#### Define stopwords as punctuation + common contractions:

In [3]:
from string import punctuation
from nltk.corpus import stopwords

# Tokens to discard: every ASCII punctuation character followed by a few contraction
# fragments and the ellipsis. NLTK's English stop-word list is currently not appended.
stop_words = [*punctuation, "'s", "'m", "n't", "'re", "-", "'ll", '...']  #+ stopwords.words('english')

#### Code to lemmatize and tokenize:

In [4]:
# One lemmatizer instance is shared by every call; building a new one per token is unnecessary work.
_LEMMATIZER = WordNetLemmatizer()


def get_lemma(item):
    """Return the WordNet lemma of a single word.

    Args:
        item (str): The word to lemmatize.

    Returns:
        str: The result of ``WordNetLemmatizer.lemmatize(item)``. No part of
        speech is passed, so NLTK's default (noun) applies.
    """
    return _LEMMATIZER.lemmatize(item)

def tokenize(line):
    """Split a line of text into normalized tokens.

    Args:
        line (str): Raw text to tokenize with the module-level ``parser``.

    Returns:
        list[str]: The kept tokens, in their original order:
            * whitespace-only tokens are dropped;
            * URL-like tokens are replaced by the placeholder ``'URL'``;
            * tokens starting with ``'@'`` are replaced by ``'SCREEN_NAME'``;
            * tokens found in ``stop_words`` are dropped;
            * every other token is lower-cased and passed through ``get_lemma``.
    """
    line_tokens = []
    for token in parser(line):
        if token.orth_.isspace():
            continue
        if token.like_url:
            line_tokens.append('URL')
        elif token.orth_.startswith('@'):
            line_tokens.append('SCREEN_NAME')
        # Compare the lower-cased form: stop_words holds lower-case entries, so an
        # upper-case fragment such as "'S" would otherwise pass the filter and then
        # be emitted in its lower-cased form "'s".
        elif token.lower_ not in stop_words:
            line_tokens.append(get_lemma(token.lower_))
    return line_tokens

In [5]:
### Load the corpus from the CSV file
all_data = pd.read_csv('../data/combined_data_oversampled.csv')

print(f"Size of corpus: {len(all_data)}")

Size of corpus: 38350


In [6]:
# Drop rows with a missing value in either the text column or the label column
all_data = all_data.dropna(subset=['Text Content', 'Code'])

In [7]:
# Labels whose rows are excluded from the data set
labels_to_remove = ["Testing", 'Future Plan', 'Issue Content Management']
all_data = all_data.loc[~all_data['Code'].isin(labels_to_remove)]

In [8]:
# Features are the raw text; targets are the label codes
X = all_data['Text Content'].values
y = all_data['Code'].values

# Distinct labels in sorted order
labels = sorted(set(y))

print(f"Number of unique labels: {len(labels)}")

pp.pprint(labels)

Number of unique labels: 13
[   'Action on Issue',
    'Bug Reproduction',
    'Contribution and Commitment',
    'Expected Behaviour',
    'Investigation and Exploration',
    'Motivation',
    'Observed Bug Behaviour',
    'Potential New Issues and Requests',
    'Social Conversation',
    'Solution Discussion',
    'Solution Usage',
    'Task Progress',
    'Workarounds']


# Nested Cross-Validation on Logistic Regression:

In [9]:
pipeline = Pipeline([
    # A custom tokenizer overrides the default token regex, so token_pattern is
    # set to None to state explicitly that it is not used.
    ('vect', TfidfVectorizer(tokenizer=tokenize, token_pattern=None)),
    # The default iteration cap (100) can stop the solver before it converges, and
    # the global warnings filter in this notebook would hide the ConvergenceWarning.
    ('clf', LogisticRegression(max_iter=1000))
])

### Hyperparameters to search
# Keys use the `<step name>__<parameter>` form so they address the pipeline steps above.
# NOTE(review): no code visible in this notebook passes `parameters` to a search
# object — confirm where (or whether) this grid is consumed.
parameters = {
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams only, or unigrams + bigrams
    'clf__C': (0.01, 0.1, 1, 10),
}


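## Nested Cross Validation using GridSearch

> Note: the cell below only fits the pipeline, as configured above, once on the full data set. The `parameters` grid is not searched and no cross-validation is run in the cells shown here.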

In [10]:
# Fit the vectorizer and the classifier on all of X / y (no held-out split is made in this cell)
pipeline.fit(X, y)

In [11]:
# Spot-check the fitted pipeline on one example; predict receives a one-element list of documents
print(pipeline.predict(["@DenVys thank you so much!"]))

['Social Conversation']


In [12]:
# Spot-check on a longer, technical sentence (again passed as a one-element list)
print(pipeline.predict(["For JAX, we may want to rely on Pallas. For TF, since we can't rely on custom ops, we may have to skip support."]))

['Potential New Issues and Requests']


In [13]:
# Spot-check on a title-like sentence (again passed as a one-element list)
print(pipeline.predict(["FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness."]))

['Expected Behaviour']
