In [1]:
# import pandas as pd
# import numpy as np
# import re
#Importing everything from NLP Week 1 - following that as a guide for now
import gzip
import json
import os
import random
import re
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
pd.options.display.max_rows = 100
pd.options.display.width = 150
RANDOM_SEED = 696


In [2]:
# Path to the WikiLarge training CSV. The default is the original local copy;
# set the WIKILARGE_TRAIN_CSV environment variable to point somewhere else so
# the notebook can run on another machine without editing this cell.
WIKILARGE_TRAIN_CSV = os.environ.get(
    'WIKILARGE_TRAIN_CSV',
    r'/Users/staceybruestle/Documents/Documents/Education/MADS/Coursework/aSIADS696- Milestone 2/Readability Project/Data/WikiLarge_Train.csv',
)
WikiLarge_Train_df = pd.read_csv(WIKILARGE_TRAIN_CSV)

In [3]:
# Shuffle once with a fixed seed, then cut into 80% train / 10% dev / 10% test.
# Positional .iloc slicing yields the same three partitions as
# np.split(shuffled, [train_end, dev_end], axis=0), but avoids passing a
# DataFrame through np.split, which relies on the deprecated DataFrame.swapaxes.
shuffled_df = WikiLarge_Train_df.sample(frac=1, random_state=RANDOM_SEED)
n_rows = len(shuffled_df)
train_end = int(.8 * n_rows)
dev_end = int(.9 * n_rows)

train_df = shuffled_df.iloc[:train_end]
dev_df = shuffled_df.iloc[train_end:dev_end]
test_df = shuffled_df.iloc[dev_end:]

# Every row must land in exactly one partition.
assert len(train_df) + len(dev_df) + len(test_df) == n_rows

# Make list of labels
y_train = list(train_df.label)
y_dev = list(dev_df.label)
y_test = list(test_df.label)
# Shapes recorded from the original run: 333414, 41677, 41677 all with 5 columns

Create the Dummy Classifiers to use as reference for other scores

In [4]:
def dummyClassifierScores(train_df, dev_df, vectorizer, min_df=1, stop_words= None, strategy='uniform'):
    """Return the macro-averaged F1 of a DummyClassifier baseline on ``dev_df``.

    Parameters
    ----------
    train_df, dev_df : DataFrame
        Frames with an ``original_text`` column (input text) and a ``label``
        column (target). Labels are read from these frames, so the function
        scores whatever data it is given rather than notebook-level globals.
    vectorizer : object with ``fit_transform`` / ``transform``
        Vectorizer used to build the feature matrices. It is (re)fitted on
        ``train_df.original_text`` here, so an unfitted instance is fine.
    min_df, stop_words
        Not used by this function; kept so existing calls keep working.
    strategy : str
        Passed to ``DummyClassifier`` (e.g. 'uniform' or 'most_frequent').

    Returns
    -------
    float
        Macro-averaged F1 of the dummy predictions on ``dev_df``.
    """
    X_train = vectorizer.fit_transform(train_df.original_text)
    X_dev = vectorizer.transform(dev_df.original_text)

    # Take the labels from the frames passed in. Relying on the global
    # y_train / y_dev silently mis-scores (or fails on a length mismatch)
    # whenever a different train/dev pair is supplied.
    train_labels = list(train_df.label)
    dev_labels = list(dev_df.label)

    dummy = DummyClassifier(strategy= strategy, random_state = RANDOM_SEED, constant=None)
    dummy.fit(X_train, train_labels)

    # Generate the predictions
    dev_preds = dummy.predict(X_dev)

    # Score the predictions
    f1_dummy = f1_score(dev_labels, dev_preds, average='macro')

    return f1_dummy

Create a function for the steps so we can run it for various amounts of data to see the difference

In [11]:
# Returns the macro-averaged F1 on the dev data, plus the dummy baseline F1 when requested.
def train_and_score(train_df, dev_df, min_df=1, max_df = 1.0, max_iter=100, C=1.0, \
                    ngram_range=(1,1), stop_words= None, dummy='no', strategy='uniform'):
    """Fit TF-IDF + logistic regression on ``train_df`` and score it on ``dev_df``.

    Parameters
    ----------
    train_df, dev_df : DataFrame
        Frames with ``original_text`` (input text) and ``label`` (target) columns.
    min_df, max_df, stop_words, ngram_range
        Forwarded to ``TfidfVectorizer``.
    max_iter, C
        Forwarded to ``LogisticRegression``.
    dummy : str
        When equal to 'yes', the dummy-classifier baseline is scored as well.
    strategy : str
        Strategy forwarded to ``dummyClassifierScores`` when ``dummy == 'yes'``.

    Returns
    -------
    float or tuple
        The model's macro F1 on the dev data; or ``(model_f1, dummy_f1)`` when
        ``dummy == 'yes'``.
    """
    # Build the features: fit the vocabulary on train only, then reuse it for dev.
    tfidf = TfidfVectorizer(min_df= min_df, max_df= max_df, stop_words= stop_words, ngram_range= ngram_range)
    train_features = tfidf.fit_transform(train_df.original_text)
    dev_features = tfidf.transform(dev_df.original_text)

    train_labels = list(train_df.label)
    dev_labels = list(dev_df.label)

    # 'newton-cholesky' did not work in the environment this was last run in,
    # so the solver is 'newton-cg'.
    model = LogisticRegression(
        solver='newton-cg',
        multi_class='ovr',
        C=C,
        max_iter=max_iter,
        random_state=RANDOM_SEED,
    )
    model.fit(train_features, train_labels)

    # Predict on dev and score.
    model_f1 = f1_score(dev_labels, model.predict(dev_features), average='macro')

    if dummy != 'yes':
        return model_f1

    baseline_f1 = dummyClassifierScores(train_df, dev_df, tfidf, min_df, stop_words, strategy= strategy)
    return (model_f1, baseline_f1)


In [8]:
print("All defaults with uniform dummy:")
train_and_score(train_df, dev_df, dummy='yes')

All defaults with uniform dummy:


(0.6834110283504679, 0.49840355363816646)

In [9]:
# Need to create the vectorizer to run dummyClassifierScores by itself
vectorizer = TfidfVectorizer()
print("All defaults - most frequent dummy:", \
dummyClassifierScores(train_df, dev_df, vectorizer, strategy= 'most_frequent') )

All defaults - most frequent dummy: 0.333061289806369


In [12]:
print("default with stopwords:", \
train_and_score(train_df, dev_df, stop_words= 'english') )

default with stopwords: 0.6605484429405641


In [13]:
train_and_score(train_df, dev_df, min_df=10)

0.6921716051041282

In [14]:
train_and_score(train_df, dev_df, min_df=15)

0.6925312886757502

In [15]:
train_and_score(train_df, dev_df, min_df=25)

0.6931104461778711

In [16]:
train_and_score(train_df, dev_df, min_df=30)

0.6925343220490136

In [17]:
train_and_score(train_df, dev_df, min_df=35)

0.6926548631383732

In [18]:
train_and_score(train_df, dev_df, min_df=24)

0.692797250901643

In [19]:
train_and_score(train_df, dev_df, min_df=26)

0.6925819160350151

Leaving all other parameters at their defaults and varying only min_df, min_df = 25 gives the best score: 0.6931104461778711

In [20]:
train_and_score(train_df, dev_df, min_df=25, max_df = .8)

0.6931104461778711

In [21]:
train_and_score(train_df, dev_df, min_df=25, max_df = .9)

0.6931104461778711

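Why didn't things change using a different max_df? Likely explanation: max_df only drops terms whose document frequency exceeds the threshold, so if no term appears in more than 80% of the sentences, every value from 0.8 up to 1.0 produces the same vocabulary and therefore the same score.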

In [22]:
train_and_score(train_df, dev_df, min_df=25, max_df = .1)

0.6880147795379578

In [23]:
train_and_score(train_df, dev_df, min_df=25, max_df = .7)

0.6931104461778711

In [24]:
train_and_score(train_df, dev_df, min_df=25, max_df = .6)

0.6932060383885735

Interesting. A max_df of 0.1 lowered the score, but not by as much as I would have expected, and 0.6 raised it slightly.

In [25]:
train_and_score(train_df, dev_df, min_df=25, max_df = 0.5)

0.6932060383885735

0.5 gave the same score as 0.6, while 0.4 went down (that run is not shown here). So 0.5 or 0.6 looks like a good value for max_df.

In [26]:
train_and_score(train_df, dev_df, min_df=25, max_df = .9)

0.6914490573484454

Let's try a different max_iter.

In [27]:
train_and_score(train_df, dev_df, min_df=25, max_df = .9, max_iter=200)

0.6931104461778711

In [28]:
train_and_score(train_df, dev_df, min_df=25, max_df = .9, max_iter=300)

0.6931104461778711

In [29]:
train_and_score(train_df, dev_df, min_df=25, max_df = .9, max_iter=400)

0.6931104461778711

In [30]:
train_and_score(train_df, dev_df, min_df=25, max_df = .9, max_iter=500)

0.6931104461778711

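Why aren't we seeing a change at all? Likely explanation: max_iter is only an upper bound on solver iterations, so if the solver already converges in fewer than 100 iterations, raising the cap cannot change the fitted model.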

In [31]:
train_and_score(train_df, dev_df, min_df=25, max_df = .9, max_iter=1000)

0.6931104461778711

In [32]:
train_and_score(train_df, dev_df, min_df=25, max_df = .9, C= .9)

0.6932544131304068

In [33]:
train_and_score(train_df, dev_df, min_df=25, max_df = .9, C= .8)

0.6929172936912404

In [34]:
train_and_score(train_df, dev_df, min_df=25, max_df = .9, C= .7)

0.6925834197233214

The best C so far is 0.9 (0.6932544131304068). Let's try slightly higher and slightly lower values.

In [35]:
train_and_score(train_df, dev_df, min_df=25, max_df = .9, C= .85)

0.6930620712550157

In [36]:
train_and_score(train_df, dev_df, min_df=25, max_df = .9, C= .95)

0.6930620712550157

In [37]:
train_and_score(train_df, dev_df, min_df=25, max_df = .9, C= .9)

0.6932544131304068

C = 0.9 looks best: 0.6932544131304068
Now experiment with ngram_range.

In [38]:
train_and_score(train_df, dev_df, min_df=25, max_df = .9, C= .9, ngram_range=(1,3))

0.7101257948813133

In [39]:
train_and_score(train_df, dev_df, min_df=25, max_df = .9, C= .9, ngram_range=(1,4))

0.7104838972466732

In [None]:
# train_and_score(train_df, dev_df, min_df=1, max_df = 1.0, max_iter=100, C=1.0, \
#                     ngram_range=(1,1), stop_words= None, dummy='no', strategy='uniform')