## Lab

_Marilyn, Shiva, Olivier_

In [1]:
# Auto-reload imported modules before each cell runs, so edits to local
# modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
# Setup chunk

import time

# Custom utils
from utils import *

# Data wrangling
import pandas as pd
import numpy as np

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Text sanitization
import re
import nltk
from nltk.stem.snowball import SnowballStemmer

# English stop-word list. When the NLTK corpus is not installed locally the
# lookup raises LookupError: download the resource once, then look it up again.
try:
    stopwords = nltk.corpus.stopwords.words("english")
except LookupError:
    nltk.download("stopwords")
    stopwords = nltk.corpus.stopwords.words("english")

# English Snowball stemmer
stemmer = SnowballStemmer("english")

# Lang detection
#import langid
#from langid.langid import LanguageIdentifier, model
#identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

# Misc
from tqdm import tqdm
tqdm.pandas()

# Define the seed for reproducibility
# (passed as random_state to the sampling, train/test split, KFold and LinearSVC below)
SEED = 31415

# Define n_jobs
# (number of parallel workers handed to GridSearchCV below)
JOBS = 4

In [3]:
# Scikit time
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)

from sklearn.feature_extraction.text import (
    CountVectorizer, 
    TfidfTransformer, 
    TfidfVectorizer
)
from sklearn.pipeline import Pipeline, make_pipeline


from sklearn.model_selection import (
    train_test_split, 
    GridSearchCV, 
    KFold, 
    cross_val_score
)

from sklearn.metrics import (
    classification_report, 
    accuracy_score, 
    confusion_matrix
)

In [40]:
# NOTE(review): df_san is not referenced by any later cell shown here, and this
# cell's execution count (40) is out of sequence with its position — confirm it
# is still needed. read_pickle should only be used on trusted files.
df_san = pd.read_pickle("./data/sanitized.pkl")
df_san.head()

In [4]:
# Labelled training data; later cells read its "text" and "emotion" columns
df = pd.read_csv("data/MLUnige2021_train.csv")

# 2. Fitting

In [6]:
# Reproducible 50% sample of the training frame
df_sub = df.sample(frac=0.5, random_state=SEED)

# Hold out 20% of the sample for a final accuracy check
X_train, X_test, y_train, y_test = train_test_split(
    df_sub["text"],
    df_sub["emotion"],
    test_size=0.2,
    shuffle=True,
    random_state=SEED,
)

# 4-fold CV: the fold count was picked to match a 4-core machine, just to test
folds = KFold(n_splits=4, shuffle=True, random_state=SEED)

# Sanity check: print the shape of every split
for split_name, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name}: ", split.shape)

X_train:  (512000,)
X_test:  (128000,)
y_train:  (512000,)
y_test:  (128000,)


In [33]:
# Classifiers to tune. BernoulliNB() is currently left out of the run; add it
# back to this list to grid-search it with the "bernoullinb" grid below.
models = [LinearSVC()]

# Search space for the TfidfVectorizer step, shared by every classifier grid.
# Keys use the "<step name>__<parameter>" form, where make_pipeline names each
# step after its lowercased class name.
params_tfidf = {
    "tfidfvectorizer__max_df": [0.5, 0.8],
    "tfidfvectorizer__min_df": [9, 20, 100],
    "tfidfvectorizer__max_features": [500, 1000, 2000, 5000],
    # The NLTK `stopwords` list can be added here as a second candidate
    "tfidfvectorizer__stop_words": [None],
    "tfidfvectorizer__ngram_range": [(1, 1)],
    "tfidfvectorizer__use_idf": [True, False],
}
# Empty placeholders for the CountVectorizer / TfidfTransformer variants
params_countvectorizer = {}
params_tfidftransformer = {}

# One grid per classifier, keyed by the classifier's pipeline step name
params = {
    # alpha / fit_prior are not searched for BernoulliNB at the moment
    "bernoullinb": {
        **params_tfidf,
    },
    "linearsvc": {
        "linearsvc__random_state": [SEED],
        "linearsvc__dual": [True, False],
        **params_tfidf,
    },
}

# One TfidfVectorizer -> classifier pipeline per model.
# CountVectorizer(), optionally followed by TfidfTransformer(), can be
# substituted for the vectorizer step.
pipes = []
for model in models:
    pipe = make_pipeline(TfidfVectorizer(), model)
    pipes.append(pipe)

In [34]:
# Show the pipelines that the next cell grid-searches (one repr per pipeline)
for pipe in pipes:
    print(pipe)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('linearsvc', LinearSVC())])


In [35]:
# Grid-search every pipeline and keep one cv_results_ frame per pipeline
resdfs = []
for pipe in pipes:
    # The classifier is always the LAST step. Indexing with [1] would pick the
    # transformer in a 3-step CountVectorizer/TfidfTransformer/classifier
    # pipeline and make the `params` lookup fail with a KeyError.
    clf_name = pipe.steps[-1][0]
    print(clf_name)

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments during the (long) search
    start = time.perf_counter()

    gridsearch = GridSearchCV(
        pipe,
        params[clf_name],
        scoring="accuracy",
        cv=folds,
        n_jobs=JOBS,
        verbose=3,
    )
    gridsearch.fit(X_train, y_train)
    # Score the search's refitted best estimator on the held-out split
    y_pred = gridsearch.predict(X_test)

    score = accuracy_score(y_test, y_pred)
    resdf = pd.DataFrame(gridsearch.cv_results_)
    resdfs.append(resdf)

    print(f"Time {time.perf_counter() - start}s")
    print(f"Test accuracy: {score}")

linearsvc
Fitting 4 folds for each of 96 candidates, totalling 384 fits
Time 1403.363578081131s
Test accuracy: 0.7885859375


In [36]:
# Grid-search results for the first pipeline, best-ranked candidates first
resdfs[0].sort_values(by=["rank_test_score"]) 

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_linearsvc__dual,param_linearsvc__random_state,param_tfidfvectorizer__max_df,param_tfidfvectorizer__max_features,param_tfidfvectorizer__min_df,param_tfidfvectorizer__ngram_range,param_tfidfvectorizer__stop_words,param_tfidfvectorizer__use_idf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
91,18.229727,0.287411,1.611809,0.057283,False,31415,0.8,5000,9,"(1, 1)",,False,"{'linearsvc__dual': False, 'linearsvc__random_...",0.788398,0.788055,0.788492,0.787320,0.788066,0.000461,1
67,18.062269,0.299284,1.584996,0.071440,False,31415,0.5,5000,9,"(1, 1)",,False,"{'linearsvc__dual': False, 'linearsvc__random_...",0.788398,0.788055,0.788492,0.787320,0.788066,0.000461,1
93,18.066794,0.142911,1.619427,0.064970,False,31415,0.8,5000,20,"(1, 1)",,False,"{'linearsvc__dual': False, 'linearsvc__random_...",0.788438,0.788047,0.788391,0.787383,0.788064,0.000421,3
69,18.117394,0.118312,1.695312,0.170147,False,31415,0.5,5000,20,"(1, 1)",,False,"{'linearsvc__dual': False, 'linearsvc__random_...",0.788438,0.788047,0.788391,0.787383,0.788064,0.000421,3
45,14.030505,0.247007,1.617622,0.052953,True,31415,0.8,5000,20,"(1, 1)",,False,"{'linearsvc__dual': True, 'linearsvc__random_s...",0.788438,0.788047,0.788391,0.787383,0.788064,0.000421,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49,8.948052,0.292218,1.505437,0.033829,False,31415,0.5,500,9,"(1, 1)",,False,"{'linearsvc__dual': False, 'linearsvc__random_...",0.746422,0.746820,0.745656,0.743383,0.745570,0.001330,91
75,8.307357,0.116856,1.464836,0.059942,False,31415,0.8,500,20,"(1, 1)",,False,"{'linearsvc__dual': False, 'linearsvc__random_...",0.746422,0.746820,0.745656,0.743383,0.745570,0.001330,91
51,8.685737,0.237702,1.490620,0.009213,False,31415,0.5,500,20,"(1, 1)",,False,"{'linearsvc__dual': False, 'linearsvc__random_...",0.746422,0.746820,0.745656,0.743383,0.745570,0.001330,91
73,8.333922,0.090250,1.489895,0.060286,False,31415,0.8,500,9,"(1, 1)",,False,"{'linearsvc__dual': False, 'linearsvc__random_...",0.746422,0.746820,0.745656,0.743383,0.745570,0.001330,91


In [41]:
# Parameter sets that tie for first place in the first pipeline's grid search
res = resdfs[0]
is_best = res["rank_test_score"] == 1
res.loc[is_best, "params"].values

array([{'linearsvc__dual': False, 'linearsvc__random_state': 31415, 'tfidfvectorizer__max_df': 0.5, 'tfidfvectorizer__max_features': 5000, 'tfidfvectorizer__min_df': 9, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__stop_words': None, 'tfidfvectorizer__use_idf': False},
       {'linearsvc__dual': False, 'linearsvc__random_state': 31415, 'tfidfvectorizer__max_df': 0.8, 'tfidfvectorizer__max_features': 5000, 'tfidfvectorizer__min_df': 9, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__stop_words': None, 'tfidfvectorizer__use_idf': False}],
      dtype=object)

In [53]:
# Refit on the whole labelled training set (no sampling, no hold-out).
# This rebinds X_train / y_train, which previously held the sampled split.
X_train = df["text"]
y_train = df["emotion"]

# Sanity check
print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)

X_train:  (1280000,)
y_train:  (1280000,)


In [54]:
# Best configuration found by the grid search on the 50% sample
# (rank_test_score == 1), pinned here so the final fit does not require
# re-running the search.
models = [LinearSVC()]
opti_params = {
    "linearsvc__dual": False,
    # Use the notebook-wide seed rather than a hard-coded copy of its value
    "linearsvc__random_state": SEED,
    "tfidfvectorizer__max_df": 0.5,
    "tfidfvectorizer__max_features": 5000,
    "tfidfvectorizer__min_df": 9,
    "tfidfvectorizer__ngram_range": (1, 1),
    "tfidfvectorizer__stop_words": None,
    "tfidfvectorizer__use_idf": False,
}

# One TfidfVectorizer -> classifier pipeline per model, with the tuned
# hyper-parameters applied to both steps
pipes = []
for model in models:
    pipe = make_pipeline(TfidfVectorizer(), model)
    pipe.set_params(**opti_params)
    pipes.append(pipe)

print(pipes)

[Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_df=0.5, max_features=5000, min_df=9,
                                 use_idf=False)),
                ('linearsvc', LinearSVC(dual=False, random_state=31415))])]


In [55]:
# Load the unlabelled test set, then discard the metadata columns that are not
# used for prediction
df_test = pd.read_csv("./data/MLUnige2021_test.csv")
df_test = df_test.drop(["tweet_id", "date", "lyx_query", "user"], axis=1)
df_test.head()

Unnamed: 0,Id,text
0,0,working add oil
1,1,@KristianaNKOTB you're welcome
2,2,"is going to bed, work in the morning boo but t..."
3,3,@sparky_habbo - uni &amp; assignments happened...
4,4,Can't wait to have chinese food! Still disappo...


In [57]:
# Sanity check on the test frame size. NOTE(review): the recorded output has 3
# columns because this cell (count 57) was executed after the prediction cell
# (count 56) had added "emotion"; on a top-to-bottom run it would show 2.
df_test.shape

(320000, 3)

In [56]:
# Fit each tuned pipeline on the full training data and label the test tweets
for pipe in pipes:
    # The classifier is the last step whatever the number of preprocessing steps
    print(pipe.steps[-1][0])

    # Monotonic clock for interval timing
    start = time.perf_counter()

    pipe.fit(X_train, y_train)
    # Predictions are stored on df_test so the submission cell can export them.
    # With more than one pipeline, each iteration overwrites this column.
    df_test["emotion"] = pipe.predict(df_test["text"])

    print(f"Time {time.perf_counter() - start}s")

linearsvc
Time 51.75299334526062s


In [58]:
# Inspect the test tweets together with their predicted "emotion" label
df_test

Unnamed: 0,Id,text,emotion
0,0,working add oil,1
1,1,@KristianaNKOTB you're welcome,1
2,2,"is going to bed, work in the morning boo but t...",0
3,3,@sparky_habbo - uni &amp; assignments happened...,0
4,4,Can't wait to have chinese food! Still disappo...,0
...,...,...,...
319995,319995,@davidgregory How is it that MTP isn't in HiDe...,0
319996,319996,im thinking about this long shot for a long ti...,1
319997,319997,@meg___ i feel so old.. i'll be 22 in september,0
319998,319998,watching supernatural ahah poor they. in jail,0


In [59]:
# Keep only the submission columns. drop() already returns a new frame, so no
# explicit copy of df_test is needed beforehand.
df_test_final = df_test.drop(columns=["text"])
print(df_test_final.columns)
# index=False (the documented boolean) keeps the row index out of the CSV
df_test_final.to_csv("./data/submissions/linearsvc.csv", index=False)

Index(['Id', 'emotion'], dtype='object')


In [60]:
# Final submission frame: one predicted "emotion" per test "Id"
df_test_final

Unnamed: 0,Id,emotion
0,0,1
1,1,1
2,2,0
3,3,0
4,4,0
...,...,...
319995,319995,0
319996,319996,1
319997,319997,0
319998,319998,0
