## ML competition
### Predict a test set

_Marilyn, Shiva, Olivier_

Generate a submission from the test set in the `data` folder.

In [6]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# Setup chunk

import time

# Custom utils
from utils import *

# Data wrangling
import pandas as pd
import numpy as np

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Text sanitization
import re
import nltk
from nltk.stem.snowball import SnowballStemmer

try:
    # Avoid error if you don't have the resource
    stopwords = nltk.corpus.stopwords.words("english")
except LookupError:
    nltk.download("stopwords")
    stopwords = nltk.corpus.stopwords.words("english")
    
stemmer = SnowballStemmer(language="english")

# Lang detection
#import langid
#from langid.langid import LanguageIdentifier, model
#identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

# Misc
from tqdm import tqdm
tqdm.pandas()

# Define the seed for reproducibility
SEED = 31415

# Define n_jobs
JOBS = 7

In [8]:
# Scikit time
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)

from sklearn.feature_extraction.text import (
    CountVectorizer, 
    TfidfTransformer, 
    TfidfVectorizer
)
from sklearn.pipeline import Pipeline, make_pipeline


from sklearn.model_selection import (
    train_test_split, 
    GridSearchCV, 
    KFold, 
    cross_val_score
)

from sklearn.metrics import (
    classification_report, 
    accuracy_score, 
    confusion_matrix
)

In [9]:
df = pd.read_csv("data/MLUnige2021_train.csv")

In [10]:
try:
    df_san = pd.read_pickle("./data/sanitized.pkl")
except FileNotFoundError:
    print("No pickle file found, sanitizing existing df")
    
    # Sanitize whole dataset
    df_san = df.copy()
    df_san["sanitized"] = df["text"].progress_apply(sanitize)

    # Export it to pickle so we don't have to redo it
    df_san.to_pickle("./data/sanitized.pkl")

In [11]:
df_san.head()

Unnamed: 0,Id,emotion,tweet_id,date,lyx_query,user,text,sanitized
0,0,1,2063391019,Sun Jun 07 02:28:13 PDT 2009,NO_QUERY,BerryGurus,@BreeMe more time to play with you BlackBerry ...,time play blackberri
1,1,0,2000525676,Mon Jun 01 22:18:53 PDT 2009,NO_QUERY,peterlanoie,Failed attempt at booting to a flash drive. Th...,fail attempt boot flash drive fail attempt swi...
2,2,0,2218180611,Wed Jun 17 22:01:38 PDT 2009,NO_QUERY,will_tooker,@msproductions Well ain't that the truth. Wher...,well truth damn auto lock disabl go copi past ...
3,3,1,2190269101,Tue Jun 16 02:14:47 PDT 2009,NO_QUERY,sammutimer,@Meaghery cheers Craig - that was really sweet...,cheer craig realli sweet repli pump
4,4,0,2069249490,Sun Jun 07 15:31:58 PDT 2009,NO_QUERY,ohaijustin,I was reading the tweets that got send to me w...,read tweet got send lie phone face drop amp hi...


# 2. Fitting

## 1. with manual preprocessing

In [12]:
# On whole training dataset
X_train, y_train = df_san["text"], df_san["emotion"]

In [14]:
# Sanity check
print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)

X_train:  (1280000,)
y_train:  (1280000,)


In [15]:
#Import real test set
df_test = pd.read_csv("./data/MLUnige2021_test.csv").drop(columns=["tweet_id", "date", "lyx_query", "user"])
df_test.head()

Unnamed: 0,Id,text
0,0,working add oil
1,1,@KristianaNKOTB you're welcome
2,2,"is going to bed, work in the morning boo but t..."
3,3,@sparky_habbo - uni &amp; assignments happened...
4,4,Can't wait to have chinese food! Still disappo...


In [16]:
# Preprocess tweets
df_test["text"] = df_test["text"].progress_apply(sanitize)
df_test.head() #seems good

100%|████████████████████████████████████████████████████████████████████████| 320000/320000 [01:05<00:00, 4849.63it/s]


Unnamed: 0,Id,text
0,0,work add oil
1,1,usernam welcom
2,2,go bed work morn boo get see b
3,3,usernam uni amp assign happen goodnight mr sparkl
4,4,wait chines food still disappoint ocharley sto...


In [17]:
# Keep the chosen model(s) with the optimal CV params
models = [BernoulliNB()]

params_tfid = {
    #"tfidfvectorizer__strip_accents": ["unicode"],
}

params = {
    "bernoullinb": {
        "bernoullinb__alpha": [7.5],
        #"bernoullinb__fit_prior": [False, True],
    }
}

# If we also want to gridsearch the different Tfidf params
for k, v in params_tfid.items():
    #params["bernoullinb"][k] = v
    #Easier if we comment above
    pass

pipes = []

for model in models:
    #pipe = make_pipeline(TfidfVectorizer(), model)
    #pipe = make_pipeline(CountVectorizer(lowercase=True, stop_words=None, ngram_range=(1, 3), analyzer='word', max_df=1.0, min_df=1, binary=True, verbose=3), model)
    pipes.append(pipe)
    
    # Set optimal parameters
    pipe.set_params(**params[pipe.steps[1][0]])

In [18]:
# Fit the chosen pipe
# I keep the list because we could have multiple optimal models to fit
for pipe in pipes:
    print(pipe.steps[1][0])
    start = time.time()
    
    # Fit
    pipe.fit(X_train, y_train)
    
    # Predict real data
    df_test["emotion"] = pipe.predict(df_test["text"])
    
    print(f"Time {time.time() - start}s")  

bernoullinb
Time 209.34469556808472s


In [19]:
df_test_final = df_test.copy().drop(["text"], axis=1)
df_test_final.to_csv("./data/submissions/bernoullinb.csv", index=None)

In [20]:
df_test_final

Unnamed: 0,Id,emotion
0,0,1
1,1,1
2,2,0
3,3,1
4,4,1
...,...,...
319995,319995,1
319996,319996,1
319997,319997,0
319998,319998,0
