## ML competition
### Predict a test set

_Marilyn, Shiva, Olivier_

Generate a submission from the test set in the `data` folder.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Setup chunk

import time

# Custom utils
from utils import *

# Data wrangling
import pandas as pd
import numpy as np

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Text sanitization
import re
import nltk
from nltk.stem.snowball import SnowballStemmer

try:
    # Avoid error if you don't have the resource
    stopwords = nltk.corpus.stopwords.words("english")
except LookupError:
    nltk.download("stopwords")
    stopwords = nltk.corpus.stopwords.words("english")
    
stemmer = SnowballStemmer(language="english")

# Lang detection
#import langid
#from langid.langid import LanguageIdentifier, model
#identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

# Misc
from tqdm import tqdm
tqdm.pandas()

# Define the seed for reproducibility
SEED = 31415

# Define n_jobs
JOBS = 7

  from pandas import Panel


In [4]:
# Scikit time
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)

from sklearn.feature_extraction.text import (
    CountVectorizer, 
    TfidfTransformer, 
    TfidfVectorizer
)
from sklearn.pipeline import Pipeline, make_pipeline


from sklearn.model_selection import (
    train_test_split, 
    GridSearchCV, 
    KFold, 
    cross_val_score
)

from sklearn.metrics import (
    classification_report, 
    accuracy_score, 
    confusion_matrix
)

In [5]:
df = pd.read_csv("MLUnige2021_train.csv")

In [6]:
try:
    df_san = pd.read_pickle("./data/sanitized-rec.pkl")
except FileNotFoundError:
    print("No pickle file found, sanitizing existing df")
    
    # Sanitize whole dataset
    df_san = df.copy()
    df_san["sanitized"] = df["text"].progress_apply(sanitize)

    # Export it to pickle so we don't have to redo it
    df_san.to_pickle("./data/sanitized-rec.pkl")

  0%|          | 330/1280000 [00:00<06:30, 3276.14it/s]

No pickle file found, sanitizing existing df


100%|██████████| 1280000/1280000 [03:47<00:00, 5633.64it/s]


In [7]:
print(df_san.shape)
df_san.head()

(1280000, 8)


Unnamed: 0,Id,emotion,tweet_id,date,lyx_query,user,text,sanitized
0,0,1,2063391019,Sun Jun 07 02:28:13 PDT 2009,NO_QUERY,BerryGurus,@BreeMe more time to play with you BlackBerry ...,breem time play blackberri
1,1,0,2000525676,Mon Jun 01 22:18:53 PDT 2009,NO_QUERY,peterlanoie,Failed attempt at booting to a flash drive. Th...,fail attempt boot flash drive fail attempt swi...
2,2,0,2218180611,Wed Jun 17 22:01:38 PDT 2009,NO_QUERY,will_tooker,@msproductions Well ain't that the truth. Wher...,msproduct well truth damn auto lock disabl go...
3,3,1,2190269101,Tue Jun 16 02:14:47 PDT 2009,NO_QUERY,sammutimer,@Meaghery cheers Craig - that was really sweet...,meagheri cheer craig realli sweet repli pump
4,4,0,2069249490,Sun Jun 07 15:31:58 PDT 2009,NO_QUERY,ohaijustin,I was reading the tweets that got send to me w...,read tweet got send lie phone face drop amp hi...


In [9]:
print("Before sanitizing", df['text'].apply(lambda x: len(x.split(' '))).sum())
print("After sanitizing", df_san['sanitized'].apply(lambda x: len(x.split(' '))).sum())

Before sanitizing 18398298
After sanitizing 10730388


# 2. Fitting

In [10]:
vectorizer = CountVectorizer(lowercase=True, stop_words=None, ngram_range=(1, 3), analyzer='word', max_df=1.0, min_df=1, binary=True)
df_san_vec = vectorizer.fit_transform(df_san['sanitized'])
type(df_san_vec)


scipy.sparse.csr.csr_matrix

In [23]:
X_train, y_train = df_san_vec, df_san["emotion"]

#only 4 folds because I have 4 cores, just to test
folds = KFold(n_splits=4, shuffle=True, random_state=SEED)

In [25]:
# Sanity check
print("X_train: ", X_train.shape)

print("y_train: ", y_train.shape)


X_train:  (1280000, 9956945)
y_train:  (1280000,)


In [26]:
#Import real test set
df_test = pd.read_csv("MLUnige2021_test.csv").drop(columns=["tweet_id", "date", "lyx_query", "user"])
df_test.head()

Unnamed: 0,Id,text
0,0,working add oil
1,1,@KristianaNKOTB you're welcome
2,2,"is going to bed, work in the morning boo but t..."
3,3,@sparky_habbo - uni &amp; assignments happened...
4,4,Can't wait to have chinese food! Still disappo...


In [27]:
# Preprocess tweets
df_test["text"] = df_test["text"].progress_apply(sanitize)
df_test.head() #seems good

100%|██████████| 320000/320000 [00:57<00:00, 5591.72it/s]


Unnamed: 0,Id,text
0,0,work add oil
1,1,kristianankotb welcom
2,2,go bed work morn boo get see b
3,3,sparky_habbo uni amp assign happen goodnight ...
4,4,wait chines food still disappoint ocharley sto...


In [28]:
df_test_vec = vectorizer.fit_transform(df_test['text'])
type(df_test_vec)

scipy.sparse.csr.csr_matrix

In [29]:
X_test = df_test_vec


In [30]:
# Sanity check
print("X_test: ", X_test.shape)



X_test:  (320000, 2924225)


In [32]:
berNB = Pipeline(
    [
    
        ("clf", BernoulliNB()),
    ]
)

start = time.time()

CV_ber = cross_val_score(
    berNB, X_train, y_train, scoring="accuracy", cv=folds, n_jobs=-1
)

berNB.fit(X_train, y_train)


"""
Whole dataset:
Time 21.568854093551636
Mean CV accuracy: 0.7639306640625
Test accuracy: 0.76375390625

10% sample:
Time 2.146036386489868
Mean CV accuracy: 0.7483203125
Test accuracy: 0.7523828125
"""

#("tfidf", TfidfVectorizer())

'\nWhole dataset:\nTime 21.568854093551636\nMean CV accuracy: 0.7639306640625\nTest accuracy: 0.76375390625\n\n10% sample:\nTime 2.146036386489868\nMean CV accuracy: 0.7483203125\nTest accuracy: 0.7523828125\n'

In [35]:

df_test["emotion"] = berNB.predict(X_test)
    
    

ValueError: Expected input with 9956945 features, got 2924225 instead

In [14]:
df_test_final = df_test.copy().drop(["text"], axis=1)


df_test_final.to_csv("./data/submissions/bernoullinb.csv", index=None)

NameError: name 'df_test' is not defined

In [15]:
df_test_final

NameError: name 'df_test_final' is not defined