## Lab

_Marilyn, Shiva, Olivier_

In [2]:
%load_ext autoreload
%autoreload 2

In [14]:
# Setup chunk

import time

# Custom utils
from utils import *

# Data wrangling
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt
%matplotlib inline

# Text sanitization
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

try:
    # Avoid error if you don't have the resource
    stopwords = nltk.corpus.stopwords.words("english")
    lemmatizer = WordNetLemmatizer()
except LookupError:
    nltk.download('wordnet')
    nltk.download("stopwords")
    stopwords = nltk.corpus.stopwords.words("english")
    
stemmer = SnowballStemmer(language="english")

# Lang detection
#import langid
#from langid.langid import LanguageIdentifier, model
#identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

# Misc
from tqdm import tqdm
tqdm.pandas()

# Define the seed for reproducibility
SEED = 31415

# Define n_jobs
JOBS = 7

In [4]:
# Scikit time
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)

from sklearn.feature_extraction.text import (
    CountVectorizer, 
    TfidfTransformer, 
    TfidfVectorizer
)
from sklearn.pipeline import Pipeline, make_pipeline


from sklearn.model_selection import (
    train_test_split, 
    GridSearchCV, 
    KFold, 
    cross_val_score
)

from sklearn.metrics import (
    classification_report, 
    accuracy_score, 
    confusion_matrix
)

In [9]:
# Import df
df = pd.read_csv("./data/MLUnige2021_train.csv", index_col="Id")
df = df.drop(columns=["tweet_id", "date", "lyx_query", "user"])
df.head()

  mask |= (ar1 == a)


Unnamed: 0_level_0,emotion,text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,@BreeMe more time to play with you BlackBerry ...
1,0,Failed attempt at booting to a flash drive. Th...
2,0,@msproductions Well ain't that the truth. Wher...
3,1,@Meaghery cheers Craig - that was really sweet...
4,0,I was reading the tweets that got send to me w...


In [15]:
def sanitize3(txt: str) -> str:
    """Preprocess text"""
    
    # Regex patterns
    urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    userPattern       = r"@[^\s]+"
    alphaPattern      = "[^a-zA-Z0-9]"
    sequencePattern   = r"(.)\1\1+"
    seqReplacePattern = r"\1\1"
    
    # Lowercase
    txt = txt.lower()
    # Url
    txt = re.sub(r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)", " URL", txt)
    # Twitter handle
    txt = re.sub(r"@[^\s]+", " USER", txt)
    # Remove non-alphanumeric chars
    txt = re.sub(r"[^a-zA-Z0-9]", " ", txt)
    # Replace consecutive chars by 2 repetitions
    txt = re.sub(r"(.)\1\1+", r"\1\1", txt)
    
    words = ""
    for word in txt.split():
        word = lemmatizer.lemmatize(word)
        words += (word + " ")
    
    return words    

In [17]:
df_san = df.copy()
df_san["sanitized"] = df["text"].progress_apply(sanitize)

# Export it to pickle so we don't have to redo it
df_san.to_pickle("./data/sanitized3.pkl")

100%|██████████████████████████████████████████████████████████████████████| 1280000/1280000 [02:39<00:00, 8025.90it/s]


In [18]:
df_san.head()

Unnamed: 0_level_0,emotion,text,sanitized
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,@BreeMe more time to play with you BlackBerry ...,usernam time play blackberri
1,0,Failed attempt at booting to a flash drive. Th...,fail attempt boot flash drive fail attempt swi...
2,0,@msproductions Well ain't that the truth. Wher...,usernam well truth damn auto lock disabl go co...
3,1,@Meaghery cheers Craig - that was really sweet...,usernam cheer craig realli sweet repli pump
4,0,I was reading the tweets that got send to me w...,read tweet got send lie phone face drop amp hi...


In [35]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(df_san["sanitized"], df_san["emotion"], 
                                                    test_size=0.1, shuffle=True, random_state=SEED)

In [36]:
# Fit vectorizer
vect = TfidfVectorizer(max_df=0.5, max_features=50000, min_df=9, ngram_range=(1,2))
vect.fit(X_train)

TfidfVectorizer(max_df=0.5, max_features=50000, min_df=9, ngram_range=(1, 2))

In [37]:
# Transform data
X_train = vect.transform(X_train)
X_test  = vect.transform(X_test)

In [38]:
#BerNB
start = time.time()

ber = BernoulliNB(alpha=2)
ber.fit(X_train, y_train)
y_pred = ber.predict(X_test)
score = accuracy_score(y_test, y_pred)

print(f"Time {time.time() - start}")
print(f"Test accuracy: {score}")

Time 0.3360428810119629
Test accuracy: 0.77121875


In [39]:
#LinearSVC
start = time.time()

svc = LinearSVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
score = accuracy_score(y_test, y_pred)

print(f"Time {time.time() - start}")
print(f"Test accuracy: {score}")

Time 23.18459463119507
Test accuracy: 0.7836328125


In [40]:
#LinearSVC
start = time.time()

logreg = LogisticRegression(C=2, n_jobs=-1)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
score = accuracy_score(y_test, y_pred)

print(f"Time {time.time() - start}")
print(f"Test accuracy: {score}")

Time 15.302069902420044
Test accuracy: 0.7864140625
