# INIT

In [54]:
import pandas as pd
import pickle
import spacy
import time
import re
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
df = pd.read_csv("data/training.1600000.processed.noemoticon.csv", encoding='latin-1', header = None, names=['sentiment', 'id', 'date', 'query', 'name', 'tweet'])

In [4]:
df

Unnamed: 0,sentiment,id,date,query,name,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [5]:
def cleanText(data):
    lemma_text = []
    for text in data:
        nlp_text = nlp(text)
        buffer = []
        for token in nlp_text:
            lemma_token = token.lemma_
            buffer.append(lemma_token)
        lemma_text.append(buffer)
    filtered_text = []
    for text in lemma_text:
        buffer = []
        for token in text:
            term = nlp.vocab[token]
            if term.is_stop == False:
                buffer.append(token)
        filtered_text.append(buffer)
    refiltered_text = []
    for text in filtered_text:
        buffer = []
        for token in text:
            if re.sub(r"[^a-zA-ZÀ-ÿ]+", ' ', token) != ' ':
                buffer.append(re.sub(r"[^a-zA-ZÀ-ÿ]+", '', token))
        refiltered_text.append(buffer)
    my_stopwords = ['PRON', 'Twitter', 'RT']
    second_filter = []
    for text in refiltered_text:
        for token in text:
            if token not in my_stopwords and token.find('http') == -1 and len(token) > 2:
                second_filter.append(token)
    return second_filter

# DATA TREATMENT

In [6]:
with open('data2.pickle', 'rb') as handle:
    data = pickle.load(handle)

In [7]:
x = data['tweet']
y = data['sentiment']

In [8]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25)

In [61]:
X_train = []
for tweet in xtrain:
    string = str(tweet).strip('[]')   
    X_train.append(re.sub("'", '', string))
X_trainNP = np.array(X_train)


KeyboardInterrupt



In [None]:
X_test = []
for tweet in xtest:
    string = str(tweet).strip('[]')   
    X_test.append(re.sub("'", '', string))
X_testNP = np.array(X_test)

# MODELS

## Randomforest

In [128]:
rfc = Pipeline([('tfidf', TfidfVectorizer()), ('rfc', RandomForestClassifier(n_estimators=400, max_features='auto', criterion='gini', max_depth=40, random_state=0))])

In [129]:
rfc.fit(X_train, ytrain)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('rfc',
                 RandomForestClassifier(max_depth=40, n_estimators=400,
                                        random_state=0))])

with open('model.pickle', 'wb') as handle:
    pickle.dump(rfc, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [3]:
with open('model.pickle', 'rb') as handle:
    model = pickle.load(handle)

In [130]:
ypred = rfc.predict(X_test)

In [131]:
accuracy_score(ytest, ypred)

0.7424145833333333

In [100]:
y_pred = model.predict(X_test)
accuracy_score(ytest, y_pred)

0.743821875

In [72]:
print(classification_report(ytest, ypred, digits=4))

              precision    recall  f1-score   support

           0     0.7562    0.7178    0.7365    319218
           4     0.7327    0.7698    0.7508    320782

    accuracy                         0.7438    640000
   macro avg     0.7444    0.7438    0.7436    640000
weighted avg     0.7444    0.7438    0.7436    640000



### Tests

In [17]:
sentence = "I really love you"
cleaned = cleanText([sentence])

test = []
for tweet in cleaned:
    string = str(tweet).strip('[]')   
    test.append(re.sub("'", '', string))
np.array(test)

pred = model.predict(test)
if pred[0] == 0:
    print("Négatif")
elif pred[0] == 4:
    print("Positif")

print(test)

Positif
['love']


## LogisticRegression

In [22]:
rfc = Pipeline([('tfidf', TfidfVectorizer()), ('rfc', LogisticRegression(max_iter=1000))])

In [23]:
rfc.fit(X_train, ytrain)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('rfc', LogisticRegression(max_iter=1000))])

In [24]:
ypred = rfc.predict(X_test)

In [25]:
accuracy_score(ytest, ypred)

0.768765

In [21]:
print(classification_report(ytest, ypred, digits=4))

              precision    recall  f1-score   support

           0     0.7820    0.7453    0.7632    200020
           4     0.7567    0.7922    0.7740    199980

    accuracy                         0.7688    400000
   macro avg     0.7693    0.7688    0.7686    400000
weighted avg     0.7693    0.7688    0.7686    400000



### Tests

In [27]:
sentence = "i want you to have a good life"
cleaned = cleanText([sentence])

test = []
for tweet in cleaned:
    string = str(tweet).strip('[]')   
    test.append(re.sub("'", '', string))
np.array(test)

pred = rfc.predict(test)
if pred[0] == 0:
    print("Négatif")
elif pred[0] == 4:
    print("Positif")

print(test)

Négatif
['want', 'good', 'life']


In [28]:
with open('model2.pickle', 'wb') as handle:
    pickle.dump(rfc, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Naive Bayes

## Categorical

In [55]:
nvc = make_pipeline(
     CountVectorizer(),  
     CategoricalNB()
)

In [60]:
nvc.fit(X_trainNP, ytrain)

MemoryError: Unable to allocate 4.45 TiB for an array with shape (1200000, 510066) and data type int64

In [None]:
ypred = nvc.predict(X_test)

In [None]:
accuracy_score(ytest, ypred)

In [None]:
print(classification_report(ytest, ypred, digits=4))

## Tests

## Gaussian

In [43]:
nvg = Pipeline([('tfidf', TfidfVectorizer()), ('rfc', LinearSVC())])

In [44]:
nvg.fit(X_train, ytrain)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('rfc', LinearSVC())])

In [45]:
ypred = nvg.predict(X_test)

In [46]:
accuracy_score(ytest, ypred)

0.7588025

In [47]:
print(classification_report(ytest, ypred, digits=4))

              precision    recall  f1-score   support

           0     0.7653    0.7465    0.7558    200020
           4     0.7526    0.7711    0.7617    199980

    accuracy                         0.7588    400000
   macro avg     0.7590    0.7588    0.7588    400000
weighted avg     0.7590    0.7588    0.7588    400000



# Export