#### Imports

In [None]:
#%pip install nltk
#%pip install spacy
#%pip install mljar-supervised
#%pip install lime

In [1]:
# default
import pandas as pd
import numpy as np

# tokenização e pre-processamento
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# coisas de ml
from sklearn.model_selection import train_test_split

# modelos de ml
from supervised.automl import AutoML
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# métricas e explicabilidade
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from lime import lime_text
from lime.lime_text import LimeTextExplainer

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


## Main

In [2]:
df = pd.read_csv("tweets.csv",encoding='ISO-8859-1', names=['emotion', 'id', 'date', 'query', 'user', 'text'])
df = df.drop(columns=['id', 'date', 'query', 'user'])
df.head()

Unnamed: 0,emotion,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [3]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pedro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pedro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
stop_words = set(stopwords.words('english'))

stemmer = PorterStemmer()

vocab = {}

def normalize_text(text):
    global vocab
    # Normalização de case
    text = text.lower()
    # Remoção de usuários e links
    # - (tem que fazer antes da tokenização pq ele não tava tirando as menções, já q o '@' ficou como um token separado)
    # - o .isalpha() dps do .split() iria tirar toda palavra que tivesse terminando com um acento ou vírgula, ele deve ser usado dps do word_tokenize
    text = ' '.join([word for word in text.split() if not(('@' in word) or ('.com' in word))]) 
    # Tokenização
    text = word_tokenize(text)
    # Remoção de stopwords
    text_norm = []
    for word in text:
        # Remoção de stopwords
        if((word not in stop_words) and (word.isalpha())):
            # Stemming
            word = stemmer.stem(word)
            # Adicionando à nova lista/texto
            text_norm.append(word)
            # Criação do vocabulário (p/ filtragem por freq.)
            if(word in vocab.keys()):
                vocab[word] += 1
            else:
                vocab[word] = 1
    
    return text_norm #(ainda não juntei p/ n ter q separar td dnv na filtragem por freq.)

def freq_filter(text):
    global vocab
    text = [word for word in text if vocab[word] > 1]
    return " ".join(text)

df["text_norm"] = df["text"].apply(normalize_text).apply(freq_filter)

df.head()

Unnamed: 0,emotion,text,text_norm
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww bummer shoulda got david carr third day
1,0,is upset that he can't update his Facebook by ...,upset ca updat facebook text might cri result ...
2,0,@Kenichan I dived many times for the ball. Man...,dive mani time ball manag save rest go bound
3,0,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,"@nationwideclass no, it's not behaving at all....",behav mad ca see


In [9]:
df.loc[df['emotion']==4, 'emotion'] = 1 # mudando a classe positiva de 4 para 1

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["text_norm"])

X_train, X_test, y_train, y_test = train_test_split(X, df["emotion"], test_size=0.3)
X.shape

(1600000, 81823)

X.shape:
* antes: (1600000, 460181)
* dps do filtro: (1600000, 169304)
* com filtragem de user e links antes (+ filtro de freq): (1600000, 65381)
* filtragem revisada: (1600000, 81823)

In [10]:
df['emotion'].value_counts()

emotion
0    800000
1    800000
Name: count, dtype: int64

In [11]:
model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(f'''
               Treinamento\t|  Teste
Acurácia:    {accuracy_score(y_train, model.predict(X_train)):.11f}\t|  {accuracy_score(y_test, model.predict(X_test)):.11f}
Precisão[1]: {precision_score(y_train, model.predict(X_train), pos_label=1):.11f}\t|  {precision_score(y_test, model.predict(X_test), pos_label=1):.11f}
Precisão[0]: {precision_score(y_train, model.predict(X_train), pos_label=0):.11f}\t|  {precision_score(y_test, model.predict(X_test), pos_label=0):.11f}
Recall:      {recall_score(y_train, model.predict(X_train), pos_label=1):.11f}\t|  {recall_score(y_test, model.predict(X_test), pos_label=1):.11f}
TNR:         {recall_score(y_train, model.predict(X_train), pos_label=0):.11f}\t|  {recall_score(y_test, model.predict(X_test), pos_label=0):.11f}
F1-Score:    {f1_score(y_train, model.predict(X_train), pos_label=1):.11f}\t|  {f1_score(y_test, model.predict(X_test), pos_label=1):.11f}''')


               Treinamento	|  Teste
Acurácia:    0.77815089286	|  0.75710833333
Precisão[1]: 0.77776589525	|  0.75568047460
Precisão[0]: 0.77853733415	|  0.75854868071
Recall:      0.77901137372	|  0.75944865311
TNR:         0.77728994783	|  0.75477095661
F1-Score:    0.77838813627	|  0.75755987806


Acurácia caiu em 0.0007, não é nd dms. As métricas não melhoraram, mas pelo menos o custo computacional sim.

AutoML:

In [12]:
automl = AutoML(
    total_time_limit=5*60, 
    mode='Explain', 
    ml_task='binary_classification'
)

automl.fit(X_train.toarray(), y_train)
# ^ Se não usar o .toarray(), dá erro dizendo que precisa de dados densos;
#   mas se usar, o .toarray() tenta alocar 3.5TiB de memória pra fazer a conversão e dá erro
# > Update: dps da redução da quant. de tokens pelos filtros, 
#   o .toarray() ainda dá erro pq agr tenta alocar só 683GiB

automl_predict = automl.predict(X_test)

#model = MultinomialNB()
#model.fit(X_train, y_train)

#y_pred = model.predict(X_test)

print(f'Test Accuracy for AutoML: {accuracy_score(y_test, automl_predict)}')

automl.report()

MemoryError: Unable to allocate 683. GiB for an array with shape (1120000, 81823) and data type float64

LIME:

([tutorial p/ texto](https://marcotcr.github.io/lime/tutorials/Lime%20-%20basic%20usage%2C%20two%20class%20case.html))