# Desafio Natural Language Processing with Disaster Tweets

In [None]:
import pandas as pd
import numpy as np
import zipfile
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import csr_matrix
from tqdm import tqdm

In [None]:
# Open the competition archive once and read both CSV splits straight out of it.
# `zf` is intentionally left open: a later cell reads 'sample_submission.csv' from it.
zf = zipfile.ZipFile('./data/nlp-getting-started.zip')
train, test = (pd.read_csv(zf.open(member)) for member in ('train.csv', 'test.csv'))

In [None]:
train.head(3)

In [None]:
test.head(3)

In [None]:
train.info()

In [None]:
test.info()

- A coluna 'keyword' tem potencial, já que tanto no treino quanto no teste há poucos dados faltantes
    - Preencher os valores faltantes com o label 'faltante'

In [None]:
word_tokenize(train['text'][200])

In [None]:
# NLTK's English stop-word list as a plain list. The vectorizer below does not
# receive it: it is configured with scikit-learn's built-in 'english' list instead.
stop_words_nltk = list(stopwords.words('english'))
count_vectorizer = CountVectorizer(stop_words='english')
# Learn the vocabulary from the raw training tweets and build the
# document-term count matrix in one step.
count_train = count_vectorizer.fit_transform(train['text'].values)

In [None]:
csr_matrix(count_train).toarray()

In [None]:
train['text'].values[0]

In [None]:
# Checando a tokenizacao

word_tokenize(train['text'][0])

## Tweet tokenizer

In [None]:
from nltk.tokenize import TweetTokenizer

def tweet_tokenize_column(df, column):
    """
        Tokenize a text column with NLTK's TweetTokenizer, turn every token
        into its lowercase form and drop English stopwords.

        The stopword test is applied to the lowercased token, so capitalised
        stopwords ("The", "AND", ...) are removed as well.

        Input: Pandas DataFrame, String (name of the column holding the texts)
        Return: Nested List (one list of lowercase, non-stopword tokens per row)
    """

    tweet_tokenizer = TweetTokenizer()

    # Read the stopword list once instead of once per token; a set gives
    # constant-time membership tests.
    english_stopwords = set(stopwords.words('english'))

    list_sent_no_stop = []
    for sent in df[column].values:
        # Lowercase first so the stopword comparison is case-insensitive.
        lowered = [token.lower() for token in tweet_tokenizer.tokenize(sent)]
        list_sent_no_stop.append(
            [token for token in lowered if token not in english_stopwords]
        )

    return list_sent_no_stop

In [None]:
tokenized_sent_train = tweet_tokenize_column(train,'text')
tokenized_sent_test = tweet_tokenize_column(test,'text')

In [None]:
tokenized_sent_train[:2]

In [None]:
tokenized_sent_test[:2]

In [None]:
tokenized_sent_all = tokenized_sent_train + tokenized_sent_test

Aplicando o TF-IDF nos datasets. Estes têm as seguintes características:
- Contêm somente palavras em letras minúsculas
- Não têm stopwords
- Foram tokenizados com o TweetTokenizer

In [None]:
# Helper used to bypass the vectorizer's tokenizer, since tokenization has already been done.
def identity_tokenizer(text):
    """Return the input unchanged."""
    return text

# One TF-IDF vocabulary is fitted on the train and test token lists together.
# The documents are already token lists, so the tokenizer is the identity
# function, and lowercase=False because the tokens were lowercased beforehand.
tfidf_all = TfidfVectorizer(tokenizer=identity_tokenizer, stop_words='english', lowercase=False)
tfidf_all_fit = tfidf_all.fit_transform(tokenized_sent_all)

# Peek at two entries of the learned vocabulary.
# NOTE(review): recent scikit-learn releases replaced get_feature_names() with
# get_feature_names_out() -- confirm which one the installed version provides.
tfidf_all.get_feature_names()[1000:1002]


In [None]:
tfidf_all.get_feature_names()

In [None]:
#       token1 token2 token3
# train1
# train2
# .
# .
# trainN
# test1
# test2
# .
# .
# testN

In [None]:
# print("TF-IDF DataFrame dimensions: {}\n".format(tfidf_train_fit.toarray().shape))
# print("TF-IDF Number or Features: {}\n".format(len(tfidf_train.get_feature_names())))

Faz sentido, já que o número de colunas do ``tfidf_train_fit`` corresponde ao número de tokens, e a contagem do ``tfidf_train.get_feature_names()`` também. 

In [None]:
tfidf_all_df = pd.DataFrame(tfidf_all_fit.toarray(), columns=tfidf_all.get_feature_names())

In [None]:
tfidf_all_df

In [None]:
# Split the combined TF-IDF frame back into its train and test rows. The
# training tweets were concatenated first, so they are the first len(train) rows.
# .copy() makes the training frame independent of tfidf_all_df, so that adding
# a column to it afterwards does not raise pandas' SettingWithCopyWarning.
tfidf_train_df = tfidf_all_df[:len(train)].copy()

tfidf_test_df = tfidf_all_df[len(train):]


In [None]:
tfidf_train_df["target_column"] = train['target']

In [None]:
tfidf_train_df['target_column']

In [None]:
# chi

In [None]:
# mi = mutual_info_classif(tfidf_train_df_int.drop("target_column", axis=1), tfidf_train_df_int["target_column"])
# mi = pd.Series(mi)
# mi.index = intersect_columns
# mi.sort_values(ascending=False, inplace=True) 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

# Features are every column except the label; the label is 'target_column'.
X = tfidf_train_df.drop("target_column", axis=1)
y = tfidf_train_df["target_column"]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=16)

clf = LogisticRegression(random_state=16)

# 5-fold cross-validation over the labelled data; yields one score per fold.
scores_logistic = cross_val_score(clf, X, y, cv=5)

In [None]:
scores_logistic.mean()

In [None]:
from sklearn.metrics import accuracy_score

# Fit on all labelled rows and predict those same rows: the number printed
# below is therefore the accuracy on the data the model was trained on.
clf.fit(X,y)

y_pred = clf.predict(X)

print('Training accuracy is {}'.format(accuracy_score(y, y_pred)))

In [None]:
# Submission

# The sample submission file is read from the same zip archive as the data.
sample_submission = pd.read_csv(zf.open('sample_submission.csv'))

# Predict a label for every row of the test TF-IDF frame.
y_sub = clf.predict(tfidf_test_df)

In [None]:
# Submission frame: a copy of the sample file with the predictions stored in
# 'target' and 'id' used as the index.
sub = sample_submission.assign(target=y_sub).set_index('id')

In [None]:
sub

In [None]:
sub.to_csv("./submissions/sub_01.csv")

# Selecao de atributos

In [None]:
from sklearn.feature_selection import mutual_info_classif, chi2

# chi2 returns (statistics, p-values): keep the statistics only, label each one
# with its feature name and order the features from highest to lowest score.
chi = pd.Series(chi2(X, y)[0], index=X.columns).sort_values(ascending=False)


In [None]:
chi

In [None]:
chi.to_csv("./data/chi.csv")

In [None]:
# atts = np.linspace(100,10000,100)
# list_scores = []
# list_var = []

# for att in tqdm(atts):
    
#     list_scores.append(cross_val_score(clf, X[chi[:int(att)].index], y, cv=3).mean())
#     list_var.append(cross_val_score(clf, X[chi[:int(att)].index], y, cv=3).var())

In [None]:
# int_atts = [int(att) for att in atts]

# import matplotlib.pyplot as plt
# import seaborn as sns

# sns.set()
# plt.figure(figsize=(14,7))
# sns.lineplot(y=list_scores, x=int_atts)
# # plt.axvline(x=int_atts[np.array(list_scores[5:]).argmax()+5], color='r')
# # plt.xticks(ticks=np.arange(0.00, 0.25, 0.01))
# plt.show()

In [None]:
# sns.set()
# plt.figure(figsize=(14,7))
# sns.lineplot(y=list_var, x=int_atts)
# # plt.axvline(x=int_atts[np.array(list_var[5:]).argmin()+5], color='r')
# # plt.xticks(ticks=np.arange(0.00, 0.25, 0.01))
# plt.show()

In [None]:
# atts = np.linspace(100,10000,100)
# list_scores_over = []

# for att in tqdm(atts):
#     clf.fit(X[chi[:int(att)].index],y)
#     y_pred = clf.predict(X[chi[:int(att)].index])
#     acc = accuracy_score(y, y_pred)
    
#     list_scores_over.append(acc)

In [None]:
# int_atts = [int(att) for att in atts]

# sns.set()
# plt.figure(figsize=(14,7))
# sns.lineplot(y=list_scores_over, x=int_atts)
# # plt.axvline(x=int_atts[np.array(list_scores[5:]).argmax()+5], color='r')
# # plt.xticks(ticks=np.arange(0.00, 0.25, 0.01))
# plt.show()

In [None]:
# Candidate feature counts: 100 evenly spaced values from 100 to 10000 (floats).
atts = np.linspace(100,10000,100)

# Hold out 20% of the labelled rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=16)



In [None]:
# list_scores_tts = []

# for att in tqdm(atts):
#     clf.fit(X_train[chi[:int(att)].index],y_train)
#     y_pred = clf.predict(X_test[chi[:int(att)].index])
#     acc = accuracy_score(y_test , y_pred)
    
#     list_scores_tts.append(acc)

In [None]:
# int_atts = [int(att) for att in atts]

# sns.set()
# plt.figure(figsize=(14,7))
# sns.lineplot(y=list_scores_tts, x=int_atts)
# # plt.axvline(x=int_atts[np.array(list_var[5:]).argmin()+5], color='r')
# # plt.xticks(ticks=np.arange(0.00, 0.25, 0.01))
# plt.show()

In [None]:
# int_atts = [int(att) for att in atts]
# int_atts[np.array(list_scores_atts).argmax()]

In [None]:
# Refit the classifier on the 3800 features with the highest chi2 score, using
# the training split only, then score it on the held-out split.
clf.fit(X_train[chi[:3800].index],y_train)
y_pred = clf.predict(X_test[chi[:3800].index])
acc = accuracy_score(y_test , y_pred)

In [None]:
y_sub_chi = clf.predict(tfidf_test_df[chi[:3800].index])

In [None]:
# Submission frame for the chi2-selected model: sample file rows with the
# predictions stored in 'target' and 'id' used as the index.
sub_chi = sample_submission.assign(target=y_sub_chi).set_index('id')

In [None]:
sub_chi.to_csv("./submissions/sub_chi.csv")

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with default settings, trained on the 3800
# features with the highest chi2 score from the training split.
clf_svc = SVC()
clf_svc.fit(X_train[chi[:3800].index],y_train)
y_pred = clf_svc.predict(X_test[chi[:3800].index])
acc = accuracy_score(y_test , y_pred)

# y_pred was produced from X_test, so this is the hold-out accuracy rather
# than the training accuracy.
print('Hold-out accuracy is {}'.format(acc))

In [None]:
clf_svc.fit(tfidf_train_df[chi[:3800].index],y)

In [None]:
y_sub_svc = clf_svc.predict(tfidf_test_df[chi[:3800].index])

In [None]:
# Submission frame for the TF-IDF SVC model, written out as CSV with 'id' as
# the index column.
sub_svc = sample_submission.assign(target=y_sub_svc).set_index('id')

sub_svc.to_csv("./submissions/sub_svc_overfit.csv")

In [None]:
# atts = [1000,3000,5000]
# list_scores_svc = []

# for att in tqdm(atts):
#     clf_svc.fit(X_train[chi[:int(att)].index],y_train)
#     y_pred = clf_svc.predict(X_test[chi[:int(att)].index])
#     acc = accuracy_score(y_test , y_pred)
    
#     list_scores_svc.append(acc)

# Word Vectors

In [None]:
import spacy 

# Load the 'en_core_web_lg' spaCy model.
nlp = spacy.load('en_core_web_lg')

# One document vector per tweet, stacked into a DataFrame (one row per tweet).
# NOTE(review): disable_pipes() is called without any pipe names -- confirm that
# this actually switches pipeline components off in the installed spaCy version.
with nlp.disable_pipes():
    train_vecs = pd.DataFrame(np.array([nlp(text).vector for text in train.text])) # doc vectors for training set
    test_vecs = pd.DataFrame(np.array([nlp(text).vector for text in test.text])) # doc vectors for testing set

In [None]:
# Rank the word-vector dimensions by their estimated mutual information with
# the target, highest first.
# The estimator is randomised, so the seed is fixed (same value used for the
# other models and splits in this notebook) to keep the ranking reproducible.
mi = mutual_info_classif(train_vecs, train.target, random_state=16)
mi = pd.Series(mi, index=train_vecs.columns).sort_values(ascending=False)

In [None]:
X_word_vec_train, X_word_vec_test, y_word_vec_train, y_word_vec_test = train_test_split(train_vecs, train.target.values, test_size=0.33, random_state=16)

In [None]:
svc = SVC()

# Feature counts to try: 1.0, 2.0, ..., 299.0 (floats, hence the int() below).
# This rebinds the notebook-level name `atts`.
atts = np.linspace(1, 299, 299)
list_scores_svc = []

# For each count k, train on the k dimensions with the highest mutual
# information and record the accuracy on the held-out split.
for att in tqdm(atts):
    svc.fit(X_word_vec_train[mi[:int(att)].index].values, y_word_vec_train)
    y_pred = svc.predict(X_word_vec_test[mi[:int(att)].index].values)
    acc = accuracy_score(y_word_vec_test , y_pred)
    
    list_scores_svc.append(acc)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Integer feature counts for the x axis (atts holds floats).
int_atts = [int(att) for att in atts]

sns.set()
plt.figure(figsize=(14,7))
sns.lineplot(y=list_scores_svc, x=int_atts)
# The x axis holds feature counts, not fractions: the former tick positions
# (0.00 to 0.24) did not belong to that range, so matplotlib's default tick
# locations are used instead.
plt.xlabel('Number of word-vector dimensions (ranked by mutual information)')
plt.ylabel('Hold-out accuracy')
plt.show()

In [None]:
# Refit on the document vectors of every labelled tweet and predict the test set.
svc.fit(train_vecs.values, train.target)
y_pred = svc.predict(test_vecs.values)

sub_svc = sample_submission.copy()
# Store the word-vector predictions computed just above. The earlier code
# assigned y_sub_svc here, which holds the predictions of the TF-IDF SVC model,
# so the file written below did not contain this model's output.
sub_svc['target'] = y_pred
sub_svc.set_index('id',inplace=True)

sub_svc.to_csv("./submissions/sub_svc_word_vec.csv")

In [None]:
# from sklearn.tree import DecisionTreeClassifier

# tree = DecisionTreeClassifier(criterion="entropy")

# scores_tree = cross_val_score(tree, X, y, cv=5)


In [None]:
# scores_tree.mean()

In [None]:
# X_tree = tfidf_df.loc[:,~(tfidf_df.columns == 'target_column')]
# Y_tree = tfidf_df.target_column

# accuracy_test = []

# kf = KFold(n_splits=10)

# X = X_tree.values
# y = Y_tree.values

# for train_index, test_index in kf.split(X_tree):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]
    
#     tree = DecisionTreeClassifier(criterion="entropy", ccp_alpha=0.001226509672564484)
    
#     tree.fit(X_train, y_train)
#     y_test_pred = tree.predict(X_test)
#     accuracy_test.append(metrics.accuracy_score(y_test, y_test_pred))


# st.t.interval(0.99, len(accuracy_test) - 1, loc=np.mean(accuracy_test), scale=st.sem(accuracy_test))

## To-Do
- Selecionar variaveis mais importantes (Chi^2 | Informacao Mutua)
- Selecionar colunas contendo essas variaveis tanto no treino quanto no teste
- Testar selecao de variaveis antes para todos os tokens do treino
- Testar outros modelos (SVC, NaiveBayes, RidgeClassifier, ...)
