In [54]:
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_punctuation,strip_multiple_whitespaces,remove_stopwords
import pandas as pd
from gensim.corpora import Dictionary
import numpy as np
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [55]:
# read data into pandas data frame
train_df = pd.read_csv("nlp-getting-started/train.csv")
df = pd.read_csv("nlp-getting-started/train.csv")

In [56]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [57]:
# clean tweets of white spaces, punctuations, stopwords, and make all letters lowercase
custom_filters = [lambda x: x.lower(),
                  strip_multiple_whitespaces,
                  strip_punctuation,
                  remove_stopwords]

def clean_string(row):
    return preprocess_string(row['text'], custom_filters)

In [58]:
# append cleaned tweets to dataframe
train_df['cleaned_tweets'] = train_df.apply(clean_string, axis=1)

train_df.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_tweets
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[deeds, reason, earthquake, allah, forgive]"
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,"[residents, asked, shelter, place, notified, o..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13, 000, people, receive, wildfires, evacuati..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[got, sent, photo, ruby, alaska, smoke, wildfi..."


In [59]:
cleaned_tweets_lst = train_df['cleaned_tweets'].to_list()

In [60]:
cleaned_tweets_lst = [' '.join(cleaned_tweet) for cleaned_tweet in cleaned_tweets_lst]

In [61]:
# convert cleaned_tweets column to dictionary
dct = Dictionary(train_df.cleaned_tweets)

In [62]:
# convert dictionary to matrix
corpus = [dct.doc2bow(line) for line in train_df.cleaned_tweets]

In [63]:
# create model
from gensim.models import TfidfModel
model = TfidfModel(corpus)

In [64]:
model

<gensim.models.tfidfmodel.TfidfModel at 0x20b7a692408>

In [65]:
# fit model
vector = model[corpus]

In [66]:
print(vector)

<gensim.interfaces.TransformedCorpus object at 0x0000020B7A7404C8>


In [67]:
vector_0 = model[corpus[0]]

In [68]:
vector_1 = model[corpus[1]]

In [69]:
# The above code does not seem to be correct because it is treating redundent words as unique. Create dictionary of
# frequency counts of unique words
from collections import defaultdict
frequency = defaultdict(int)
for text in train_df.cleaned_tweets:
    for token in text:
        frequency[token] += 1

In [70]:
# convert default dictionary to dictionary
frequency = dict(frequency)

# The data above has been trained on the entire data set. We need to split the data into train and test

In [71]:
# split data into train and test sets
train_words, test_words = train_test_split(train_df['cleaned_tweets'], test_size=0.3)

In [72]:
train_words_dct = Dictionary(train_words)

In [73]:
train_corpus = [train_words_dct.doc2bow(line) for line in train_words]

In [74]:
train_model = TfidfModel(train_corpus)

In [75]:
model[corpus[0]]

[(0, 0.4321836454783851),
 (1, 0.5286228491374555),
 (2, 0.3304298084620385),
 (3, 0.5286228491374555),
 (4, 0.3809845206833893)]

# All of these analyses are on dense vectors. We need to analyze sparse vectors

In [76]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [77]:
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(cleaned_tweets_lst, train_df['target'], test_size = 0.3, random_state = 42)

In [78]:
train_corpus = X_train_split

train_vectorizer = CountVectorizer()

X_train = train_vectorizer.fit_transform(train_corpus)

In [79]:
test_corpus = X_test_split

X_test = train_vectorizer.transform(test_corpus)

In [80]:
X_train_shaped = X_train.shape

X_train_shaped

(5329, 16711)

In [81]:
X_test_shaped = X_test.shape

X_test_shaped

(2284, 16711)

In [82]:
train_vectorizer.vocabulary_

{'ashes': 1698,
 '2015': 276,
 'australia': 1820,
 'ûªs': 16682,
 'collapse': 3475,
 'trent': 14934,
 'bridge': 2602,
 'worst': 16118,
 'history': 7013,
 'england': 5146,
 'bundled': 2725,
 '60': 668,
 'http': 7186,
 't5trhjuau0': 14250,
 'great': 6516,
 'michigan': 9567,
 'technique': 14397,
 'camp': 2890,
 'b1g': 1899,
 'thanks': 14505,
 'bmurph1019': 2428,
 'hail': 6708,
 'youtsey': 16475,
 'termn8r13': 14449,
 'goblue': 6406,
 'wrestleon': 16152,
 'oaskgki6qj': 10508,
 'cnn': 3423,
 'tennessee': 14435,
 'movie': 9878,
 'theater': 14517,
 'shooting': 13254,
 'suspect': 14160,
 'killed': 8357,
 'police': 11389,
 'di8elzswnr': 4387,
 'rioting': 12479,
 'couple': 3743,
 'hours': 7150,
 'left': 8714,
 'class': 3350,
 'crack': 3778,
 'path': 11031,
 'wiped': 16001,
 'morning': 9830,
 'beach': 2108,
 'run': 12708,
 'surface': 14136,
 'wounds': 16126,
 'elbow': 5010,
 'right': 12463,
 'knee': 8418,
 'yaqrsximph': 16376,
 'experts': 5382,
 'france': 5953,
 'begin': 2153,
 'examining': 5329,

In [83]:
feature_names = train_vectorizer.get_feature_names()

In [84]:
sparse_vectors = X_train.toarray()

In [85]:
sparse_vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

# Logistic Regression

In [86]:
from sklearn.linear_model import LogisticRegression

In [87]:
model = LogisticRegression().fit(X_train, y_train_split)

In [88]:
model.score(X_train, y_train_split)

0.9711015199849878

In [89]:
model.predict(X_test)

array([0, 0, 1, ..., 1, 1, 0], dtype=int64)

In [90]:
model.score(X_test, y_test_split)

0.7981611208406305

# Random Forest

In [91]:
from sklearn.ensemble import RandomForestClassifier

In [92]:
rf_model = RandomForestClassifier().fit(X_train, y_train_split)

In [94]:
rf_train_score = rf_model.score(X_train, y_train_split)

rf_test_score = rf_model.score(X_train, y_train_split)

print('test score:', rf_test_score)
print(classification_report(y_test_split, rf_model.predict(X_test)))

test score: 0.9971852129855507
              precision    recall  f1-score   support

           0       0.75      0.91      0.82      1318
           1       0.83      0.59      0.69       966

    accuracy                           0.77      2284
   macro avg       0.79      0.75      0.76      2284
weighted avg       0.78      0.77      0.77      2284

