<a href="https://colab.research.google.com/github/philipayazi/Disaster_Tweets/blob/master/Disaster_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_punctuation,strip_multiple_whitespaces,remove_stopwords
import pandas as pd
from gensim.corpora import Dictionary
import numpy as np
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [0]:
# read data into pandas data frame
# train_df = pd.read_csv("nlp-getting-started/train.csv")
# df = pd.read_csv("nlp-getting-started/train.csv")

url = "https://github.com/philipayazi/Disaster_Tweets/raw/master/nlp-getting-started/train.csv"
train_df = pd.read_csv(url)

In [3]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [0]:
# clean tweets of white spaces, punctuations, stopwords, and make all letters lowercase
custom_filters = [lambda x: x.lower(),
                  strip_multiple_whitespaces,
                  strip_punctuation,
                  remove_stopwords]

def clean_string(row):
    return preprocess_string(row['text'], custom_filters)

In [5]:
# append cleaned tweets to dataframe
train_df['cleaned_tweets'] = train_df.apply(clean_string, axis=1)

train_df.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_tweets
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[deeds, reason, earthquake, allah, forgive]"
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,"[residents, asked, shelter, place, notified, o..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13, 000, people, receive, wildfires, evacuati..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[got, sent, photo, ruby, alaska, smoke, wildfi..."


In [0]:
cleaned_tweets_lst = train_df['cleaned_tweets'].to_list()

In [0]:
cleaned_tweets_lst

In [0]:
cleaned_tweets_lst = [' '.join(cleaned_tweet) for cleaned_tweet in cleaned_tweets_lst]

In [0]:
# convert cleaned_tweets column to dictionary
dct = Dictionary(train_df.cleaned_tweets)

In [0]:
# convert dictionary to matrix
gen_corpus = [dct.doc2bow(line) for line in train_df.cleaned_tweets]

In [10]:
gen_corpus[:5]

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
 [(5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)],
 [(11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 2),
  (18, 1),
  (19, 2)],
 [(12, 1), (16, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)],
 [(25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1)]]

In [0]:
# create model
from gensim.models import TfidfModel
# td_model = TfidfModel(gen_corpus)

In [0]:
# model

In [0]:
# fit model
# vector = model[corpus]

In [0]:
# print(vector)

In [0]:
# vector_0 = model[corpus[0]]

In [0]:
# vector_1 = model[corpus[1]]

In [0]:
# The above code does not seem to be correct because it is treating redundent words as unique. Create dictionary of
# frequency counts of unique words
from collections import defaultdict
frequency = defaultdict(int)
for text in train_df.cleaned_tweets:
    for token in text:
        frequency[token] += 1

In [0]:
# convert default dictionary to dictionary
frequency = dict(frequency)

# The data above has been trained on the entire data set. We need to split the data into train and test

In [0]:
# split data into train and test sets
train_words, test_words = train_test_split(train_df['cleaned_tweets'], test_size=0.3)

In [0]:
train_words_dct = Dictionary(train_words)

In [0]:
train_corpus = [train_words_dct.doc2bow(line) for line in train_words]

In [0]:
train_model = TfidfModel(train_corpus)

In [0]:
# model[corpus[0]]

# All of these analyses are on dense vectors. We need to analyze sparse vectors

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [0]:
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(cleaned_tweets_lst, train_df['target'], test_size = 0.3, random_state = 42)

In [0]:
train_corpus = X_train_split

train_vectorizer = CountVectorizer()

X_train = train_vectorizer.fit_transform(train_corpus)

In [0]:
X_train_split

In [15]:
X_train

<5329x16712 sparse matrix of type '<class 'numpy.int64'>'
	with 51307 stored elements in Compressed Sparse Row format>

In [0]:
test_corpus = X_test_split

X_test = train_vectorizer.transform(test_corpus)

In [17]:
X_train_shaped = X_train.shape

X_train_shaped

(5329, 16712)

In [18]:
X_test_shaped = X_test.shape

X_test_shaped

(2284, 16712)

In [0]:
train_vectorizer.vocabulary_

In [0]:
feature_names = train_vectorizer.get_feature_names()

In [0]:
sparse_vectors = X_train.toarray()

In [22]:
sparse_vectors[0]

array([0, 0, 0, ..., 0, 0, 0])

# Logistic Regression

In [0]:
from sklearn.linear_model import LogisticRegression

In [0]:
model = LogisticRegression().fit(X_train, y_train_split)

In [0]:
model.score(X_train, y_train_split)

0.9711015199849878

In [0]:
model.predict(X_test)

array([0, 0, 1, ..., 1, 1, 0])

In [0]:
model.score(X_test, y_test_split)

0.7981611208406305

# Random Forest

In [0]:
from sklearn.ensemble import RandomForestClassifier

In [0]:
rf_model = RandomForestClassifier().fit(X_train, y_train_split)

In [0]:
rf_train_score = rf_model.score(X_train, y_train_split)

rf_test_score = rf_model.score(X_train, y_train_split)

print('test score:', rf_test_score)
print(classification_report(y_test_split, rf_model.predict(X_test)))

test score: 0.9971852129855507
              precision    recall  f1-score   support

           0       0.75      0.91      0.83      1318
           1       0.83      0.60      0.69       966

    accuracy                           0.78      2284
   macro avg       0.79      0.75      0.76      2284
weighted avg       0.79      0.78      0.77      2284



# Word Embeddings (Word2Vec & GloVe)

In [0]:
from gensim.models import Word2Vec

In [0]:
trained_W2V_model = Word2Vec(train_corpus, size=50, min_count=1, window=5)

In [0]:
train_corpus[:5]

['ashes 2015 australia\x89ûªs collapse trent bridge worst history england bundled australia 60 http t t5trhjuau0',
 'great michigan technique camp b1g thanks bmurph1019 hail youtsey termn8r13 goblue wrestleon http t oaskgki6qj',
 'cnn tennessee movie theater shooting suspect killed police http t di8elzswnr',
 'rioting couple hours left class',
 'crack path wiped morning beach run surface wounds left elbow right knee http t yaqrsximph']

In [0]:
trained_W2V_model.most_similar('ashes', topn=5)

  """Entry point for launching an IPython kernel.


KeyError: ignored

In [0]:
y_train_split.shape

(5329,)

In [0]:
X_train.shape

(5329, 16712)

In [0]:
LogisticRegression().fit(trained_W2V_model, y_train_split)

ValueError: ignored

#Using SKLearn with Gensim

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# !ls "/content/drive/My Drive"

In [0]:
from gensim.sklearn_api import W2VTransformer

In [0]:
# Load Google's pre-trained Word2Vec model.
# from gensim import models
# embedding = models.KeyedVectors.load_word2vec_format('/content/drive/My Drive/GoogleNews-vectors-negative300.bin.gz', binary=True)  

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# embedding['queen']

In [0]:
W2V_model = W2VTransformer(size = 300, min_count=1, iter=1)

In [0]:
# wordvecs = W2V_model.fit(embedding).transform(cleaned_tweets_lst)

TypeError: ignored

# Switching to SpaCy

In [0]:
import spacy
import gensim.downloader as api
from gensim.test.utils import get_tmpfile
from gensim.models import Word2Vec

In [0]:
# !python -m spacy download en_core_web_lg

In [0]:
nlp = spacy.load("en_core_web_lg")

In [46]:
train_corpus[:3]

['ashes 2015 australia\x89ûªs collapse trent bridge worst history england bundled australia 60 http t t5trhjuau0',
 'great michigan technique camp b1g thanks bmurph1019 hail youtsey termn8r13 goblue wrestleon http t oaskgki6qj',
 'cnn tennessee movie theater shooting suspect killed police http t di8elzswnr']

In [0]:
cleaned_tweets_lst.to_string()

In [0]:
train_df['corpus'] = cleaned_tweets_lst

In [29]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_tweets,corpus
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[deeds, reason, earthquake, allah, forgive]",deeds reason earthquake allah forgive
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, near, la, ronge, sask, canada]",forest near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,"[residents, asked, shelter, place, notified, o...",residents asked shelter place notified officer...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13, 000, people, receive, wildfires, evacuati...",13 000 people receive wildfires evacuation ord...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[got, sent, photo, ruby, alaska, smoke, wildfi...",got sent photo ruby alaska smoke wildfires pou...


In [0]:
# train each tweet in dataframe with model
# tweet_tokens = []
# for tweet in train_df['corpus']:
#   tweet_tokens.append(nlp(tweet))

train_tweet_tokens = []
for tweet in train_corpus:
  train_tweet_tokens.append(nlp(tweet))

In [49]:
train_tweet_tokens [:5]

[ashes 2015 australiaûªs collapse trent bridge worst history england bundled australia 60 http t t5trhjuau0,
 great michigan technique camp b1g thanks bmurph1019 hail youtsey termn8r13 goblue wrestleon http t oaskgki6qj,
 cnn tennessee movie theater shooting suspect killed police http t di8elzswnr,
 rioting couple hours left class,
 crack path wiped morning beach run surface wounds left elbow right knee http t yaqrsximph]

In [0]:
# tweet_vectors = []
# for token in tweet_tokens:
#   tweet_vectors.append(token.vector)

train_tweet_vectors = []
for token in train_tweet_tokens:
  train_tweet_vectors.append(token.vector)

In [0]:
train_tweet_vectors[:2]

In [52]:
len(train_tweet_vectors)

5329

In [0]:
y_train_split.shape

(5329,)

#Feature: Word2Vec, Classifier: Logistic Regression

In [0]:
w2v_lr = LogisticRegression(n_jobs=-1).fit(train_tweet_vectors, y_train_split)

In [83]:
w2v_lr.score(train_tweet_vectors, y_train_split)

0.8166635391255395

In [0]:
# Now run model on test

In [85]:
len(test_corpus)

2284

In [0]:
test_tweet_tokens = []
for tweet in test_corpus:
  test_tweet_tokens.append(nlp(tweet))

In [0]:
test_tweet_tokens[:5]

In [0]:
test_tweet_vectors = []
for token in test_tweet_tokens:
  test_tweet_vectors.append(token.vector)

In [0]:
w2v_lr_test_score = w2v_lr.score(test_tweet_vectors, y_test_split)

In [88]:
print('test score = ', w2v_lr_test_score)
print(classification_report(y_test_split, w2v_lr.predict(test_tweet_vectors)))

test score =  0.7933450087565674
              precision    recall  f1-score   support

           0       0.81      0.84      0.82      1318
           1       0.77      0.73      0.75       966

    accuracy                           0.79      2284
   macro avg       0.79      0.78      0.79      2284
weighted avg       0.79      0.79      0.79      2284



#Feature: Word2Vec, Classifier: Random Forest

In [0]:
from sklearn.ensemble import RandomForestClassifier

In [0]:
w2v_rf = RandomForestClassifier(n_jobs=-1).fit(train_tweet_vectors, y_train_split)

In [79]:
w2v_rf_train_score = w2v_rf.score(train_tweet_vectors, y_train_split)
w2v_rf_train_score

0.9885531994745731

In [80]:
w2v_rf_test_score  = w2v_rf.score(test_tweet_vectors, y_test_split)
w2v_rf_test_score

0.8051663747810858

In [81]:
print('test score = ', w2v_rf_test_score)
print(classification_report(y_test_split, w2v_rf.predict(test_tweet_vectors)))

test score =  0.8051663747810858
              precision    recall  f1-score   support

           0       0.79      0.90      0.84      1318
           1       0.84      0.67      0.74       966

    accuracy                           0.81      2284
   macro avg       0.81      0.79      0.79      2284
weighted avg       0.81      0.81      0.80      2284



#Feature: Word2Vec, Classifier: XGBoost

In [0]:
import xgboost as xgb

In [0]:
w2v_xgb = xgb.XGBClassifier(objective="binary:logistic", n_jobs=-1, random_state= 42)

In [0]:
train_tweet_vectors = np.asarray(train_tweet_vectors)
w2v_xgb = w2v_xgb.fit(train_tweet_vectors, y_train_split)

In [100]:
w2v_xgb_train_score = w2v_xgb.score(train_tweet_vectors, y_train_split)
w2v_xgb_train_score

0.8722086695440046

In [103]:
w2v_xgb_test_score = w2v_xgb.score(test_tweet_vectors, y_test_split)
w2v_xgb_test_score

0.8169877408056042

In [104]:
print('test score = ', w2v_xgb_test_score)
print(classification_report(y_test_split, w2v_xgb.predict(test_tweet_vectors)))

test score =  0.8169877408056042
              precision    recall  f1-score   support

           0       0.82      0.87      0.85      1318
           1       0.81      0.74      0.77       966

    accuracy                           0.82      2284
   macro avg       0.82      0.81      0.81      2284
weighted avg       0.82      0.82      0.82      2284



# Starting over with Word2Vec

In [0]:
import gensim.downloader as api
from gensim.test.utils import get_tmpfile
from gensim.models import Word2Vec

In [0]:
corpus = api.load('wiki-english-20171001')



KeyboardInterrupt: ignored

In [0]:
corpus

NameError: ignored

In [0]:
model_w2v = Word2Vec(corpus)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


KeyboardInterrupt: ignored

In [0]:
text8 = api.load('text8')



In [0]:
text8_vecs = W2V_model.fit(text8).transform(X_train)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


TypeError: ignored