## Load Imports

In [29]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical

from keras import models
from keras import regularizers
from keras import layers

# from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# from sklearn.metrics import confusion_matrix

import gensim

from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_punctuation,strip_multiple_whitespaces,remove_stopwords
from gensim.corpora import Dictionary

# import xgboost as xgb

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

## Load Tweet Dataset and Clean it 

In [13]:
# Read the training tweets into a pandas DataFrame.
# The path is relative to the notebook's working directory.
initial_df = pd.read_csv("nlp-getting-started/train.csv")
initial_df.head()

# Alternative for Google Colab, where the local file is not available:
# load the CSV from GitHub instead (presumably the same file - verify).
# url = "https://github.com/philipayazi/Disaster_Tweets/raw/master/nlp-getting-started/train.csv"
# initial_df = pd.read_csv(url)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [14]:
# Cleaning pipeline, applied in this order: lowercase the text, collapse
# repeated whitespace, strip punctuation, drop stopwords.
custom_filters = [
    str.lower,
    strip_multiple_whitespaces,
    strip_punctuation,
    remove_stopwords,
]


def clean_string(row):
    """Return the cleaned token list for one DataFrame row.

    The row's 'text' value is run through `custom_filters` with gensim's
    `preprocess_string`.
    """
    tweet_text = row['text']
    return preprocess_string(tweet_text, custom_filters)

In [15]:
# Clean every tweet row by row, then store the token lists in a new column.
cleaned_tweet_tokens = initial_df.apply(clean_string, axis=1)
initial_df['cleaned_tweets'] = cleaned_tweet_tokens
initial_df.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_tweets
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[deeds, reason, earthquake, allah, forgive]"
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,"[residents, asked, shelter, place, notified, o..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13, 000, people, receive, wildfires, evacuati..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[got, sent, photo, ruby, alaska, smoke, wildfi..."


In [16]:
# Re-join each token list into one space-separated string per tweet.
cleaned_tweets_lst = list(map(' '.join, initial_df['cleaned_tweets']))
cleaned_tweets_lst

['deeds reason earthquake allah forgive',
 'forest near la ronge sask canada',
 'residents asked shelter place notified officers evacuation shelter place orders expected',
 '13 000 people receive wildfires evacuation orders california',
 'got sent photo ruby alaska smoke wildfires pours school',
 'rockyfire update california hwy 20 closed directions lake county cafire wildfires',
 'flood disaster heavy rain causes flash flooding streets manitou colorado springs areas',
 'm hill woods',
 's emergency evacuation happening building street',
 'm afraid tornado coming area',
 'people died heat wave far',
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck flooding',
 'raining flooding florida tampabay tampa 18 19 days ve lost count',
 'flood bago myanmar arrived bago',
 'damage school bus 80 multi car crash breaking',
 's man',
 'love fruits',
 'summer lovely',
 'car fast',
 'goooooooaaaaaal',
 'ridiculous',
 'london cool',
 'love skiing',
 'wonderful day',


## Split data into TRAIN and TEST sets

In [17]:
# Hold out 30% of the tweets for testing; the fixed random_state keeps the split repeatable.
X_train_list, X_test_list, y_train_list, y_test_list = train_test_split(
    cleaned_tweets_lst,
    initial_df['target'],
    test_size=0.3,
    random_state=42,
)

## Feature Engineering via "CountVectorizer"

- From List to Matrix via "CountVectorizer" 

- List shape: N_tweets x 1
- Matrix shape: N_tweets x N_WordsInCorpus

In [18]:
# Bag-of-words features built with a default-configured CountVectorizer.
train_vectorizer = CountVectorizer()

# The vocabulary is fitted on the training tweets only ...
Xcv_train_matrix = train_vectorizer.fit_transform(X_train_list)
# ... and the test tweets are encoded with that already-fitted vocabulary (no refit).
Xcv_test_matrix = train_vectorizer.transform(X_test_list)

In [19]:
# Show the (n_tweets, n_vocabulary_terms) shape of each count matrix.
for shape_label, count_matrix in (
    ('Train matrix shape', Xcv_train_matrix),
    ('Test matrix shape', Xcv_test_matrix),
):
    print(shape_label, count_matrix.shape)

Train matrix shape (5329, 16711)
Test matrix shape (2284, 16711)


In [20]:
# Vocabulary terms of the fitted vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use `get_feature_names_out()` when available and fall back otherwise.
# `.tolist()` keeps `feature_names` a plain Python list on every version.
if hasattr(train_vectorizer, 'get_feature_names_out'):
    feature_names = train_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = train_vectorizer.get_feature_names()
#print(feature_names)
# Mapping token -> column index learned during fit.
train_vectorizer.vocabulary_

{'ashes': 1698,
 '2015': 276,
 'australia': 1820,
 'ûªs': 16682,
 'collapse': 3475,
 'trent': 14934,
 'bridge': 2602,
 'worst': 16118,
 'history': 7013,
 'england': 5146,
 'bundled': 2725,
 '60': 668,
 'http': 7186,
 't5trhjuau0': 14250,
 'great': 6516,
 'michigan': 9567,
 'technique': 14397,
 'camp': 2890,
 'b1g': 1899,
 'thanks': 14505,
 'bmurph1019': 2428,
 'hail': 6708,
 'youtsey': 16475,
 'termn8r13': 14449,
 'goblue': 6406,
 'wrestleon': 16152,
 'oaskgki6qj': 10508,
 'cnn': 3423,
 'tennessee': 14435,
 'movie': 9878,
 'theater': 14517,
 'shooting': 13254,
 'suspect': 14160,
 'killed': 8357,
 'police': 11389,
 'di8elzswnr': 4387,
 'rioting': 12479,
 'couple': 3743,
 'hours': 7150,
 'left': 8714,
 'class': 3350,
 'crack': 3778,
 'path': 11031,
 'wiped': 16001,
 'morning': 9830,
 'beach': 2108,
 'run': 12708,
 'surface': 14136,
 'wounds': 16126,
 'elbow': 5010,
 'right': 12463,
 'knee': 8418,
 'yaqrsximph': 16376,
 'experts': 5382,
 'france': 5953,
 'begin': 2153,
 'examining': 5329,

In [21]:
# Spot check: look up one cell of the document-term matrix.
# NOTE: despite its name, `sparse_vectors` is a dense ndarray copy produced by `.toarray()`.
sparse_vectors = Xcv_train_matrix.toarray()
Tweet_nbr = 0  # row: first training tweet
# Column of the token 'australia' (1820 for the fit shown above); looking it up
# keeps the check valid if the vocabulary, and therefore the index, changes.
Word_nbr = train_vectorizer.vocabulary_['australia']
sparse_vectors[Tweet_nbr, Word_nbr]  # we get ones (or twos) at the right place

2

## Features: CountVectorizer, Classifier: Logistic Regression

In [26]:
# Logistic regression trained on the bag-of-words count features.
model_CV_LR = LogisticRegression(n_jobs=-1)
model_CV_LR.fit(Xcv_train_matrix, y_train_list)

# Score the fitted model on both splits.
CV_LR_score_train = model_CV_LR.score(Xcv_train_matrix, y_train_list)
CV_LR_score_test = model_CV_LR.score(Xcv_test_matrix, y_test_list)

print('score for test data:', CV_LR_score_test)

# Per-class precision / recall / F1 on the held-out tweets.
CV_LR_test_predictions = model_CV_LR.predict(Xcv_test_matrix)
print(classification_report(y_test_list, CV_LR_test_predictions))

score for test data: 0.7981611208406305
              precision    recall  f1-score   support

           0       0.80      0.87      0.83      1318
           1       0.80      0.70      0.75       966

    accuracy                           0.80      2284
   macro avg       0.80      0.79      0.79      2284
weighted avg       0.80      0.80      0.80      2284



### Word2Vec - feature generation using Embeddings

In [27]:
# Tokenize each cleaned tweet into a list of words.
w2v_train = pd.DataFrame({'clean_text': X_train_list})
tmp_train = w2v_train['clean_text'].apply(word_tokenize)

w2v_test = pd.DataFrame({'clean_text': X_test_list})
tmp_test = w2v_test['clean_text'].apply(word_tokenize)

tmp_test

0           [new, weapon, cause, imaginable, destruction]
1       [f, amp, ing, things, gishwhes, got, soaked, d...
2       [dt, georgegalloway, rt, galloway4mayor, ûïth...
3       [aftershock, school, kick, great, want, thank,...
4       [response, trauma, children, addicts, develop,...
                              ...                        
2279    [new, cocktail, list, el, diablo, mas, verde, ...
2280    [bend, post, office, roofers, cut, gas, line, ...
2281    [monsoon, flooding, monsoon, rains, hit, india...
2282    [remember, massacre, civilians, hiroshima, htt...
2283    [liked, youtube, video, http, t, itnkbxgwlh, l...
Name: clean_text, Length: 2284, dtype: object

In [30]:
# Train a Word2Vec embedding on the tokenized training tweets only.
# NOTE(review): `size=` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` - confirm the installed version before upgrading.
SIZE = 25  # dimensionality of each word vector

modelW2V = gensim.models.Word2Vec(
    tmp_train,
    min_count=1,
    size=SIZE,
    window=5,
    workers=4,
)

In [31]:
# Query via the KeyedVectors (`.wv`): calling `most_similar` on the model itself
# is deprecated in gensim 3.x and removed in gensim 4.
modelW2V.wv.most_similar('boy', topn=3)

  """Entry point for launching an IPython kernel.


[('bomb', 0.9965035915374756),
 ('year', 0.9959594011306763),
 ('water', 0.9956170320510864)]

In [32]:
# Query via the KeyedVectors (`.wv`): calling `similarity` on the model itself
# is deprecated in gensim 3.x and removed in gensim 4.
modelW2V.wv.similarity('place', 'alabama')

  """Entry point for launching an IPython kernel.


0.9548828

In [33]:
# Query via the KeyedVectors (`.wv`): calling `similarity` on the model itself
# is deprecated in gensim 3.x and removed in gensim 4.
modelW2V.wv.similarity('michigan', 'ago')

  """Entry point for launching an IPython kernel.


0.7530632

In [34]:
def w2v_add_tweet_words(w2v_dict, tweet):
    """Return one dense vector for a tweet: the sum of its known word vectors.

    Parameters
    ----------
    w2v_dict : word-vector lookup (e.g. ``modelW2V.wv``)
        Must support ``word in w2v_dict``, ``w2v_dict[word]`` and expose
        ``vector_size``.
    tweet : iterable of str
        Tokens of a single tweet.

    Returns
    -------
    numpy.ndarray
        Element-wise sum of the vectors of all tokens found in `w2v_dict`;
        a zero vector of length ``w2v_dict.vector_size`` when no token is known
        (including an empty tweet).
    """
    # Membership is tested on the lookup itself rather than on `.vocab`,
    # which no longer exists in gensim 4.
    list_of_word_vectors = [w2v_dict[w] for w in tweet if w in w2v_dict]

    if not list_of_word_vectors:
        # Same type as the non-empty branch (ndarray), sized from the vectors
        # themselves instead of a notebook-level constant.
        return np.zeros(w2v_dict.vector_size)

    return np.sum(list_of_word_vectors, axis=0)

In [35]:
# Map each tokenized tweet to a single dense vector using the trained word vectors.
word_vectors = modelW2V.wv

X_train_w2v = tmp_train.apply(lambda tokens: w2v_add_tweet_words(word_vectors, tokens))
X_test_w2v = tmp_test.apply(lambda tokens: w2v_add_tweet_words(word_vectors, tokens))

In [36]:
# Stack the per-tweet vectors into 2-D matrices (one row per tweet) for the classifiers.
Xw2v_train_matrix = np.vstack(X_train_w2v.tolist())
print('shape of Train Matrix:', Xw2v_train_matrix.shape)

Xw2v_test_matrix = np.vstack(X_test_w2v.tolist())
print('shape of Test Matrix:', Xw2v_test_matrix.shape)

shape of Train Matrix: (5329, 25)
shape of Test Matrix: (2284, 25)


### W2V + LR

In [37]:
# Logistic regression trained on the dense Word2Vec tweet vectors.
model = LogisticRegression(n_jobs=-1)
model = model.fit(Xw2v_train_matrix, y_train_list)

In [38]:
# Score the Word2Vec + LR model on the training and test splits.
W2V_LR_score_train = model.score(X=Xw2v_train_matrix, y=y_train_list)
W2V_LR_score_test = model.score(X=Xw2v_test_matrix, y=y_test_list)

In [39]:
# Report the scores on both splits, then per-class metrics on the test split.
print('score for train data:', W2V_LR_score_train)
print('score for test data:', W2V_LR_score_test)

W2V_LR_test_predictions = model.predict(Xw2v_test_matrix)
print(classification_report(y_test_list, W2V_LR_test_predictions))

score for train data: 0.6301369863013698
score for test data: 0.6304728546409807
              precision    recall  f1-score   support

           0       0.64      0.81      0.72      1318
           1       0.60      0.38      0.47       966

    accuracy                           0.63      2284
   macro avg       0.62      0.60      0.59      2284
weighted avg       0.62      0.63      0.61      2284



# ML part

In [40]:
## define some constants needed for KERAS

...

In [41]:
# Split the cleaned tweet strings again for this section. The arguments (30% test,
# random_state=42) are identical to the earlier split cell, so the same names are
# rebound to the same partition.
X_train_list, X_test_list, y_train_list, y_test_list = train_test_split(cleaned_tweets_lst, initial_df['target'], test_size =0.3, random_state =42)