## Load Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_punctuation,strip_multiple_whitespaces,remove_stopwords
from gensim.corpora import Dictionary

import xgboost as xgb

## Load Tweet Dataset and Clean it 

In [2]:
# read data into pandas data frame
initial_df = pd.read_csv("nlp-getting-started/train.csv")
df = pd.read_csv("nlp-getting-started/train.csv")

initial_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# clean tweets of white spaces, punctuations, stopwords, and make all letters lowercase
custom_filters = [lambda x: x.lower(),
                  strip_multiple_whitespaces,
                  strip_punctuation,
                  remove_stopwords]

def clean_string(row):
    return preprocess_string(row['text'], custom_filters)

In [4]:
# append cleaned tweets to dataframe
initial_df['cleaned_tweets'] = initial_df.apply(clean_string, axis=1)

initial_df.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_tweets
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[deeds, reason, earthquake, allah, forgive]"
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,"[residents, asked, shelter, place, notified, o..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13, 000, people, receive, wildfires, evacuati..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[got, sent, photo, ruby, alaska, smoke, wildfi..."


In [5]:
cleaned_tweets_lst = initial_df['cleaned_tweets'].to_list()
cleaned_tweets_lst = [' '.join(cleaned_tweet) for cleaned_tweet in cleaned_tweets_lst]
cleaned_tweets_lst

['deeds reason earthquake allah forgive',
 'forest near la ronge sask canada',
 'residents asked shelter place notified officers evacuation shelter place orders expected',
 '13 000 people receive wildfires evacuation orders california',
 'got sent photo ruby alaska smoke wildfires pours school',
 'rockyfire update california hwy 20 closed directions lake county cafire wildfires',
 'flood disaster heavy rain causes flash flooding streets manitou colorado springs areas',
 'm hill woods',
 's emergency evacuation happening building street',
 'm afraid tornado coming area',
 'people died heat wave far',
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck flooding',
 'raining flooding florida tampabay tampa 18 19 days ve lost count',
 'flood bago myanmar arrived bago',
 'damage school bus 80 multi car crash breaking',
 's man',
 'love fruits',
 'summer lovely',
 'car fast',
 'goooooooaaaaaal',
 'ridiculous',
 'london cool',
 'love skiing',
 'wonderful day',


## Split data into TRAIN and TEST sets

In [6]:
X_train_list, X_test_list, y_train_list, y_test_list = train_test_split(cleaned_tweets_lst, initial_df['target'], test_size = 0.3, random_state = 42)

In [7]:
print(type(X_train_list))
print(X_train_list)

<class 'list'>


In [8]:
print(type(y_train_list))
print(y_train_list)

<class 'pandas.core.series.Series'>
1186    0
4071    1
5461    1
5787    1
7445    0
       ..
5226    0
5390    0
860     0
7603    1
7270    1
Name: target, Length: 5329, dtype: int64


## Feature Engieering via "CountVectorizer"  

- From List to Matrix via "CountVectorizer" 

- List shape: N_tweets x 1
- Matrix shape: N_tweets x N_WordsInCorpus

In [9]:
train_vectorizer = CountVectorizer()
Xcv_train_matrix = train_vectorizer.fit_transform(X_train_list)
Xcv_test_matrix = train_vectorizer.transform(X_test_list)

print('Train matrix shape', Xcv_train_matrix.shape)
print('Test matrix shape', Xcv_test_matrix.shape)

Train matrix shape (5329, 16711)
Test matrix shape (2284, 16711)


In [10]:
feature_names = train_vectorizer.get_feature_names()
#print(feature_names)
train_vectorizer.vocabulary_


{'ashes': 1698,
 '2015': 276,
 'australia': 1820,
 'ûªs': 16682,
 'collapse': 3475,
 'trent': 14934,
 'bridge': 2602,
 'worst': 16118,
 'history': 7013,
 'england': 5146,
 'bundled': 2725,
 '60': 668,
 'http': 7186,
 't5trhjuau0': 14250,
 'great': 6516,
 'michigan': 9567,
 'technique': 14397,
 'camp': 2890,
 'b1g': 1899,
 'thanks': 14505,
 'bmurph1019': 2428,
 'hail': 6708,
 'youtsey': 16475,
 'termn8r13': 14449,
 'goblue': 6406,
 'wrestleon': 16152,
 'oaskgki6qj': 10508,
 'cnn': 3423,
 'tennessee': 14435,
 'movie': 9878,
 'theater': 14517,
 'shooting': 13254,
 'suspect': 14160,
 'killed': 8357,
 'police': 11389,
 'di8elzswnr': 4387,
 'rioting': 12479,
 'couple': 3743,
 'hours': 7150,
 'left': 8714,
 'class': 3350,
 'crack': 3778,
 'path': 11031,
 'wiped': 16001,
 'morning': 9830,
 'beach': 2108,
 'run': 12708,
 'surface': 14136,
 'wounds': 16126,
 'elbow': 5010,
 'right': 12463,
 'knee': 8418,
 'yaqrsximph': 16376,
 'experts': 5382,
 'france': 5953,
 'begin': 2153,
 'examining': 5329,

In [11]:
sparse_vectors = Xcv_train_matrix.toarray()

Tweet_nbr =0;
Word_nbr =1820;  #e.g., 1820 = 'australia'

sparse_vectors[0,1820]  # we get ones (or twos) at the right place

2

## Features: CountVectorizer, Classifier: Logistic Regression

In [12]:
model_CV_LR =LogisticRegression().fit(Xcv_train_matrix, y_train_list)

CV_LR_score_train =model_CV_LR.score(Xcv_train_matrix, y_train_list)
#model_LR.predict(Xcv_test_matrix)
CV_LR_score_test =model_CV_LR.score(Xcv_test_matrix, y_test_list)

print('score for test data:', CV_LR_score_test)
print(classification_report(y_test_list, model_CV_LR.predict(Xcv_test_matrix)) )

score for test data: 0.7981611208406305
              precision    recall  f1-score   support

           0       0.80      0.87      0.83      1318
           1       0.80      0.70      0.75       966

    accuracy                           0.80      2284
   macro avg       0.80      0.79      0.79      2284
weighted avg       0.80      0.80      0.80      2284



## Features: TFIDF,  Classifier: Logistic Regression

In [13]:
train_vectorizer = TfidfVectorizer()

In [14]:
Xtf_train_matrix = train_vectorizer.fit_transform(X_train_list)
Xtf_test_matrix = train_vectorizer.transform(X_test_list)

print('Train matrix shape', Xtf_train_matrix.shape)
print('Test matrix shape', Xtf_test_matrix.shape)

Train matrix shape (5329, 16711)
Test matrix shape (2284, 16711)


In [15]:
Xtf_train_matrix[0,1820]

Xtf_train_matrix[0,276]

0.19644613494912055

In [16]:
feature_names = train_vectorizer.get_feature_names()
## print(feature_names)
train_vectorizer.vocabulary_

{'ashes': 1698,
 '2015': 276,
 'australia': 1820,
 'ûªs': 16682,
 'collapse': 3475,
 'trent': 14934,
 'bridge': 2602,
 'worst': 16118,
 'history': 7013,
 'england': 5146,
 'bundled': 2725,
 '60': 668,
 'http': 7186,
 't5trhjuau0': 14250,
 'great': 6516,
 'michigan': 9567,
 'technique': 14397,
 'camp': 2890,
 'b1g': 1899,
 'thanks': 14505,
 'bmurph1019': 2428,
 'hail': 6708,
 'youtsey': 16475,
 'termn8r13': 14449,
 'goblue': 6406,
 'wrestleon': 16152,
 'oaskgki6qj': 10508,
 'cnn': 3423,
 'tennessee': 14435,
 'movie': 9878,
 'theater': 14517,
 'shooting': 13254,
 'suspect': 14160,
 'killed': 8357,
 'police': 11389,
 'di8elzswnr': 4387,
 'rioting': 12479,
 'couple': 3743,
 'hours': 7150,
 'left': 8714,
 'class': 3350,
 'crack': 3778,
 'path': 11031,
 'wiped': 16001,
 'morning': 9830,
 'beach': 2108,
 'run': 12708,
 'surface': 14136,
 'wounds': 16126,
 'elbow': 5010,
 'right': 12463,
 'knee': 8418,
 'yaqrsximph': 16376,
 'experts': 5382,
 'france': 5953,
 'begin': 2153,
 'examining': 5329,

In [17]:
model_TF_LR =LogisticRegression().fit(Xtf_train_matrix, y_train_list)

In [18]:
TF_LR_score_train =model_TF_LR.score(Xtf_train_matrix, y_train_list)

TF_LR_score_test =model_TF_LR.score(Xtf_test_matrix, y_test_list)

print('score for test data:', TF_LR_score_test)
print(classification_report(y_test_list, model_TF_LR.predict(Xtf_test_matrix)) )

score for test data: 0.7990367775831874
              precision    recall  f1-score   support

           0       0.80      0.88      0.83      1318
           1       0.81      0.69      0.74       966

    accuracy                           0.80      2284
   macro avg       0.80      0.78      0.79      2284
weighted avg       0.80      0.80      0.80      2284



## Features: TFIDF,  Classifier: Randon Forest

In [19]:
model_TF_RF =RandomForestClassifier().fit(Xtf_train_matrix, y_train_list)

In [20]:
TF_RF_score_train =model_TF_RF.score(Xtf_train_matrix, y_train_list)

TF_RF_score_test =model_TF_RF.score(Xtf_test_matrix, y_test_list)

print('score for test data:', TF_RF_score_test)
print(classification_report(y_test_list, model_TF_RF.predict(Xtf_test_matrix)) )

score for test data: 0.7907180385288967
              precision    recall  f1-score   support

           0       0.77      0.90      0.83      1318
           1       0.82      0.64      0.72       966

    accuracy                           0.79      2284
   macro avg       0.80      0.77      0.78      2284
weighted avg       0.80      0.79      0.79      2284



In [21]:
model_TF_RF.feature_importances_

array([1.76307748e-04, 4.60748141e-06, 1.07280788e-07, ...,
       3.55573785e-05, 1.03022154e-04, 3.81499096e-06])

In [22]:
feat_sortedlist =sorted(zip(model_TF_RF.feature_importances_, feature_names), reverse=True)

# top 10 features
feat_sortedlist[0:12]

[(0.02580620504542998, 'http'),
 (0.007805966040333038, 'hiroshima'),
 (0.0068372513135023325, 'fires'),
 (0.006446961644316657, 'california'),
 (0.005049010201850318, 'suicide'),
 (0.004967358977339859, 'killed'),
 (0.004878303955375317, 'buildings'),
 (0.004439094565908974, 'police'),
 (0.004420519791855518, 'wildfire'),
 (0.004354305768179102, 'storm'),
 (0.004051751876106769, 'train'),
 (0.0040174188805683604, 'disaster')]

## Features: TFIDF,  Classifier: XGBooster

In [23]:
model_TF_XGB = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
model_TF_XGB.fit(Xtf_train_matrix, y_train_list)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [24]:
y_pred = model_TF_XGB.predict(Xtf_test_matrix)
#print(confusion_matrix(y_test_list, y_pred))

TF_XGB_score_train =model_TF_XGB.score(Xtf_train_matrix, y_train_list)
TF_XGB_score_test =model_TF_XGB.score(Xtf_test_matrix, y_test_list)

print('score for test data:', TF_XGB_score_test)
print(classification_report(y_test_list, model_TF_XGB.predict(Xtf_test_matrix)) )

score for test data: 0.7683887915936952
              precision    recall  f1-score   support

           0       0.75      0.90      0.82      1318
           1       0.81      0.59      0.68       966

    accuracy                           0.77      2284
   macro avg       0.78      0.74      0.75      2284
weighted avg       0.77      0.77      0.76      2284



## Features: CountVectorizer,  Classifier: XGBooster

In [25]:
model_CV_XGB = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
model_CV_XGB.fit(Xcv_train_matrix, y_train_list)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [26]:
y_pred = model_TF_XGB.predict(Xcv_test_matrix)
#print(confusion_matrix(y_test_list, y_pred))

CV_XGB_score_train =model_CV_XGB.score(Xcv_train_matrix, y_train_list)
CV_XGB_score_test =model_CV_XGB.score(Xcv_test_matrix, y_test_list)

print('score for test data:', CV_XGB_score_test)
print(classification_report(y_test_list, model_CV_XGB.predict(Xcv_test_matrix)) )

score for test data: 0.7850262697022767
              precision    recall  f1-score   support

           0       0.77      0.90      0.83      1318
           1       0.83      0.62      0.71       966

    accuracy                           0.79      2284
   macro avg       0.80      0.76      0.77      2284
weighted avg       0.79      0.79      0.78      2284



## Features: CountVectorizer,  Classifier: Randon Forest

In [27]:
model_CV_RF =RandomForestClassifier().fit(Xcv_train_matrix, y_train_list)

In [28]:
CV_RF_score_train =model_CV_RF.score(Xcv_train_matrix, y_train_list)

CV_RF_score_test =model_CV_RF.score(Xcv_test_matrix, y_test_list)

print('score for test data:', CV_RF_score_test)
print(classification_report(y_test_list, model_CV_RF.predict(Xcv_test_matrix)) )

score for test data: 0.7784588441330998
              precision    recall  f1-score   support

           0       0.75      0.92      0.83      1318
           1       0.84      0.59      0.69       966

    accuracy                           0.78      2284
   macro avg       0.80      0.75      0.76      2284
weighted avg       0.79      0.78      0.77      2284

