In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

In [2]:
# Load the pre-cleaned dataset from the working directory and preview it.
data = pd.read_csv(filepath_or_buffer="clean.csv")
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,keyword,location,text,emergency,clean_text,tokenized,no_stopwords,lemmatized,body_len,sentiment
0,0,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,"['our', 'deeds', 'are', 'the', 'reason', 'of',...","['deeds', 'reason', 'earthquake', 'may', 'alla...","['deed', 'reason', 'earthquake', 'may', 'allah...",57,0.2732
1,1,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,"['forest', 'fire', 'near', 'la', 'ronge', 'sas...","['forest', 'fire', 'near', 'la', 'ronge', 'sas...","['forest', 'fire', 'near', 'la', 'ronge', 'sas...",32,-0.34
2,2,,,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,"['all', 'residents', 'asked', 'to', 'shelter',...","['residents', 'asked', 'shelter', 'place', 'no...","['resident', 'asked', 'shelter', 'place', 'not...",112,-0.296
3,3,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,"['13000', 'people', 'receive', 'wildfires', 'e...","['13000', 'people', 'receive', 'wildfires', 'e...","['13000', 'people', 'receive', 'wildfire', 'ev...",57,0.0
4,4,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,"['just', 'got', 'sent', 'this', 'photo', 'from...","['got', 'sent', 'photo', 'ruby', 'alaska', 'sm...","['got', 'sent', 'photo', 'ruby', 'alaska', 'sm...",72,0.0


# Vectorize the data

- Use TF-IDF to weight the prominent words within each sentence for classification.
- Turn each individual word into a feature.

In [3]:
# TF-IDF vectorization of the lemmatized text: every vocabulary term becomes
# one feature column and each row holds that document's TF-IDF weights.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split further down, so the vocabulary and IDF statistics also
# see the held-out rows; consider fitting on the training rows only.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(data['lemmatized'])

# Dense frame with one integer-labelled column per vocabulary term. Only the
# TF-IDF weights are used as features; body_len and sentiment are not added.
X_tfidf_feat = pd.DataFrame(X_tfidf.toarray())
X_tfidf_feat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20953,20954,20955,20956,20957,20958,20959,20960,20961,20962
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Split data into training & test sets

In [4]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [5]:
# Features (X) are the TF-IDF columns; the label (y) is the `emergency` flag.
# 30% of the rows are held out, stratified on the label, with a fixed seed.
X_features = X_tfidf_feat
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    data['emergency'],
    test_size=0.3,
    random_state=51,
    shuffle=True,
    stratify=data.emergency,
)

# Test different models

In [6]:
## function for printing results

def print_results(results):
    """Print the best parameters and the per-candidate scores of a search.

    Parameters
    ----------
    results : object exposing ``best_params_`` and ``cv_results_``
        ``cv_results_`` must provide the ``'mean_test_score'``,
        ``'std_test_score'`` and ``'params'`` entries.

    Each candidate is printed as ``mean (+/-2*std) for params`` with both
    numbers rounded to three decimals. Nothing is returned.
    """
    print(f'BEST PARAMS: {results.best_params_}\n')

    cv_summary = results.cv_results_
    candidates = zip(
        cv_summary['mean_test_score'],
        cv_summary['std_test_score'],
        cv_summary['params'],
    )
    for avg_score, score_std, candidate in candidates:
        spread = round(score_std * 2, 3)
        print(f'{round(avg_score, 3)} (+/-{spread}) for {candidate}')

### 1. Naive Bayes

In [7]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes baseline and predict on the held-out split.
gnb = GaussianNB()
gnb_model = gnb.fit(X_train, y_train)
y_pred = gnb_model.predict(X_test)

# Support-weighted averages over the classes; with this averaging the recall
# coincides with plain accuracy, which is why both numbers print the same.
precision, recall, fscore, support = score(y_test, y_pred, average='weighted')
accuracy = (y_pred == y_test).sum() / len(y_pred)

print('Precision:{} / Recall:{} / Accuracy:{}'.format(
    *(round(metric, 3) for metric in (precision, recall, accuracy))))

Precision:0.642 / Recall:0.608 / Accuracy:0.608


### 2. Support Vector Machine

In [8]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [9]:
# NOTE(review): this SVC grid search is disabled and no later cell depends on
# it (`cv` is reassigned before its next use). Presumably it was switched off
# because of its run time -- confirm, then either restore it or delete the cell.
# NOTE(review): in scikit-learn, `degree` only affects the 'poly' kernel, so
# crossing it with 'linear' and 'rbf' would repeat equivalent fits.
# svc = SVC()
# parameters = {
#     'kernel': ['linear', 'rbf', 'poly'],
#     'C': [0.1, 1, 10],
#     'degree': [1,2,3],
#     'probability': [True]
# }

# cv = GridSearchCV(svc, parameters, cv=5)
# cv.fit(X_train, y_train)

# print_results(cv)

### 3. Logistic Regression (SGD)

In [None]:
from sklearn.linear_model import SGDClassifier

# Logistic regression trained with SGD. The logistic loss was renamed from
# 'log' to 'log_loss' in scikit-learn 1.1 and the old spelling was removed in
# 1.3, so pick the name the installed version understands.
sk_major_minor = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
log_loss_name = 'log_loss' if sk_major_minor >= (1, 1) else 'log'

# Fixed seed so the search, and the estimator pickled from it, is reproducible.
lr = SGDClassifier(loss=log_loss_name, random_state=51)

# l1_ratio is only used by the elasticnet penalty, so it is searched only
# there; crossing it with 'l1'/'l2' would refit equivalent candidates.
alphas = [10**(-x) for x in range(7)]
parameters = [
    {'alpha': alphas, 'penalty': ['l1', 'l2']},
    {'alpha': alphas, 'penalty': ['elasticnet'], 'l1_ratio': [0.15, 0.25, 0.5, 0.75]},
]

# 5-fold cross-validated search; n_jobs=-1 matches the other searches in this notebook.
cv = GridSearchCV(lr, parameters, cv=5, n_jobs=-1)
cv.fit(X_train, y_train)

print_results(cv)

In [None]:
# Persist the best logistic-regression estimator found by the grid search.
# NOTE(review): the fitted TfidfVectorizer (`tfidf`) is not saved in the
# visible cells; without it new text cannot be turned into the features this
# model expects -- confirm it is persisted elsewhere.
Pkl_Filename = "LR_Model.pkl"
with open(Pkl_Filename, 'wb') as file:
    pickle.dump(cv.best_estimator_, file)

# Kept as the cell's last expression so the notebook actually renders it; a
# bare expression earlier in a cell is evaluated but never displayed.
cv.best_estimator_

### 4. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded so the cross-validated ranking below is reproducible between runs;
# an unseeded forest gives slightly different scores on every execution.
rf = RandomForestClassifier(random_state=51)
param = {'n_estimators': [10,50,100,150,300],
        'max_depth': [10,20,30,40,50,None]}

# 5-fold grid search over every n_estimators / max_depth pair, in parallel.
gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_train, y_train)

# Show all candidates, best mean cross-validation score first.
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)

In [None]:
# Train the random forest that gets pickled. Seeded so that re-running the
# notebook reproduces the same saved model.
# NOTE(review): n_estimators=50 / max_depth=40 are hard-coded, presumably
# copied from the grid-search ranking -- confirm they match its best result.
final_rf = RandomForestClassifier(n_estimators=50, max_depth=40, n_jobs=-1, random_state=51)
final_rf_model = final_rf.fit(X_train, y_train)

In [None]:
# Serialize the fitted random forest to a pickle file in the working directory.
Pkl_Filename = "RF_Model.pkl"
with open(Pkl_Filename, mode='wb') as file:
    pickle.dump(final_rf_model, file)

### 5. Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Seeded so the cross-validated ranking below is reproducible between runs.
gb = GradientBoostingClassifier(random_state=51)
param = {'n_estimators': [10,50,100,150,300],
        'max_depth': [3,7,11,15],
        'learning_rate': [0.01, 0.1, 1]}

# 5-fold grid search over every combination above, in parallel.
# NOTE: this rebinds `gs` / `gs_fit`; any earlier search stored under these
# names is no longer reachable after this cell runs.
gs = GridSearchCV(gb, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_train, y_train)

# Show all candidates, best mean cross-validation score first.
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)

In [None]:
# Train the gradient-boosting model that gets pickled. Seeded so that
# re-running the notebook reproduces the same saved model.
# NOTE(review): the hyperparameters are hard-coded, presumably copied from
# the grid-search ranking -- confirm they match its best result.
final_gb = GradientBoostingClassifier(n_estimators=100, max_depth=3, learning_rate=0.1,
                                      random_state=51)
final_gb_model = final_gb.fit(X_train, y_train)

In [None]:
# Serialize the fitted gradient-boosting model to a pickle file in the working directory.
Pkl_Filename = "GB_Model.pkl"
with open(Pkl_Filename, mode='wb') as file:
    pickle.dump(final_gb_model, file)