# Preprossesing & Modeling

This notebook consists of various classification models to predict whether a tweet is related to an emergency situation or not during a wildfire. The goal is to choose best model that will generalize the information on unseen data with the highest accuracy score.

In [15]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer

import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

import regex as re

from bs4 import BeautifulSoup

In [16]:
import warnings
warnings.filterwarnings("ignore")

### Read the File

We have combined post-fire Twitter scrape files from Tick Fire, Saddleridge Fire, Kincade Fire, Maria Fire, Colorado fire occured in 2019. We have labeled tweets based on relevance to a natural disaster and combined them with pre-fire tweets within the same location to get 70/30 ratio of classes.

In [17]:
# read the file
labeled_df = pd.read_csv('./data/combined_tweets.csv')

### Data Cleaning 

In [18]:
labeled_df.drop('Unnamed: 0', axis=1, inplace=True)

In [19]:
labeled_df.head()

Unnamed: 0,target,text,time,sia_positive,sia_negative,sia_neutral,sia_compound
0,0,RT @JulianCastro: My grandmother was a domesti...,10/29/2019 22:56,0.0,0.0,1.0,0.0
1,1,RT @MayorOfLA: #GettyFire update | 8AM:\n\n- 5...,10/29/2019 22:56,0.0,0.0,1.0,0.0
2,1,Getty Fire Ignited by Power Line in Sepulveda ...,10/29/2019 22:56,0.0,0.211,0.789,-0.34
3,1,"RT @latimes: In an ominous new warning, the Na...",10/29/2019 22:56,0.0,0.202,0.798,-0.5859
4,1,Arson investigators from the Los Angeles Fire ...,10/29/2019 22:56,0.066,0.122,0.812,-0.2732


In [20]:
labeled_df.dtypes

target            int64
text             object
time             object
sia_positive    float64
sia_negative    float64
sia_neutral     float64
sia_compound    float64
dtype: object

In [21]:
labeled_df['target'].value_counts()

0    18719
1     5691
Name: target, dtype: int64

In [22]:
labeled_df.isnull().sum()

target          0
text            0
time            0
sia_positive    0
sia_negative    0
sia_neutral     0
sia_compound    0
dtype: int64

In [23]:
labeled_df['target'].value_counts(normalize=True)

0    0.766858
1    0.233142
Name: target, dtype: float64

In [24]:
custom_stop = list(ENGLISH_STOP_WORDS)
custom_stop.extend(["b'RT", "xe2", "x80", "x99", "xf0", "htpps", "xa6", "x9f", "Getty Center", "Los Angeles",
                    "Los Angele", "Getty", "x99m", "x99s", "Los", "Angele", "taco truck", "outfit", "taco", "truck",
                   "http", "https", "x94", "xa5", "nhttp", "nhttps", "b'", "Center", "amp", "GettyFire", "RT", "www",
                   "instagram", "xa6'b'RT", "xa6'b'", "xa6'RT", "xb8", "x9d", "xef", "x8f", 'http','colorado','co',
                    'springs','waldo','canyon','cofire','rt','boulder','waldocanyonfire', 'highparkfire','denverpost',
                    'denver','colo', 'tickfire', 'kincade', 'tick', 'kincadefire', 'getty', 'mariafire', 'saddleridge', 
                    'angele', 'angeles', 'center', 'gettyfire', 'los', 'sonoma', 'sonoma county'])

In [25]:
# define a function to clean text from html tags, non-leters, english stop words, custom stop words

def cleanup_lemmatize_text(document):
    # remove HTML
    text = BeautifulSoup(document).get_text()
    # remove non-letter characters
    letters = re.sub("[^a-zA-Z]", " ", document)    
    # all words lower case
    words = letters.lower().split()
    # remove stop words (english + custom)
    stops = set(custom_stop)
    clean_words = [word for word in words if word not in stops]
    # lemmatize cleaned up words
    lem_clean_words = [lemmatizer.lemmatize(token) for token in clean_words]
    
    # returned the cleand up string
    return(" ".join(clean_words))

In [26]:
# initialize an empty list to hold the clean text
clean_text = []

for text in labeled_df['text']:
#     convert title to words, then append to clean_text list
   clean_text.append(cleanup_lemmatize_text(text))

In [27]:
len(clean_text)

24410

In [28]:
labeled_df['text'] = clean_text

### Modeling

- Logistic Regression / Count Vectorizer (Only Text)
- Logistic Regression / Count Vectorizer (Text & Sentiment Scores)
- Logistic Regression / Tfidf Vectorizer (Text & Sentiment Scores)  
- Random Forest / Count Vectorizer (Only Text)

#### Model 1 - LR and CVEC (Text)

In [29]:
# define features variable(s) with only text column
X = labeled_df['text']

# define target variable
y = labeled_df['target']

In [30]:
# split the data into the training and testing sets
# set a random state for reproducibility 
# stratify y to combat slightly unbalanced classes
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    stratify=y,
                                                    random_state=42)

In [31]:
# Baseline accuracy on test data
y_test.value_counts(normalize=True)

0    0.766882
1    0.233118
Name: target, dtype: float64

In [32]:
# CountVectorizer (transformer) & LogisticRegression (estimator)

pipe1 = Pipeline([
        ('cvec', CountVectorizer(stop_words=custom_stop, max_features=120, ngram_range=(1,2))),
        ('logreg', LogisticRegression(solver = 'lbfgs'))
])

pipe1_params = {
  #  'cvec__max_features': [1200, 1800, 2000, 2500],
  #  'cvec__stop_words': ['custom_stop'],
  #  'cvec__ngram_range': [(1,1), (1,2), (1,3)],
    'cvec__min_df': [50, 100, 200],
    'cvec__max_df': [.55, .75, .85],
 #   'cvec__strip_accents': ['ascii'],
  #  'logreg__penalty': ['l1', 'l2'],
  #  'logreg__C': [2, 2.5, 3]
}

gs1 = GridSearchCV(pipe1,  # object to be optimized
                   pipe1_params, # parameter values to be searched
                   cv=5, # 5 folds
                   verbose = 1,
             #      n_jobs = -1
                  )

In [33]:
# fit gridsearch CV to train data
gs1.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:   24.9s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=120,
                                                        min_df=1,
                                                        ngram_range=(1, 2),
                                             

In [34]:
# Best score
gs1.best_score_

0.888039623333741

In [35]:
gs1.best_params_

{'cvec__max_df': 0.55, 'cvec__min_df': 100}

In [36]:
# Save the best model
gs1_model = gs1.best_estimator_

In [37]:
# Score best model on train set
print(f"Model_1 Train score: {gs1_model.score(X_train, y_train)}")
# Score best model on test set
print(f"Model_1 Test score: {gs1_model.score(X_test, y_test)}")

Model_1 Train score: 0.8922587746117158
Model_1 Test score: 0.8848063555114201


In [38]:
# Generate predictions
preds =gs1_model.predict(X_test)

In [39]:
X_test

8708     happy lightning mcqueen day apparently cars la...
6173     b preevergreen bf aries sun just setting build...
18558               artistcouture slid dms earlier morning
13741                                u long wow queue time
14541    saturnawards tallshipprods outlander starz sta...
                               ...                        
6022     b lafdchief following orders cooperating helps...
3910     b week started bang amazing projects worked co...
4498     b country running santa clarita threatening ho...
20871    promises promises kept thank potus making word...
7882                                     deserves u dallas
Name: text, Length: 8056, dtype: object

In [40]:
# Generate confusion matrix
confusion_matrix(y_test, # True values
                 preds)  # Predicted values

array([[5972,  206],
       [ 722, 1156]], dtype=int64)

In [41]:
confusion_matrix_df = pd.DataFrame(confusion_matrix(y_test,preds).ravel(), index=['TN', 'FP', 'FN', 'TP'], 
                      columns=['Count'])
confusion_matrix_df

Unnamed: 0,Count
TN,5972
FP,206
FN,722
TP,1156


In [42]:
text_features = gs1_model.named_steps['cvec'].get_feature_names()

In [43]:
gs1_coefs = gs1_model.named_steps['logreg'].coef_[0]

In [44]:
coef_dict = dict(zip(text_features, gs1_coefs))

In [45]:
coef_df = pd.DataFrame(coef_dict.items(), columns=['text_feature', 'coef'])

In [46]:
# Positive coefficients
coef_df.sort_values('coef', ascending=False).head(50)

Unnamed: 0,text_feature,coef
20,evacuation,3.73378
21,evacuations,3.325012
106,wildfire,2.826109
38,homes,2.818517
0,acres,2.676426
6,brush,2.482233
83,smoke,2.409133
58,near,2.378743
109,winds,2.358835
78,santa,2.282188


In [47]:
# Negative coefficients
coef_df.sort_values('coef').head(50)

Unnamed: 0,text_feature,coef
11,com,-3.529315
51,lmao,-1.865128
41,igshid,-1.809033
96,twitter com,-1.706517
52,lol,-1.688368
118,years,-1.354208
34,happy,-1.333582
50,ll,-1.286526
56,make,-1.235769
46,let,-1.130742


In [48]:
import pickle

In [49]:
pickle.dump(gs1_model, open('model.p', 'wb+'))

In [50]:
model_from_pickle = pickle.load(open('model.p', 'rb'))
model_from_pickle

Pipeline(memory=None,
         steps=[('cvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.55,
                                 max_features=120, min_df=100,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=['not', 'onto', 'part', 'often',
                                             'whenever', 'ever', 'etc', 'alone',
                                             'where', 'system', 'fifty'...
                                             'me', 'sincere', 'their', 'nobody', ...],
                                 strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('logreg',
   

In [51]:
# testing the model with sample words
model_from_pickle.predict(np.array(['fire wind help']))

array([1], dtype=int64)

#### Model 2 - LR and CVEC (Text & Numeric Data)

In [52]:
# set X and y
# this time we will include some numeric features to see their impact
X_union = labeled_df[['text', 'sia_positive', 'sia_negative','sia_neutral', 'sia_compound']]
y_union = labeled_df['target']

In [53]:
# split the data
X_union_train, X_union_test, y_union_train, y_union_test = train_test_split(X_union, y_union, test_size=0.33, 
                                                                               stratify=y, random_state=42)

In [54]:
# create functions to group features based on data type
def get_text(data):
    return data['text']

def get_numeric(data):
    return data[['sia_positive', 'sia_negative','sia_neutral', 'sia_compound']]

In [55]:
# create function transformer for text features
get_text_tf = FunctionTransformer(get_text, validate=False)
# create function transformer for numeric features
get_numeric_tf = FunctionTransformer(get_numeric, validate=False)

In [56]:
# create the new pipeline
pipe2 = Pipeline([
    # feature union
    ('union', FeatureUnion([
        # numeric
        ('numeric', get_numeric_tf),
        # text
        ('text', Pipeline([
            # extract text
            ('selector', get_text_tf),
            # vectorize
            ('cvec', CountVectorizer(stop_words=custom_stop, max_features=1200, ngram_range=(1,2)))
        ]))
    ])),
    # model
    ('logreg', LogisticRegression(penalty='l2', C=1, solver='liblinear'))
])

pipe2_params = {}

In [57]:
gs2 = GridSearchCV(pipe2, pipe2_params, cv=5)

In [58]:
gs2.fit(X_union_train, y_union_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('union',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('numeric',
                                                                        FunctionTransformer(accept_sparse=False,
                                                                                            check_inverse=True,
                                                                                            func=<function get_numeric at 0x0000018BE814E798>,
                                                                                            inv_kw_args=None,
                                                                                            inverse_func=None,
                                                                                            kw_args=None,
          

In [59]:
# Score the model on train set
print(f"Model_2 Train score: {gs2.score(X_union_train, y_union_train)}")
# Score the model on test set
print(f"Model_2 Test score: {gs2.score(X_union_test, y_union_test)}")

Model_2 Train score: 0.9356120826709062
Model_2 Test score: 0.9152184707050646


#### Model 3 - LR & Tfidf (Text & Numeric Data)

In [60]:
# create the new pipeline
pipe3 = Pipeline([
    # feature union
    ('union', FeatureUnion([
        # numeric
        ('numeric', get_numeric_tf),
        # text
        ('text', Pipeline([
            # extract text
            ('selector', get_text_tf),
            # vectorize
            ('tvec', TfidfVectorizer(stop_words=custom_stop, max_features=1200, 
                                     ngram_range=(1,2), min_df=100, max_df=.75))
        ]))
    ])),
    # model
    ('logreg', LogisticRegression(penalty='l1', C=1, solver = 'liblinear'))
])

pipe3_params = {}

In [61]:
gs3 = GridSearchCV(pipe3, pipe3_params, cv=5)

In [62]:
gs3.fit(X_union_train, y_union_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('union',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('numeric',
                                                                        FunctionTransformer(accept_sparse=False,
                                                                                            check_inverse=True,
                                                                                            func=<function get_numeric at 0x0000018BE814E798>,
                                                                                            inv_kw_args=None,
                                                                                            inverse_func=None,
                                                                                            kw_args=None,
          

In [63]:
# Score the model on train set
print(f"Model_3 Train score: {gs3.score(X_union_train, y_union_train)}")
# Score the model on test set
print(f"Model_3 Test score: {gs3.score(X_union_test, y_union_test)}")

Model_3 Train score: 0.9022869022869023
Model_3 Test score: 0.8933714001986097


#### Model 4 - Random Forest & CVEC (Text)

In [64]:
# CountVectorizer (transformer) & RandomForestClassifier (estimator)

pipe4 = Pipeline([
        ('cvec', CountVectorizer(stop_words=custom_stop, max_features=1200, ngram_range=(1,2))),
        ('rf', RandomForestClassifier(random_state=42))
])

pipe4_params = {
  #  'cvec__max_features': [1200, 1800, 2000, 2500],
  #  'cvec__stop_words': ['custom_stop'],
  #  'cvec__ngram_range': [(1,1), (1,2), (1,3)],
    'cvec__min_df': [50, 70],
    'cvec__max_df': [.55, .75],
    'rf__n_estimators': [10, 12],
    'rf__max_depth': [None, 5, 6],
    'rf__max_features': [None, 30],
}

gs4 = GridSearchCV(pipe4,  # object to be optimized
                   pipe4_params, # parameter values to be searched
                   cv=5, # 5 folds
                   verbose = 1,
                   n_jobs = -1
                  )

In [65]:
gs4.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   54.5s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  1.1min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=1200,
                                                        min_df=1,
                                                        ngram_range=(1, 2),
                                            

In [66]:
gs4.best_params_

{'cvec__max_df': 0.55,
 'cvec__min_df': 50,
 'rf__max_depth': None,
 'rf__max_features': None,
 'rf__n_estimators': 12}

In [67]:
gs4.best_score_

0.894826953650483

In [68]:
# Score the model on train set
print(f"Model_4 Train score: {gs4.score(X_train, y_train)}")
# Score the model on test set
print(f"Model_4 Test score: {gs4.score(X_test, y_test)}")

Model_4 Train score: 0.9706493824140883
Model_4 Test score: 0.8938679245283019


#### Model 5 - Random Forest & Tfidf Vectorizer

In [69]:
# CountVectorizer (transformer) & RandomForestClassifier (estimator)

pipe5 = Pipeline([
        ('tvec', CountVectorizer(stop_words=custom_stop, max_features=1200, ngram_range=(1,2))),
        ('rf', RandomForestClassifier(random_state=42))
])

pipe5_params = {
  #  'cvec__max_features': [1200, 1800, 2000, 2500],
  #  'cvec__stop_words': ['custom_stop'],
  #  'cvec__ngram_range': [(1,1), (1,2), (1,3)],
    'tvec__min_df': [50, 70],
    'tvec__max_df': [.55, .75],
    'rf__n_estimators': [10, 12],
    'rf__max_depth': [None, 5, 6],
    'rf__max_features': [None, 30, "sqrt"],
}

gs5 = GridSearchCV(pipe5,  # object to be optimized
                   pipe5_params, # parameter values to be searched
                   cv=5, # 5 folds
                   verbose = 1,
                   n_jobs = -1
                  )

In [70]:
gs5.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   51.0s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  1.4min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=1200,
                                                        min_df=1,
                                                        ngram_range=(1, 2),
                                            

In [71]:
gs5.best_score_

0.8961110431698667

In [None]:
gs5.best