# Preprossesing & Modeling

This notebook consists of various classification models to predict whether a tweet is related to an emergency situation or not during a wildfire. The goal is to choose best model that will generalize the information on unseen data with the highest accuracy score.

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer

import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

import regex as re

from bs4 import BeautifulSoup

In [2]:
import warnings
warnings.filterwarnings("once")

### Read the File

We have combined post-fire Twitter scrape files from Tick Fire, Saddleridge Fire, Kincade Fire, Maria Fire, Colorado fire occured in 2019. We have labeled tweets based on relevance to a natural disaster and combined them with pre-fire tweets within the same location to get 70/30 ratio of classes.

In [3]:
# read the file
labeled_df = pd.read_csv('./data/combined_tweets.csv')

### Data Cleaning 

In [4]:
labeled_df.drop('Unnamed: 0', axis=1, inplace=True)

In [5]:
labeled_df.head()

Unnamed: 0,target,text,time,sia_positive,sia_negative,sia_neutral,sia_compound
0,0,RT @JulianCastro: My grandmother was a domesti...,10/29/2019 22:56,0.0,0.0,1.0,0.0
1,1,RT @MayorOfLA: #GettyFire update | 8AM:\n\n- 5...,10/29/2019 22:56,0.0,0.0,1.0,0.0
2,1,Getty Fire Ignited by Power Line in Sepulveda ...,10/29/2019 22:56,0.0,0.211,0.789,-0.34
3,1,"RT @latimes: In an ominous new warning, the Na...",10/29/2019 22:56,0.0,0.202,0.798,-0.5859
4,1,Arson investigators from the Los Angeles Fire ...,10/29/2019 22:56,0.066,0.122,0.812,-0.2732


In [6]:
labeled_df.dtypes

target            int64
text             object
time             object
sia_positive    float64
sia_negative    float64
sia_neutral     float64
sia_compound    float64
dtype: object

In [7]:
labeled_df['target'].value_counts()

0    18719
1     5691
Name: target, dtype: int64

In [8]:
labeled_df.isnull().sum()

target          0
text            0
time            0
sia_positive    0
sia_negative    0
sia_neutral     0
sia_compound    0
dtype: int64

In [9]:
labeled_df['target'].value_counts(normalize=True)

0    0.766858
1    0.233142
Name: target, dtype: float64

In [10]:
custom_stop = list(ENGLISH_STOP_WORDS)
custom_stop.extend(["b'RT", "xe2", "x80", "x99", "xf0", "htpps", "xa6", "x9f", "Getty Center", "Los Angeles",
                    "Los Angele", "Getty", "x99m", "x99s", "Los", "Angele", "taco truck", "outfit", "taco", "truck",
                   "http", "https", "x94", "xa5", "nhttp", "nhttps", "b'", "Center", "amp", "GettyFire", "RT", "www",
                   "instagram", "xa6'b'RT", "xa6'b'", "xa6'RT", "xb8", "x9d", "xef", "x8f", 'http','colorado','co',
                    'springs','waldo','canyon','cofire','rt','boulder','waldocanyonfire', 'highparkfire','denverpost',
                    'denver','colo', 'tickfire', 'kincade', 'tick', 'kincadefire', 'getty', 'mariafire', 'saddleridge', 
                    'angele', 'angeles', 'center', 'gettyfire', 'los', 'sonoma', 'sonoma county'])

In [11]:
# define a function to clean text from html tags, non-leters, english stop words, custom stop words

def cleanup_lemmatize_text(document):
    # remove HTML
    text = BeautifulSoup(document).get_text()
    # remove non-letter characters
    letters = re.sub("[^a-zA-Z]", " ", document)    
    # all words lower case
    words = letters.lower().split()
    # remove stop words (english + custom)
    stops = set(custom_stop)
    clean_words = [word for word in words if word not in stops]
    # lemmatize cleaned up words
    lem_clean_words = [lemmatizer.lemmatize(token) for token in clean_words]
    
    # returned the cleand up string
    return(" ".join(clean_words))

In [12]:
# # initialize an empty list to hold the clean text
# clean_text = []

# for text in labeled_df['text']:
# #     convert title to words, then append to clean_text list
#    clean_text.append(cleanup_lemmatize_text(text))

In [13]:
# len(clean_text)

In [14]:
# labeled_df['text'] = clean_text

### Modeling

- Logistic Regression / Count Vectorizer (Only Text)
- Logistic Regression / Count Vectorizer (Text & Sentiment Scores)
- Logistic Regression / Tfidf Vectorizer (Text & Sentiment Scores)  
- Random Forest / Count Vectorizer (Only Text)

#### Model 1 - LR and CVEC (Text)

In [15]:
# define features variable(s) with only text column
X = labeled_df['text']

# define target variable
y = labeled_df['target']

In [16]:
# split the data into the training and testing sets
# set a random state for reproducibility 
# stratify y to combat slightly unbalanced classes
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    stratify=y,
                                                    random_state=42)

In [17]:
# Baseline accuracy on test data
y_test.value_counts(normalize=True)

0    0.766882
1    0.233118
Name: target, dtype: float64

In [18]:
# CountVectorizer (transformer) & LogisticRegression (estimator)

pipe1 = Pipeline([
        ('cvec', CountVectorizer(stop_words=custom_stop, max_features=120, ngram_range=(1,2))),
        ('logreg', LogisticRegression(solver = 'lbfgs'))
])

pipe1_params = {
  #  'cvec__max_features': [1200, 1800, 2000, 2500],
  #  'cvec__stop_words': ['custom_stop'],
  #  'cvec__ngram_range': [(1,1), (1,2), (1,3)],
    'cvec__min_df': [50, 100, 200],
    'cvec__max_df': [.55, .75, .85],
 #   'cvec__strip_accents': ['ascii'],
  #  'logreg__penalty': ['l1', 'l2'],
  #  'logreg__C': [2, 2.5, 3]
}

gs1 = GridSearchCV(pipe1,  # object to be optimized
                   pipe1_params, # parameter values to be searched
                   cv=5, # 5 folds
                   verbose = 1,
             #      n_jobs = -1
                  )

In [19]:
# fit gridsearch CV to train data
gs1.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))

  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:   28.7s finished
  'stop_words.' % sorted(inconsistent))


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=120,
                                                        min_df=1,
                                                        ngram_range=(1, 2),
                                             

In [20]:
# Best score
gs1.best_score_

0.8888956830133301

In [21]:
gs1.best_params_

{'cvec__max_df': 0.55, 'cvec__min_df': 50}

In [22]:
# Save the best model
gs1_model = gs1.best_estimator_

In [23]:
# Score best model on train set
print(f"Model_1 Train score: {gs1_model.score(X_train, y_train)}")
# Score best model on test set
print(f"Model_1 Test score: {gs1_model.score(X_test, y_test)}")

Model_1 Train score: 0.8906689494924789
Model_1 Test score: 0.8831926514399205


In [24]:
# Generate predictions
preds =gs1_model.predict(X_test)

In [25]:
X_test

8708     happy lightning mcqueen day apparently? . cars...
6173     b'@preevergreen my bf and I are both aries sun...
18558    @artistcouture Slid into my DMs earlier and ma...
13741            u have a long wow queue time or something
14541    @SaturnAwards @TallShipProds @Outlander_STARZ ...
                               ...                        
6022     b'@LAFDChief Following these orders and cooper...
3910     b'This week started of with a bang with some a...
4498     b'Canyon Country. #TickFire. Fire is running t...
20871    Promises made, promises kept thank you @POTUS ...
7882                 It deserves to u and I am from DALLAS
Name: text, Length: 8056, dtype: object

In [26]:
# Generate confusion matrix
confusion_matrix(y_test, # True values
                 preds)  # Predicted values

array([[6010,  168],
       [ 773, 1105]], dtype=int64)

In [27]:
confusion_matrix_df = pd.DataFrame(confusion_matrix(y_test,preds).ravel(), index=['TN', 'FP', 'FN', 'TP'], 
                      columns=['Count'])
confusion_matrix_df

Unnamed: 0,Count
TN,6010
FP,168
FN,773
TP,1105


In [28]:
text_features = gs1_model.named_steps['cvec'].get_feature_names()

In [29]:
gs1_coefs = gs1_model.named_steps['logreg'].coef_[0]

In [30]:
coef_dict = dict(zip(text_features, gs1_coefs))

In [31]:
coef_df = pd.DataFrame(coef_dict.items(), columns=['text_feature', 'coef'])

In [32]:
# Positive coefficients
coef_df.sort_values('coef', ascending=False).head(50)

Unnamed: 0,text_feature,coef
22,evacuation,3.996611
23,evacuations,3.526895
41,homes,2.840575
8,brush,2.737031
2,acres,2.691052
113,winds,2.671636
110,wildfire,2.558585
101,update,2.520255
82,santa,2.492025
63,near,2.48452


In [33]:
# Negative coefficients
coef_df.sort_values('coef').head(50)

Unnamed: 0,text_feature,coef
13,com,-3.81277
54,lmao,-2.141699
55,lol,-2.012674
100,twitter com,-1.920742
44,igshid,-1.912717
53,ll,-1.435654
45,im,-1.404584
37,happy,-1.265407
119,yes,-1.068881
59,love,-1.053417


In [34]:
import pickle

In [35]:
pickle.dump(gs1_model, open('model.p', 'wb+'))

  """Entry point for launching an IPython kernel.


In [36]:
model_from_pickle = pickle.load(open('model.p', 'rb'))
model_from_pickle

  """Entry point for launching an IPython kernel.


Pipeline(memory=None,
         steps=[('cvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.55,
                                 max_features=120, min_df=50,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=['have', 'co', 'through', 'himself',
                                             'where', 'top', 'toward', 'cry',
                                             'enough', 'whole', 'none',...
                                             'under', ...],
                                 strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('logreg',
                 LogisticRegre

In [37]:
model_from_pickle.predict(np.array(['fire wind help']))

  'stop_words.' % sorted(inconsistent))


array([1], dtype=int64)

#### Model 2 - LR and CVEC (Text & Numeric Data)

In [38]:
# set X and y
# this time we will include some numeric features to see their impact
X_union = labeled_df[['text', 'sia_positive', 'sia_negative','sia_neutral', 'sia_compound']]
y_union = labeled_df['target']

In [39]:
# split the data
X_union_train, X_union_test, y_union_train, y_union_test = train_test_split(X_union, y_union, test_size=0.33, 
                                                                               stratify=y, random_state=42)

In [40]:
# create functions to group features based on data type
def get_text(data):
    return data['text']

def get_numeric(data):
    return data[['sia_positive', 'sia_negative','sia_neutral', 'sia_compound']]

In [41]:
# create function transformer for text features
get_text_tf = FunctionTransformer(get_text, validate=False)
# create function transformer for numeric features
get_numeric_tf = FunctionTransformer(get_numeric, validate=False)

In [45]:
# create the new pipeline
pipe2 = Pipeline([
    # feature union
    ('union', FeatureUnion([
        # numeric
        ('numeric', get_numeric_tf),
        # text
        ('text', Pipeline([
            # extract text
            ('selector', get_text_tf),
            # vectorize
            ('cvec', CountVectorizer(stop_words=custom_stop, max_features=1200, ngram_range=(1,2)))
        ]))
    ])),
    # model
    ('logreg', LogisticRegression(penalty='l2', C=1, solver = 'lbfgs'))
])

pipe2_params = {}

In [46]:
gs2 = GridSearchCV(pipe2, pipe2_params, cv=5)

In [47]:
gs2.fit(X_union_train, y_union_train)

  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('union',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('numeric',
                                                                        FunctionTransformer(accept_sparse=False,
                                                                                            check_inverse=True,
                                                                                            func=<function get_numeric at 0x000001E722051708>,
                                                                                            inv_kw_args=None,
                                                                                            inverse_func=None,
                                                                                            kw_args=None,
          

In [48]:
# Score the model on train set
print(f"Model_2 Train score: {gs2.score(X_union_train, y_union_train)}")
# Score the model on test set
print(f"Model_2 Test score: {gs2.score(X_union_test, y_union_test)}")

Model_2 Train score: 0.9335942277118747
Model_2 Test score: 0.9129841112214498


#### Model 3 - LR & Tfidf (Text & Numeric Data)

In [49]:
# create the new pipeline
pipe3 = Pipeline([
    # feature union
    ('union', FeatureUnion([
        # numeric
        ('numeric', get_numeric_tf),
        # text
        ('text', Pipeline([
            # extract text
            ('selector', get_text_tf),
            # vectorize
            ('tvec', TfidfVectorizer(stop_words=custom_stop, max_features=1200, 
                                     ngram_range=(1,2), min_df=100, max_df=.75))
        ]))
    ])),
    # model
    ('logreg', LogisticRegression(penalty='l1', C=1, solver = 'lbfgs'))
])

pipe3_params = {}

In [50]:
gs3 = GridSearchCV(pipe3, pipe3_params, cv=5)

In [51]:
gs3.fit(X_union_train, y_union_train)

  'stop_words.' % sorted(inconsistent))


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

In [None]:
# Score the model on train set
print(f"Model_3 Train score: {gs3.score(X_union_train, y_union_train)}")
# Score the model on test set
print(f"Model_3 Test score: {gs3.score(X_union_test, y_union_test)}")

#### Model 4 - Random Forest & CVEC (Text)

In [52]:
# CountVectorizer (transformer) & RandomForestClassifier (estimator)

pipe4 = Pipeline([
        ('cvec', CountVectorizer(stop_words=custom_stop, max_features=1200, ngram_range=(1,2))),
        ('rf', RandomForestClassifier(random_state=42))
])

pipe4_params = {
  #  'cvec__max_features': [1200, 1800, 2000, 2500],
  #  'cvec__stop_words': ['custom_stop'],
  #  'cvec__ngram_range': [(1,1), (1,2), (1,3)],
    'cvec__min_df': [50, 70],
    'cvec__max_df': [.55, .75],
    'rf__n_estimators': [10, 12],
    'rf__max_depth': [None, 5, 6],
    'rf__max_features': [None, 30],
}

gs4 = GridSearchCV(pipe4,  # object to be optimized
                   pipe4_params, # parameter values to be searched
                   cv=5, # 5 folds
                   verbose = 1,
                   n_jobs = -1
                  )

In [53]:
gs4.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   50.5s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  1.1min finished
  'stop_words.' % sorted(inconsistent))


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=1200,
                                                        min_df=1,
                                                        ngram_range=(1, 2),
                                            

In [54]:
gs4.best_params_

{'cvec__max_df': 0.55,
 'cvec__min_df': 50,
 'rf__max_depth': None,
 'rf__max_features': 30,
 'rf__n_estimators': 10}

In [55]:
gs4.best_score_

0.8918307447719213

In [56]:
# Score the model on train set
print(f"Model_4 Train score: {gs4.score(X_train, y_train)}")
# Score the model on test set
print(f"Model_4 Test score: {gs4.score(X_test, y_test)}")

Model_4 Train score: 0.9619664913782561
Model_4 Test score: 0.8954816285998014


#### Model 5 - Random Forest & Tfidf Vectorizer

In [57]:
# CountVectorizer (transformer) & RandomForestClassifier (estimator)

pipe5 = Pipeline([
        ('tvec', CountVectorizer(stop_words=custom_stop, max_features=1200, ngram_range=(1,2))),
        ('rf', RandomForestClassifier(random_state=42))
])

pipe5_params = {
  #  'cvec__max_features': [1200, 1800, 2000, 2500],
  #  'cvec__stop_words': ['custom_stop'],
  #  'cvec__ngram_range': [(1,1), (1,2), (1,3)],
    'tvec__min_df': [50, 70],
    'tvec__max_df': [.55, .75],
    'rf__n_estimators': [10, 12],
    'rf__max_depth': [None, 5, 6],
    'rf__max_features': [None, 30, "sqrt"],
}

gs5 = GridSearchCV(pipe5,  # object to be optimized
                   pipe5_params, # parameter values to be searched
                   cv=5, # 5 folds
                   verbose = 1,
                   n_jobs = -1
                  )

In [58]:
gs5.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   53.6s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  1.6min finished
  'stop_words.' % sorted(inconsistent))


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=1200,
                                                        min_df=1,
                                                        ngram_range=(1, 2),
                                            

In [59]:
gs5.best_score_

0.8918307447719213