In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer

import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

import regex as re

from bs4 import BeautifulSoup

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# read the file
fires_labeled_df = pd.read_csv('./fire_filtered.csv')

In [44]:
nonfires_labeled_df = pd.read_csv('./la_nonfire_1600_tweets.csv')

In [46]:
nonfires_labeled_df.head()

Unnamed: 0,text,time,long,lat,target
0,pic.twitter.com/3VCVQKyBmm,2019-08-20 10:00:00,34.1826,-118.4397,0
1,"Tuesday, i saw no more #ghosttownalive founder...",2019-08-20 10:00:00,34.1826,-118.4397,0
2,Of course ALL THE ABOVE!!,2019-08-20 10:00:00,34.1826,-118.4397,0
3,I'm too busy looking for the talent.,2019-08-20 10:00:00,34.1826,-118.4397,0
4,Lucky enough to celebrate the end of Justin’s ...,2019-08-20 10:00:00,34.1826,-118.4397,0


In [4]:
fires_labeled_df.drop('Unnamed: 0', axis=1, inplace=True)

In [5]:
fires_labeled_df.head()

Unnamed: 0,time,related,informative,text
0,10/29/2019 22:56,U,N,RT @JulianCastro: My grandmother was a domesti...
1,10/29/2019 22:56,R,I,RT @MayorOfLA: #GettyFire update | 8AM:\n\n- 5...
2,10/29/2019 22:56,R,I,Getty Fire Ignited by Power Line in Sepulveda ...
3,10/29/2019 22:56,R,I,"RT @latimes: In an ominous new warning, the Na..."
4,10/29/2019 22:56,R,I,Arson investigators from the Los Angeles Fire ...


In [6]:
fires_labeled_df.dtypes

time           object
related        object
informative    object
text           object
dtype: object

In [7]:
fires_labeled_df['target'] = np.where((fires_labeled_df['related'] == 'R') & (fires_labeled_df['informative'] == 'I'), 1, 0)

In [8]:
fires_labeled_df

Unnamed: 0,time,related,informative,text,target
0,10/29/2019 22:56,U,N,RT @JulianCastro: My grandmother was a domesti...,0
1,10/29/2019 22:56,R,I,RT @MayorOfLA: #GettyFire update | 8AM:\n\n- 5...,1
2,10/29/2019 22:56,R,I,Getty Fire Ignited by Power Line in Sepulveda ...,1
3,10/29/2019 22:56,R,I,"RT @latimes: In an ominous new warning, the Na...",1
4,10/29/2019 22:56,R,I,Arson investigators from the Los Angeles Fire ...,1
...,...,...,...,...,...
2190,10/25/2019 18:27,R,I,"b""Friday fire update from Healdsburg. It's smo...",1
2191,10/25/2019 18:23,R,I,b'RT @tedappel: Broken equipment found on @PGE...,1
2192,10/25/2019 18:21,R,I,"b'Whelp, nother wildfire about 30 miles north,...",1
2193,10/25/2019 18:11,R,I,"b'RT @KinsellaWine: Fire in Geyserville, one t...",1


In [40]:
fires_labeled_df['target'].value_counts()

1    1884
0     311
Name: target, dtype: int64

In [51]:
fires_labeled_df.drop(columns=['related', 'informative'], inplace=True)

In [52]:
nonfires_labeled_df.drop(columns=['long', 'lat'], inplace=True)

In [84]:
labeled_df = pd.concat([fires_labeled_df, nonfires_labeled_df])
labeled_df = labeled_df.drop_duplicates().dropna().reset_index(drop=True)

In [85]:
labeled_df

Unnamed: 0,target,text,time
0,0,rt juliancastro grandmother domestic worker cl...,10/29/2019 22:56
1,1,rt mayorofla gettyfire update n n containment ...,10/29/2019 22:56
2,1,getty ignited power line sepulveda pass t f tx...,10/29/2019 22:56
3,1,rt latimes ominous new warning national weathe...,10/29/2019 22:56
4,1,arson investigators los angeles department sai...,10/29/2019 22:56
...,...,...,...
3789,0,Too awesome! Will be shopping for my Titan pup...,2019-08-20 10:00:00
3790,0,This was an awesome find by @MitchTheFort. So ...,2019-08-20 10:00:00
3791,0,Some great company here*. *ignoring 2015https:...,2019-08-20 10:00:00
3792,0,ur coming out im so proud of u,2019-08-20 10:00:00


In [86]:
labeled_df.isnull().sum()

target    0
text      0
time      0
dtype: int64

In [63]:
labeled_df['target'].value_counts(normalize=True)

0    0.503557
1    0.496443
Name: target, dtype: float64

In [64]:
custom_stop = list(ENGLISH_STOP_WORDS)
custom_stop.extend(["b'RT", "xe2", "x80", "x99", "xf0", "htpps", "xa6", "x9f", "Getty Center", "Los Angeles",
                    "Los Angele", "Getty", "x99m", "x99s", "Los", "Angele", "taco truck", "outfit", "taco", "truck",
                   "http", "https", "x94", "xa5", "nhttp", "nhttps", "b'", "Center", "amp", "GettyFire", "RT", "www",
                   "instagram", "xa6'b'RT", "xa6'b'", "xa6'RT", "xb8", "x9d", "xef", "x8f"])

In [70]:
# define a function to clean text from html tags, non-leters, english stop words, custom stop words

def cleanup_lemmatize_text(document):
    # remove HTML
    # text = BeautifulSoup(document).get_text()
    # remove non-letter characters
    letters = re.sub("[^a-zA-Z]", " ", document)    
    # all words lower case
    words = letters.lower().split()
    # remove stop words (english + custom)
    stops = set(custom_stop)
    clean_words = [word for word in words if word not in stops]
    # lemmatize cleaned up words
    # lem_clean_words = [lemmatizer.lemmatize(token) for token in clean_words]
    
    # returned the cleand up string
    return(" ".join(clean_words))

In [71]:
# # initialize an empty list to hold the clean titles
# clean_text = []

# for text in labeled_df['text']:
#     # convert title to words, then append to clean_titles list
#     clean_text.append(cleanup_lemmatize_text(text))

In [67]:
len(clean_text)

2342

In [13]:
fires_labeled_df['text'] = clean_text

In [88]:
# define features variable(s) with only title column
# I will not add body because first subreddit has a lot of blank posts that would cause unbalanced features
X = labeled_df['text']

# define target variable
y = labeled_df['target']

In [89]:
# split the data into the training and testing sets
# set a random state for reproducibility 
# stratify y to combat slightly unbalanced classes
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    stratify=y,
                                                    random_state=42)

In [78]:
# Baseline accuracy on test data
y_test.value_counts(normalize=True)

0    0.503591
1    0.496409
Name: target, dtype: float64

In [90]:
# CountVectorizer (transformer) & LogisticRegression (estimator)

pipe1 = Pipeline([
        ('cvec', CountVectorizer(stop_words=custom_stop, max_features=1200, ngram_range=(1,2))),
        ('logreg', LogisticRegression())
])

pipe1_params = {
  #  'cvec__max_features': [1200, 1800, 2000, 2500],
  #  'cvec__stop_words': ['custom_stop'],
  #  'cvec__ngram_range': [(1,1), (1,2), (1,3)],
    'cvec__min_df': [50, 100, 200],
    'cvec__max_df': [.55, .75, .85],
 #   'cvec__strip_accents': ['ascii'],
  #  'logreg__penalty': ['l1', 'l2'],
  #  'logreg__C': [2, 2.5, 3]
}

gs1 = GridSearchCV(pipe1,  # object to be optimized
                   pipe1_params, # parameter values to be searched
                   cv=5, # 5 folds
                   verbose = 1,
             #      n_jobs = -1
                  )

In [91]:
# fit gridsearch CV to train data
gs1.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    9.0s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=1200,
                                                        min_df=1,
                                                        ngram_range=(1, 2),
                                            

In [92]:
# Best score
gs1.best_score_

0.8815426997245179

In [93]:
gs1.best_params_

{'cvec__max_df': 0.55, 'cvec__min_df': 50}

In [94]:
# Save the best model
gs1_model = gs1.best_estimator_

In [95]:
# Score best model on train set
print(f"Model_1 Train score: {gs1_model.score(X_train, y_train)}")
# Score best model on test set
print(f"Model_1 Test score: {gs1_model.score(X_test, y_test)}")

Model_1 Train score: 0.8902007083825265
Model_1 Test score: 0.8898643256185156


In [96]:
# Generate predictions
preds =gs1_model.predict(X_test)

In [97]:
# Generate confusion matrix
confusion_matrix(y_test, # True values
                 preds)  # Predicted values

array([[575,  56],
       [ 82, 540]], dtype=int64)

In [32]:
tn, fp, fn, tp = confusion_matrix(y_test,
                                  preds).ravel()

In [34]:
fp

101

In [35]:
fn

4

In [36]:
tp

618

In [37]:
tn

2