In [241]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer

import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

import regex as re

from bs4 import BeautifulSoup

In [242]:
import warnings
warnings.filterwarnings('ignore')

In [141]:
# read the file
fires_labeled_df = pd.read_csv('./fire_filtered.csv')

In [142]:
nonfires_labeled_df = pd.read_csv('./la_nonfire_1600_tweets.csv')

In [155]:
colorado_labeled_df = pd.read_csv('./Colorado tweets & target.csv')

In [198]:
extra1200 = pd.read_csv('./la_nonfire_1200_tweets.csv')

In [199]:
nonfires_labeled_df.head()

Unnamed: 0,text,time,target
0,pic.twitter.com/3VCVQKyBmm,2019-08-20 10:00:00,0
1,"Tuesday, i saw no more #ghosttownalive founder...",2019-08-20 10:00:00,0
2,Of course ALL THE ABOVE!!,2019-08-20 10:00:00,0
3,I'm too busy looking for the talent.,2019-08-20 10:00:00,0
4,Lucky enough to celebrate the end of Justin’s ...,2019-08-20 10:00:00,0


In [200]:
colorado_labeled_df

Unnamed: 0,time,text,target
0,2.110410e+17,#Intern #US #TATTOO #Wisconsin #Ohio #NC #PA #...,0
1,2.111120e+17,RT @Jack4Ward: Get in on the fun every Thursda...,0
2,2.111570e+17,Welcome to our newest STUDENTathlete- Reagan B...,0
3,2.111630e+17,Denver Post: #Colorado governor signs bill cre...,0
4,2.112170e+17,Pretty sure I'm going to live in Manitou Sprin...,0
...,...,...,...
1195,2.220450e+17,Colorado’s Waldo Canyon Fire reaches 98% conta...,1
1196,2.220710e+17,RT @cnnbrk: Colorado governor lifts statewide ...,1
1197,2.220720e+17,#SiguemeYTeSigo Colorado governor lifts statew...,1
1198,2.220820e+17,RT @News1130radio: BC Wildfire Management says...,1


In [202]:
fires_labeled_df.head()

Unnamed: 0,time,target,text
0,10/29/2019 22:56,0,RT @JulianCastro: My grandmother was a domesti...
1,10/29/2019 22:56,1,RT @MayorOfLA: #GettyFire update | 8AM:\n\n- 5...
2,10/29/2019 22:56,1,Getty Fire Ignited by Power Line in Sepulveda ...
3,10/29/2019 22:56,1,"RT @latimes: In an ominous new warning, the Na..."
4,10/29/2019 22:56,1,Arson investigators from the Los Angeles Fire ...


In [203]:
fires_labeled_df.dtypes

time      object
target     int64
text      object
dtype: object

In [7]:
# fires_labeled_df['target'] = np.where((fires_labeled_df['related'] == 'R') & (fires_labeled_df['informative'] == 'I'), 1, 0)

In [204]:
fires_labeled_df

Unnamed: 0,time,target,text
0,10/29/2019 22:56,0,RT @JulianCastro: My grandmother was a domesti...
1,10/29/2019 22:56,1,RT @MayorOfLA: #GettyFire update | 8AM:\n\n- 5...
2,10/29/2019 22:56,1,Getty Fire Ignited by Power Line in Sepulveda ...
3,10/29/2019 22:56,1,"RT @latimes: In an ominous new warning, the Na..."
4,10/29/2019 22:56,1,Arson investigators from the Los Angeles Fire ...
...,...,...,...
7207,10/23/2019 20:04,1,b'LA 6407 Dix St **Report of Fire** https://t....
7208,10/23/2019 20:03,0,b'RT @AyseTrixstarEnt: @eyemnomadic Fire \xf0\...
7209,10/23/2019 20:01,0,b'Fireman: the roof is on fire \n\nMe: ayyyy \...
7210,10/23/2019 20:00,0,"b'RT @MusicForRelief: There were still 19,925 ..."


In [205]:
fires_labeled_df['target'].value_counts()

1    5007
0    2205
Name: target, dtype: int64

In [150]:
# fires_labeled_df.drop(columns=['related', 'informative'], inplace=True)

In [207]:
extra1200.drop(columns=['long', 'lat'], inplace=True)

In [208]:
labeled_df = pd.concat([fires_labeled_df, nonfires_labeled_df, colorado_labeled_df, extra1200])
labeled_df = labeled_df.drop_duplicates().dropna().reset_index(drop=True)

In [209]:
labeled_df

Unnamed: 0,target,text,time
0,0,RT @JulianCastro: My grandmother was a domesti...,10/29/2019 22:56
1,1,RT @MayorOfLA: #GettyFire update | 8AM:\n\n- 5...,10/29/2019 22:56
2,1,Getty Fire Ignited by Power Line in Sepulveda ...,10/29/2019 22:56
3,1,"RT @latimes: In an ominous new warning, the Na...",10/29/2019 22:56
4,1,Arson investigators from the Los Angeles Fire ...,10/29/2019 22:56
...,...,...,...
11205,0,Yessss lolhttps://twitter.com/FamIsAllIGot/sta...,2019-08-20 10:00:00
11206,0,pic.twitter.com/KuY8zjW97s,2019-08-20 10:00:00
11207,0,Mom was kinda weird about the whole lesbian ba...,2019-08-20 10:00:00
11208,0,Drinking a Monk's Lunch by @RitualBrewing @The...,2019-08-20 10:00:00


## Adding Sentiment Score to the DF

In [210]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [211]:
# instantiate the sentiment analyzer
sia = SentimentIntensityAnalyzer()

In [212]:
sia_table = []
for i in labeled_df['text']:
    sia_scores = {}
    sia_scores['sia_positive'] = sia.polarity_scores(i)['pos']
    sia_scores['sia_negative'] = sia.polarity_scores(i)['neg']
    sia_scores['sia_neutral'] = sia.polarity_scores(i)['neu']
    sia_scores['sia_compound'] = sia.polarity_scores(i)['compound']
    
    sia_table.append(sia_scores)

In [213]:
sia_df = pd.DataFrame(sia_table)

In [214]:
sia_df.head()

Unnamed: 0,sia_positive,sia_negative,sia_neutral,sia_compound
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.211,0.789,-0.34
3,0.0,0.202,0.798,-0.5859
4,0.066,0.122,0.812,-0.2732


In [215]:
labeled_df.head()

Unnamed: 0,target,text,time
0,0,RT @JulianCastro: My grandmother was a domesti...,10/29/2019 22:56
1,1,RT @MayorOfLA: #GettyFire update | 8AM:\n\n- 5...,10/29/2019 22:56
2,1,Getty Fire Ignited by Power Line in Sepulveda ...,10/29/2019 22:56
3,1,"RT @latimes: In an ominous new warning, the Na...",10/29/2019 22:56
4,1,Arson investigators from the Los Angeles Fire ...,10/29/2019 22:56


In [216]:
labeled_df = pd.concat([labeled_df, sia_df], axis=1)

In [217]:
labeled_df

Unnamed: 0,target,text,time,sia_positive,sia_negative,sia_neutral,sia_compound
0,0,RT @JulianCastro: My grandmother was a domesti...,10/29/2019 22:56,0.000,0.000,1.000,0.0000
1,1,RT @MayorOfLA: #GettyFire update | 8AM:\n\n- 5...,10/29/2019 22:56,0.000,0.000,1.000,0.0000
2,1,Getty Fire Ignited by Power Line in Sepulveda ...,10/29/2019 22:56,0.000,0.211,0.789,-0.3400
3,1,"RT @latimes: In an ominous new warning, the Na...",10/29/2019 22:56,0.000,0.202,0.798,-0.5859
4,1,Arson investigators from the Los Angeles Fire ...,10/29/2019 22:56,0.066,0.122,0.812,-0.2732
...,...,...,...,...,...,...,...
11205,0,Yessss lolhttps://twitter.com/FamIsAllIGot/sta...,2019-08-20 10:00:00,0.000,0.000,1.000,0.0000
11206,0,pic.twitter.com/KuY8zjW97s,2019-08-20 10:00:00,0.000,0.000,1.000,0.0000
11207,0,Mom was kinda weird about the whole lesbian ba...,2019-08-20 10:00:00,0.063,0.137,0.800,-0.2975
11208,0,Drinking a Monk's Lunch by @RitualBrewing @The...,2019-08-20 10:00:00,0.000,0.000,1.000,0.0000


In [218]:
labeled_df['target'].value_counts()

1    5691
0    5519
Name: target, dtype: int64

In [219]:
labeled_df.isnull().sum()

target          0
text            0
time            0
sia_positive    0
sia_negative    0
sia_neutral     0
sia_compound    0
dtype: int64

labeled_df['target'].value_counts(normalize=True)

In [221]:
custom_stop = list(ENGLISH_STOP_WORDS)
custom_stop.extend(["b'RT", "xe2", "x80", "x99", "xf0", "htpps", "xa6", "x9f", "Getty Center", "Los Angeles",
                    "Los Angele", "Getty", "x99m", "x99s", "Los", "Angele", "taco truck", "outfit", "taco", "truck",
                   "http", "https", "x94", "xa5", "nhttp", "nhttps", "b'", "Center", "amp", "GettyFire", "RT", "www",
                   "instagram", "xa6'b'RT", "xa6'b'", "xa6'RT", "xb8", "x9d", "xef", "x8f", 'http','colorado','co',
                    'springs','waldo','canyon','cofire','rt','boulder','waldocanyonfire', 'highparkfire',
                    'denverpost','denver','colo'])

In [222]:
# define a function to clean text from html tags, non-leters, english stop words, custom stop words

def cleanup_lemmatize_text(document):
    # remove HTML
    # text = BeautifulSoup(document).get_text()
    # remove non-letter characters
    letters = re.sub("[^a-zA-Z]", " ", document)    
    # all words lower case
    words = letters.lower().split()
    # remove stop words (english + custom)
    stops = set(custom_stop)
    clean_words = [word for word in words if word not in stops]
    # lemmatize cleaned up words
    # lem_clean_words = [lemmatizer.lemmatize(token) for token in clean_words]
    
    # returned the cleand up string
    return(" ".join(clean_words))

In [71]:
# # initialize an empty list to hold the clean titles
# clean_text = []

# for text in labeled_df['text']:
#     # convert title to words, then append to clean_titles list
#     clean_text.append(cleanup_lemmatize_text(text))

In [67]:
len(clean_text)

2342

In [13]:
fires_labeled_df['text'] = clean_text

In [223]:
# define features variable(s) with only title column
# I will not add body because first subreddit has a lot of blank posts that would cause unbalanced features
X = labeled_df['text']

# define target variable
y = labeled_df['target']

In [224]:
# split the data into the training and testing sets
# set a random state for reproducibility 
# stratify y to combat slightly unbalanced classes
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    stratify=y,
                                                    random_state=42)

In [225]:
# Baseline accuracy on test data
y_test.value_counts(normalize=True)

1    0.507568
0    0.492432
Name: target, dtype: float64

In [234]:
# CountVectorizer (transformer) & LogisticRegression (estimator)

pipe1 = Pipeline([
        ('cvec', CountVectorizer(stop_words=custom_stop, max_features=1200, ngram_range=(1,2))),
        ('logreg', LogisticRegression())
])

pipe1_params = {
  #  'cvec__max_features': [1200, 1800, 2000, 2500],
  #  'cvec__stop_words': ['custom_stop'],
  #  'cvec__ngram_range': [(1,1), (1,2), (1,3)],
    'cvec__min_df': [20, 40, 50],
    'cvec__max_df': [.35, .45, .55],
 #   'cvec__strip_accents': ['ascii'],
  #  'logreg__penalty': ['l1', 'l2'],
  #  'logreg__C': [2, 2.5, 3]
}

gs1 = GridSearchCV(pipe1,  # object to be optimized
                   pipe1_params, # parameter values to be searched
                   cv=5, # 5 folds
                   verbose = 1,
             #      n_jobs = -1
                  )

In [235]:
# fit gridsearch CV to train data
gs1.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:   28.8s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=1200,
                                                        min_df=1,
                                                        ngram_range=(1, 2),
                                            

In [236]:
# Best score
gs1.best_score_

0.8398135818908122

In [237]:
gs1.best_params_

{'cvec__max_df': 0.35, 'cvec__min_df': 20}

In [238]:
# Save the best model
gs1_model = gs1.best_estimator_

In [239]:
# Score best model on train set
print(f"Model_1 Train score: {gs1_model.score(X_train, y_train)}")
# Score best model on test set
print(f"Model_1 Test score: {gs1_model.score(X_test, y_test)}")

Model_1 Train score: 0.8697736351531291
Model_1 Test score: 0.8418918918918918


In [232]:
# Generate predictions
preds =gs1_model.predict(X_test)

In [233]:
# Generate confusion matrix
confusion_matrix(y_test, # True values
                 preds)  # Predicted values

array([[1623,  199],
       [ 432, 1446]], dtype=int64)

In [136]:
tn, fp, fn, tp = confusion_matrix(y_test,
                                  preds).ravel()

In [137]:
fp

157

In [138]:
fn

364

In [139]:
tp

1289

In [140]:
tn

1098

## Explort DataFrame

In [240]:
labeled_df.to_csv('labeled_df.csv')