In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV

%matplotlib inline

In [2]:
combined_pre = pd.read_csv('../assets/combined_pre_classed.csv', index_col= 0)
combined_post = pd.read_csv('../assets/combined_post_classed.csv', index_col=0)

In [3]:
combined_post.head()

Unnamed: 0,text,disaster
0,live in rd ward at peggy park giving bacc to...,0
1,accident cleared in bellaire on w lp nb ...,0
2,a nice cold saintarnoldbrewing art car ipa af...,0
3,where to shop sale of unique items helps aid...,1
4,aerial tour shows devastation of epic floodin...,1


In [4]:
combined_pre.head()

Unnamed: 0,text,disaster
0,baytown pd man sought after teen suspect kil...,0
1,amy st houston tx,0
2,crazy about this sisterhood stephen f austi...,0
3,drinking a steam punk rousing steam lager by ...,0
4,reposted with the right cosplayer this time ...,0


In [5]:
combined_post.disaster.value_counts()

0    44896
1    22231
Name: disaster, dtype: int64

In [6]:
disaster_keywords = []
with open('../assets/disaster_keywords.txt') as f:
    for phrase in f:
        disaster_keywords.append(phrase.strip())
disaster_keywords[:5]

['earthquake', 'hurricane', 'fire', 'emergency', 'help']

In [7]:
add_stop = ['rt', 'twitter', 'com', 'net', 'carr', 'irma', 'harvey',
            'maria', 'carrfire', 'camp', 'campfire', 'woolsey', 
            'woolseyfire', 'prague', 'north', 'northbay', 'news',
            'hurricaneirma', 'hurricaneharvey', 'hurricanemaria',
            'headline', 'breaking']

In [8]:
disaster_set = set(disaster_keywords)
disaster_list = list(disaster_set)

In [9]:
disaster_list.extend(add_stop)

In [10]:
stopwords = nltk.corpus.stopwords.words('english')

In [11]:
stopwords.extend(disaster_list)

In [12]:
combined_df = pd.concat([combined_pre, 
                        combined_post], 
                       ignore_index=True)

In [13]:
combined_df.shape

(98550, 2)

In [14]:
combined_df.head(2)

Unnamed: 0,text,disaster
0,baytown pd man sought after teen suspect kil...,0
1,amy st houston tx,0


In [15]:
combined_df.disaster.value_counts(normalize=True)

0    0.774419
1    0.225581
Name: disaster, dtype: float64

In [16]:
y = combined_df['disaster']

# Set X as text column.
X = combined_df['text']

In [40]:
tfidf = TfidfVectorizer(stop_words = stopwords, 
                        max_df=0.95, 
                        min_df=5, max_features=10000)

In [41]:
combined_tfidf = tfidf.fit_transform(X)

  'stop_words.' % sorted(inconsistent))


In [42]:
combined_df_tfidf = pd.SparseDataFrame(combined_tfidf,
                             columns=tfidf.get_feature_names())

In [43]:
combined_df_tfidf.shape

(98550, 10000)

In [44]:
import sys

In [45]:
sys.getsizeof(combined_df_tfidf) * 1e-9

0.011253576000000001

In [46]:
combined_df_tfidf.head()

Unnamed: 0,__,aa,aaa,aan,aaron,aaroncarter,ab,abandoned,abatido,abby,...,zswaggers,zswagtour,zt,zu,zumba,zv,zw,zx,zy,zz
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [47]:
combined_df_tfidf.fillna(0, inplace=True)

In [48]:
combined_df_tfidf.sum().sort_values(ascending = False).head(20)

florida       4685.343695
keys          4240.856064
pic           2697.864702
california    1886.096603
houston       1882.239273
oaks          1818.031640
thousand      1744.670375
key           1317.453544
shooting      1314.783959
bar           1117.782834
texas         1100.405905
people         987.690306
west           980.511379
landfall       829.844184
via            760.199409
like           750.710482
category       701.086951
love           682.817763
get            661.755415
mass           661.354474
dtype: float64

In [49]:
y.shape

(98550,)

In [50]:
combined_df_tfidf.shape

(98550, 10000)

In [51]:
combined_array = np.asarray(combined_df_tfidf)

In [54]:
sys.getsizeof(combined_array)

112

In [52]:
sys.getsizeof(combined_array) * 1e-9

1.1200000000000001e-07

In [None]:
# Train/test split our data.
# X_train, X_test, y_train, y_test = train_test_split(X,
#                                                     y,
#                                                     test_size = 0.33,
#                                                     random_state = 42)

In [55]:
# Import PCA.
from sklearn.decomposition import PCA

In [56]:
# Instantiate PCA.
pca = PCA(n_components=100)

In [57]:
pd.DataFrame(combined_array).shape

(98550, 10000)

In [None]:
# Fit PCA on the training data.
Z_combined = pca.fit_transform(combined_array)

# n_component will default to number of features in this case 9999 features.

In [38]:
pd.DataFrame(Z_combined).describe()

# column 0 is the first principal component

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
count,98550.0,98550.0,98550.0,98550.0,98550.0,98550.0,98550.0,98550.0,98550.0,98550.0,...,98550.0,98550.0,98550.0,98550.0,98550.0,98550.0,98550.0,98550.0,98550.0,98550.0
mean,-1.778606e-15,-1.010376e-15,-4.18962e-16,-9.012930000000001e-17,-7.028897e-17,1.597978e-16,8.339843000000001e-17,-2.131758e-16,-9.182959e-17,2.7024230000000002e-17,...,5.062048e-18,2.5300240000000002e-17,1.4362850000000002e-17,1.0935450000000001e-17,-5.541969e-17,-1.895483e-17,9.368444e-18,-2.0966130000000002e-17,1.652707e-17,4.0255969999999997e-19
std,0.1650207,0.1402026,0.1190333,0.1152824,0.09929207,0.08122586,0.07848397,0.07489747,0.07280805,0.0722526,...,0.04182133,0.04164577,0.04147493,0.04145434,0.04122584,0.0411507,0.04090029,0.04082382,0.04058531,0.0402356
min,-0.4537709,-0.6122365,-0.1691763,-0.2467681,-0.2595318,-0.2737119,-0.1931123,-0.6342293,-0.3328845,-0.5230166,...,-0.2576357,-0.3348824,-0.2701302,-0.2466682,-0.2320357,-0.283889,-0.2941234,-0.2629867,-0.2785617,-0.2292427
25%,-0.07326945,-0.0281067,-0.09020374,-0.06105505,-0.03671743,-0.0214589,-0.02338607,-0.005491663,-0.02188994,-0.007039934,...,-0.01790846,-0.01658479,-0.01597144,-0.01845799,-0.01779103,-0.0178309,-0.01793489,-0.01690202,-0.01679546,-0.01819264
50%,-0.05886256,0.01101424,-0.02435538,-0.01047768,-0.02221602,0.0004776592,-0.005437872,0.0006003797,0.0002718021,-0.001817432,...,-0.003576961,0.001391813,0.002642997,0.001298277,-0.001868363,-0.003417279,-0.001719528,0.001793307,0.003450941,0.0007620825
75%,0.1265291,0.03890583,0.0535759,-0.003932853,-0.00398295,0.004613698,0.004999671,0.005792153,0.009349469,0.00717168,...,0.01608945,0.01638583,0.01593548,0.01775526,0.01503107,0.01688448,0.01618591,0.01698397,0.0181362,0.01590833
max,0.7564889,0.6902804,0.6468355,0.9442297,0.8762766,0.7320055,0.8423308,0.7004911,0.7105983,0.8021622,...,0.3386977,0.3713849,0.4458523,0.3145512,0.3606883,0.3316814,0.3226459,0.307184,0.2891618,0.2649695


In [39]:
var_exp = pca.explained_variance_ratio_
print(f'Explained variance: {np.round(var_exp,3)}')

cum_var_exp = np.cumsum(var_exp)
print(f'Cumulative explained variance: {np.round(cum_var_exp,3)}')

Explained variance: [0.029 0.021 0.015 0.014 0.01  0.007 0.006 0.006 0.006 0.005 0.005 0.005
 0.005 0.005 0.005 0.004 0.004 0.004 0.004 0.004 0.004 0.004 0.004 0.004
 0.004 0.004 0.004 0.003 0.003 0.003 0.003 0.003 0.003 0.003 0.003 0.003
 0.003 0.003 0.003 0.003 0.003 0.003 0.003 0.003 0.003 0.003 0.003 0.003
 0.003 0.003 0.003 0.003 0.003 0.003 0.003 0.003 0.003 0.002 0.002 0.002
 0.002 0.002 0.002 0.002 0.002 0.002 0.002 0.002 0.002 0.002 0.002 0.002
 0.002 0.002 0.002 0.002 0.002 0.002 0.002 0.002 0.002 0.002 0.002 0.002
 0.002 0.002 0.002 0.002 0.002 0.002 0.002 0.002 0.002 0.002 0.002 0.002
 0.002 0.002 0.002 0.002]
Cumulative explained variance: [0.029 0.049 0.064 0.078 0.089 0.096 0.102 0.108 0.114 0.119 0.125 0.129
 0.134 0.139 0.143 0.148 0.152 0.156 0.16  0.164 0.168 0.172 0.176 0.18
 0.183 0.187 0.19  0.194 0.197 0.201 0.204 0.207 0.21  0.214 0.217 0.22
 0.223 0.226 0.229 0.232 0.235 0.238 0.24  0.243 0.246 0.249 0.252 0.254
 0.257 0.26  0.262 0.265 0.268 0.27  0.273 0.275 