In [95]:
from time import time

import nltk
from nltk.corpus import stopwords

nltk.download('words')
nltk.download('stopwords')
import pandas as pd
import numpy as np

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

from data_utils import DataUtils
from model_utils import ModelUtils
from nlp_utils import NLPUtils

database_filepath = 'data/DisasterResponse.db'
model_filepath = 'models/classifier.pkl'

[nltk_data] Downloading package words to /home/mrugeles/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mrugeles/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [96]:
stopwords_list = stopwords.words("english")
stopwords_list += ['null']

In [97]:
modelUtils = ModelUtils()
dataUtils = DataUtils()
nlpUtils = NLPUtils()
X = None
Y = None
category_names = None

In [98]:
X, Y, category_names = dataUtils.load_db_data(database_filepath)

sqlite:///data/DisasterResponse.db


In [99]:
X = nlpUtils.create_vector_model(X)

TfidfTransformer features: (26216, 29228)


In [100]:
X = X.head()

In [101]:
from tqdm import tqdm
tqdm.pandas(desc="feature_spellcheck")

matrix = X
start = time()
columns_df = pd.DataFrame(list(matrix.columns), columns = ['feature'])
columns_df['feature_spellcheck'] = columns_df['feature'].progress_apply(lambda word: nlpUtils.spellcheck(word, 0.7))
print(f'feature_spellcheck time: {time() - start}')


  from pandas import Panel
feature_spellcheck: 100%|██████████| 29228/29228 [48:28<00:00, 10.05it/s]  

feature_spellcheck time: 2908.1207251548767





In [102]:
columns_df[columns_df['feature'].str.contains('null')]

Unnamed: 0,feature,feature_spellcheck
18103,,
18104,nullah,-1
18105,nullification,nullification


In [113]:
columns_df.loc[
    (columns_df['feature_spellcheck'] == "-1") |
    (columns_df['feature_spellcheck'] == "null") |
    (columns_df['feature_spellcheck'] == "nan")
]

Unnamed: 0,feature,feature_spellcheck
0,aa,-1
1,aaa,-1
2,aaaaaaaa,-1
3,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa...,-1
4,aaaaand,-1
...,...,...
29221,zwkmjkkz,-1
29223,zxje,-1
29224,zxlo,-1
29226,zz,-1


In [94]:
start = time()
drop_columns = columns_df.loc[
    (columns_df['feature_spellcheck'] == "-1") |
    (columns_df['feature_spellcheck'] == "null") |
    (columns_df['feature_spellcheck'] == "nan")
]['feature'].values
matrix = matrix.drop(drop_columns, axis = 1)
print(f'drop_columns time: {time() - start}')

KeyError: "['aa' 'aaa' 'aaaaaaaa' ... 'zxlo' 'zz' 'zzz'] not found in axis"

In [38]:
array_cols = matrix.columns.values.astype(str)
array_cols.dtype

dtype('<U18')

In [39]:
start = time()
renamed_columns = dict(columns_df.loc[columns_df['feature_spellcheck'] != "-1"].to_dict('split')['data'])
matrix = matrix.rename(columns = renamed_columns)
print(f'renamed_columns time: {time() - start}')

renamed_columns time: 0.03940320014953613


In [45]:
matrix.columns.isnull()
matrix.columns[matrix.columns.isnull()]

Index([], dtype='object')

In [47]:
matrix = nlpUtils.drop_duplicated(matrix)

100%|██████████| 6219/6219 [00:04<00:00, 1331.81it/s]


In [48]:
matrix.columns[matrix.columns.isnull()]

Index([], dtype='object')

In [49]:
matrix = matrix.reindex(sorted(matrix.columns), axis=1)

In [57]:
matrix.columns[matrix.columns.isnull()]

Index([], dtype='object')

In [63]:
m_columns = matrix.columns.values
m_columns

array(['a', 'aah', 'aaron', ..., 'zone', 'zu', 'zubov'], dtype=object)

In [66]:
m_columns[6915:6925]

array(['nowhere', 'noxious', 'nuclear', 'null', 'nullification', 'numb',
       'number', 'numerous', 'nur', 'nurse'], dtype=object)

In [67]:
m_columns[6700:6710]

array(['namely', 'names', 'nan', 'nancy', 'nap', 'naphthol', 'napkin',
       'napkins', 'narrow', 'narrowed'], dtype=object)

In [58]:
model_features = pd.DataFrame(matrix.columns.values, columns = ['feature'])

In [59]:
model_features.columns[model_features.columns.isnull()]

Index([], dtype='object')

In [60]:
model_features.to_csv('model_features.csv', index = False)

In [61]:
model_features = pd.read_csv('model_features.csv')

In [62]:
model_features.loc[model_features['feature'].isna()]

Unnamed: 0,feature
6702,
6918,
