In [1]:
import sys
import sys
import pandas as pd
import numpy as np
import pickle
from sqlalchemy import create_engine
import nltk
nltk.download(['stopwords','wordnet','punkt','averaged_perceptron_tagger'])
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\manor\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\manor\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\manor\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [8]:
# load data
engine = create_engine('sqlite:///DisasterResponse_try3.db')
df = pd.read_sql_table("disaster_messages", engine)
df = df.sample(frac=0.1)
X = df['message']
Y = df.iloc[:, 4:]
category_names = Y.columns.tolist()
# X, Y, category_names

In [9]:
df.related.unique()

array([1, 0], dtype=int64)

In [10]:
print(len(df))

2622


In [11]:
print(X)

23538    Facilitation of voluntary repatriation of Buru...
2217     Children and elderly are still hurt but haven'...
10492    Dozed off earlier and awoke about 20 minutes a...
20370    He said that, since the devastation, despite t...
20499    The next steps include forming a convention of...
                               ...                        
24112    Russia, the European Union, the U.S., Turkey a...
9308     some important previon for this period of cycl...
20512    Officials here worry that Sindh because of its...
17101    Also, heavy rains during the weekend caused se...
12586    MDuross @lucyintheusa @kaybellor hubby making ...
Name: message, Length: 2622, dtype: object


In [12]:
def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)
    return clean_tokens

In [13]:

def build_model():
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])

    parameters = {
        'tfidf__use_idf': (True, False),
        'tfidf__smooth_idf': [True, False],
        'vect__max_df': (0.5, 0.75, 1.0),
        'vect__max_features': (None, 5000, 10000),
        'clf__estimator__n_estimators': [50, 100],
        'clf__estimator__min_samples_split': [2, 4]
    }
    cv = GridSearchCV(pipeline, param_grid=parameters)

    return cv

In [14]:
model = build_model()
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

In [15]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)



In [16]:
print(model.get_params())

{'cv': None, 'error_score': nan, 'estimator__memory': None, 'estimator__steps': [('vect', CountVectorizer(tokenizer=<function tokenize at 0x00000240367E1A60>)), ('tfidf', TfidfTransformer()), ('clf', MultiOutputClassifier(estimator=RandomForestClassifier()))], 'estimator__verbose': False, 'estimator__vect': CountVectorizer(tokenizer=<function tokenize at 0x00000240367E1A60>), 'estimator__tfidf': TfidfTransformer(), 'estimator__clf': MultiOutputClassifier(estimator=RandomForestClassifier()), 'estimator__vect__analyzer': 'word', 'estimator__vect__binary': False, 'estimator__vect__decode_error': 'strict', 'estimator__vect__dtype': <class 'numpy.int64'>, 'estimator__vect__encoding': 'utf-8', 'estimator__vect__input': 'content', 'estimator__vect__lowercase': True, 'estimator__vect__max_df': 1.0, 'estimator__vect__max_features': None, 'estimator__vect__min_df': 1, 'estimator__vect__ngram_range': (1, 1), 'estimator__vect__preprocessor': None, 'estimator__vect__stop_words': None, 'estimator__v

In [17]:
print(model.get_params().keys())

dict_keys(['cv', 'error_score', 'estimator__memory', 'estimator__steps', 'estimator__verbose', 'estimator__vect', 'estimator__tfidf', 'estimator__clf', 'estimator__vect__analyzer', 'estimator__vect__binary', 'estimator__vect__decode_error', 'estimator__vect__dtype', 'estimator__vect__encoding', 'estimator__vect__input', 'estimator__vect__lowercase', 'estimator__vect__max_df', 'estimator__vect__max_features', 'estimator__vect__min_df', 'estimator__vect__ngram_range', 'estimator__vect__preprocessor', 'estimator__vect__stop_words', 'estimator__vect__strip_accents', 'estimator__vect__token_pattern', 'estimator__vect__tokenizer', 'estimator__vect__vocabulary', 'estimator__tfidf__norm', 'estimator__tfidf__smooth_idf', 'estimator__tfidf__sublinear_tf', 'estimator__tfidf__use_idf', 'estimator__clf__estimator__bootstrap', 'estimator__clf__estimator__ccp_alpha', 'estimator__clf__estimator__class_weight', 'estimator__clf__estimator__criterion', 'estimator__clf__estimator__max_depth', 'estimator__

In [18]:
y_pred = model.predict(X_test)
for i, category in enumerate(category_names):
    print(f'Category: {category}')
    print(classification_report(y_test.iloc[:, i], y_pred[:, i]))

Category: related
              precision    recall  f1-score   support

           0       0.72      0.18      0.29       168
           1       0.78      0.98      0.86       488

    accuracy                           0.77       656
   macro avg       0.75      0.58      0.58       656
weighted avg       0.76      0.77      0.72       656

Category: request
              precision    recall  f1-score   support

           0       0.88      1.00      0.94       553
           1       0.97      0.30      0.46       103

    accuracy                           0.89       656
   macro avg       0.93      0.65      0.70       656
weighted avg       0.90      0.89      0.86       656

Category: offer
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       653
           1       0.00      0.00      0.00         3

    accuracy                           1.00       656
   macro avg       0.50      0.50      0.50       656
weighted avg       0.9

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       642
           1       0.00      0.00      0.00        14

    accuracy                           0.98       656
   macro avg       0.49      0.50      0.49       656
weighted avg       0.96      0.98      0.97       656

Category: tools
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       650
           1       0.00      0.00      0.00         6

    accuracy                           0.99       656
   macro avg       0.50      0.50      0.50       656
weighted avg       0.98      0.99      0.99       656

Category: hospitals
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       647
           1       0.00      0.00      0.00         9

    accuracy                           0.99       656
   macro avg       0.49      0.50      0.50       656
weighted avg       0.97      0.99     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [19]:

def save_model(model, model_filepath):
    with open(model_filepath, 'wb') as file:
        pickle.dump(model, file)

In [22]:
save_model(model=model)

TypeError: save_model() missing 1 required positional argument: 'model_filepath'