In [1]:
import sys
import sys
import pandas as pd
import numpy as np
import pickle
from sqlalchemy import create_engine
import nltk
nltk.download(['stopwords','wordnet','punkt','averaged_perceptron_tagger'])
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\manor\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\manor\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\manor\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [8]:
# Load the cleaned messages table from the local SQLite database.
engine = create_engine('sqlite:///DisasterResponse_try3.db')
df = pd.read_sql_table("disaster_messages", engine)
# Work on a 10% sample (presumably to keep the grid search below manageable).
# The fixed seed makes the sample -- and therefore every downstream split,
# fit and report -- identical across kernel restarts.
df = df.sample(frac=0.1, random_state=42)
# Model input: the raw message text.
X = df['message']
# Targets: every column from position 4 onward.
Y = df.iloc[:, 4:]
category_names = Y.columns.tolist()

In [9]:
# Show the distinct values present in the `related` column of the sample.
df.related.unique()

array([1, 0], dtype=int64)

In [10]:
# Number of rows left after down-sampling.
print(len(df))

2622


In [11]:
# Peek at the message Series that feeds the vectorizer.
print(X)

23538    Facilitation of voluntary repatriation of Buru...
2217     Children and elderly are still hurt but haven'...
10492    Dozed off earlier and awoke about 20 minutes a...
20370    He said that, since the devastation, despite t...
20499    The next steps include forming a convention of...
                               ...                        
24112    Russia, the European Union, the U.S., Turkey a...
9308     some important previon for this period of cycl...
20512    Officials here worry that Sindh because of its...
17101    Also, heavy rains during the weekend caused se...
12586    MDuross @lucyintheusa @kaybellor hubby making ...
Name: message, Length: 2622, dtype: object


In [12]:
def tokenize(text):
    """Split ``text`` into lower-cased, lemmatized word tokens.

    Args:
        text: The raw message string to tokenize.

    Returns:
        A list of tokens, one per token produced by ``word_tokenize``, each
        lower-cased, stripped of surrounding whitespace and then lemmatized.
    """
    lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    for tok in word_tokenize(text):
        # Normalize case *before* lemmatizing: WordNet lookups are
        # case-sensitive, so a capitalized token such as "Houses" would
        # otherwise come back unchanged and only then be lower-cased.
        normalized = tok.lower().strip()
        clean_tokens.append(lemmatizer.lemmatize(normalized))
    return clean_tokens

In [13]:

def build_model():
    """Build the text-classification pipeline wrapped in a grid search.

    The pipeline chains a bag-of-words ``CountVectorizer`` (using the
    notebook's ``tokenize``), a ``TfidfTransformer`` and a
    ``MultiOutputClassifier`` around a ``RandomForestClassifier``.

    Returns:
        An unfitted ``GridSearchCV`` over the pipeline; call ``fit`` on it to
        run the search.
    """
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        # Seed the forest so repeated runs rank the candidates identically.
        ('clf', MultiOutputClassifier(RandomForestClassifier(random_state=42)))
    ])

    # Hyper-parameters searched regardless of the IDF setting.
    shared = {
        'vect__max_df': (0.5, 0.75, 1.0),
        'vect__max_features': (None, 5000, 10000),
        'clf__estimator__n_estimators': [50, 100],
        'clf__estimator__min_samples_split': [2, 4],
    }
    # `smooth_idf` only matters when IDF weighting is on, so it is varied only
    # in that sub-grid. This covers the same distinct models with 108
    # candidates instead of 144.
    parameters = [
        {**shared, 'tfidf__use_idf': [True], 'tfidf__smooth_idf': [True, False]},
        {**shared, 'tfidf__use_idf': [False]},
    ]
    cv = GridSearchCV(pipeline, param_grid=parameters)

    return cv

In [14]:
# Hold out 25% of the sample for evaluation; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)
# Unfitted grid-search estimator; fitting happens in the next cell.
model = build_model()

In [15]:
# Run the grid search on the training split (`model` is the GridSearchCV
# object returned by build_model()).
model.fit(X_train, y_train)
# Predict every category for the held-out messages.
y_pred = model.predict(X_test)



In [None]:
# Dump the full parameter dict of the grid-search estimator.
print(model.get_params())

In [None]:
# List only the parameter names (useful for spelling param_grid keys).
print(model.get_params().keys())

In [None]:
# Print a classification report for each category column on the held-out split.
Y_pred = model.predict(X_test)
for i, category in enumerate(category_names):
    print(f'Category: {category}')
    # The split cell names the held-out labels `y_test` (lower-case); the
    # previously used `Y_test` is never defined and raised NameError.
    print(classification_report(y_test.iloc[:, i], Y_pred[:, i]))

In [None]:

def save_model(model, model_filepath):
    """Serialize ``model`` to ``model_filepath`` with pickle.

    Args:
        model: The object to persist.
        model_filepath: Destination path; opened in binary write mode, so an
            existing file at that path is overwritten.
    """
    with open(model_filepath, 'wb') as pickle_out:
        pickle.dump(model, pickle_out)