In [None]:
# import packages
import sys
from data.process_data import clean_data, save_data
from models.train_classifier import *

import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import nltk
nltk.download(['stopwords','wordnet','punkt','averaged_perceptron_tagger'])
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [48]:
from sqlalchemy import create_engine, inspect

# Open the local SQLite database produced by the ETL step.
engine = create_engine('sqlite:///Disaster_pipedata1.db')

# Ask the inspector which tables the database contains and list them,
# one name per line, to confirm the table to read from.
inspector = inspect(engine)
table_names = inspector.get_table_names()
for table_name in table_names:
    print(table_name)

disaster_messages


In [49]:
# Read the full 'disaster_messages' table into a DataFrame and show its row count.
df = pd.read_sql_table(table_name='disaster_messages', con=engine)
len(df)

26215

In [50]:
def load_data(data_file, table_name='disaster_messages'):
    """Load messages and their category labels from a SQLite database.

    Args:
        data_file: Path of the SQLite database file; it is appended to the
            'sqlite:///' URL prefix.
        table_name: Name of the table to read. Defaults to
            'disaster_messages'.

    Returns:
        A tuple ``(X, Y, category_names)`` where ``X`` is the 'message'
        column, ``Y`` is a DataFrame of every column from position 4 onward,
        and ``category_names`` is the list of ``Y``'s column names.
    """
    engine = create_engine('sqlite:///' + data_file)
    try:
        df = pd.read_sql_table(table_name, engine)
    finally:
        # A new engine is created on every call; release its connection pool
        # so repeated calls do not accumulate open connections.
        engine.dispose()

    # Feature: the raw message text.
    X = df['message']
    # Labels: everything after the first four columns.
    # NOTE(review): assumes the first four columns are non-label metadata;
    # confirm against the ETL output schema.
    Y = df.iloc[:, 4:]
    category_names = Y.columns.tolist()
    return X, Y, category_names

In [51]:
# Smoke-test load_data on the local database; the cell displays the returned
# (X, Y, category_names) tuple.
load_data(data_file="Disaster_pipedata1.db")

(0        Weather update - a cold front from Cuba that c...
 1                  Is the Hurricane over or is it not over
 2                          Looking for someone but no name
 3        UN reports Leogane 80-90 destroyed. Only Hospi...
 4        says: west side of Haiti, rest of the country ...
                                ...                        
 26210    The training demonstrated how to enhance micro...
 26211    A suitable candidate has been selected and OCH...
 26212    Proshika, operating in Cox's Bazar municipalit...
 26213    Some 2,000 women protesting against the conduc...
 26214    A radical shift in thinking came about as a re...
 Name: message, Length: 26215, dtype: object,
        related  request  offer  aid_related  medical_help  medical_products  \
 0            1        0      0            0             0                 0   
 1            1        0      0            1             0                 0   
 2            1        0      0            0          

In [53]:
# Unpack features, labels and label names, then report their sizes
# (one count per line) followed by the list of category names.
X, y, category_names = load_data(data_file="Disaster_pipedata1.db")
print(len(X), len(y), sep="\n")
print(category_names)


26215
26215
['related', 'request', 'offer', 'aid_related', 'medical_help', 'medical_products', 'search_and_rescue', 'security', 'military', 'child_alone', 'water', 'food', 'shelter', 'clothing', 'money', 'missing_people', 'refugees', 'death', 'other_aid', 'infrastructure_related', 'transport', 'buildings', 'electricity', 'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure', 'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold', 'other_weather', 'direct_report']


In [54]:
from models.train_classifier import tokenize

In [59]:
# Baseline pipeline without any hyper-parameter search:
# token counts -> TF-IDF weighting -> multi-output random forest.
small_pipe = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier())),
])

In [57]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible. Then print the size of each part, one per line.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
print(len(X_train), len(X_test), len(y_train), len(y_test), sep="\n")

19661
6554
19661
6554


In [60]:
# Fit the baseline pipeline on the training split.
small_pipe.fit(X_train, y_train)



In [None]:
# Predict labels for the held-out test messages with the baseline pipeline.
y_pred = small_pipe.predict(X_test)

In [56]:

def build_model():
    """Assemble the text-classification pipeline wrapped in a grid search.

    Returns:
        An unfitted ``GridSearchCV`` whose estimator is the
        vectorizer -> TF-IDF -> multi-output random-forest pipeline and
        whose ``param_grid`` is the search space defined below.
    """
    # Stages: token counts -> TF-IDF weighting -> multi-output classifier.
    text_clf = Pipeline(steps=[
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ])

    # Candidate hyper-parameters; each key addresses a pipeline step's
    # parameter using the '<step>__<param>' naming scheme.
    search_space = {
        'tfidf__use_idf': (True, False),
        'tfidf__smooth_idf': [True, False],
        'vect__max_df': (0.5, 0.75, 1.0),
        'vect__max_features': (None, 5000, 10000),
        'clf__estimator__n_estimators': [50, 100],
        'clf__estimator__min_samples_split': [2, 4],
    }

    # The grid-search object itself is the final model handed to callers.
    return GridSearchCV(text_clf, param_grid=search_space)

In [None]:

# Build the (not yet fitted) grid-search model returned by build_model.
model = build_model()

In [None]:

def train(X, y, model):
    """Split the data, fit the model and print a per-category evaluation.

    Args:
        X: Input samples passed to ``train_test_split`` and the model.
        y: Label DataFrame (accessed through ``.columns`` and ``.iloc``),
            one column per category.
        model: Estimator exposing ``fit``, ``predict`` and ``get_params``.

    Returns:
        The same ``model`` object after fitting on the training split.
    """
    # train test split (25% held out, fixed seed for reproducibility)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    print(f"{len(X_train)}\n{len(X_test)}\n{len(y_train)}\n{len(y_test)}")

    # fit model with hyper parameters
    model.fit(X_train, y_train)

    # output model test results: one classification report per label column
    y_pred = model.predict(X_test)
    # Take the category names from the label frame itself instead of relying
    # on a notebook-level global, so the function works on its own and the
    # names always line up with the columns being scored.
    for i, category in enumerate(y_test.columns):
        print(f'Category: {category}')
        print(classification_report(y_test.iloc[:, i], y_pred[:, i]))

    # A fitted search object exposes the winning parameter combination;
    # report it when available, since get_params() only shows configuration.
    if hasattr(model, "best_params_"):
        print(f"Best parameters:\n{model.best_params_}")
    print(f"Model parameters:\n{model.get_params()}")
    return model

In [None]:
# Fit the model on the loaded data (prints the evaluation) and keep the
# returned fitted model for export.
trained_model = train(X,y, model)

In [None]:

def export_model(model, model_filepath):
    """Serialize a model to disk as a pickle file.

    Args:
        model: Any picklable object (here, the trained model).
        model_filepath: Destination path; the file is created or overwritten.

    Raises:
        OSError: If ``model_filepath`` cannot be opened for writing
            (for example, an empty path).
    """
    # Imported locally so the function does not depend on `pickle` leaking
    # into the namespace through a star import elsewhere.
    import pickle

    # Export model as a pickle file
    with open(model_filepath, 'wb') as file:
        pickle.dump(model, file)

In [None]:
# Persist the fitted model. An empty path cannot be opened for writing,
# so a concrete file name is used.
export_model(model=trained_model, model_filepath="classifier.pkl")

In [None]:

def run_pipeline(data_file, model_filepath="classifier.pkl"):
    """Run the whole workflow: load data, build, train and export the model.

    Args:
        data_file: Path of the SQLite database file passed to ``load_data``.
        model_filepath: Where the trained model is pickled. Defaults to
            'classifier.pkl'.
    """
    # load_data returns three values; the category names are not needed here.
    X, y, _category_names = load_data(data_file)  # run ETL pipeline
    model = build_model()  # build model pipeline
    model = train(X, y, model)  # train model pipeline
    export_model(model, model_filepath)  # save model


if __name__ == '__main__':
    # NOTE(review): inside a notebook kernel sys.argv holds the kernel's own
    # launch arguments rather than a dataset path; this guard is meant for
    # running the exported script from the command line.
    if len(sys.argv) < 2:
        sys.exit("Usage: python <script> <path-to-sqlite-database>")
    data_file = sys.argv[1]  # get filename of dataset
    run_pipeline(data_file)  # run data pipeline
