# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [2]:
# import libraries
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import sqlite3
import pickle

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# load data from database
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql_table('DisasterResponse', engine)

df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 2. Write a tokenization function to process your text data

In [4]:
def tokenize(text):
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).strip()
        clean_tokens.append(clean_tok)
    return clean_tokens

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [5]:
# def X and y
X = df['message']
y = df[['related', 'request', 'offer', 'aid_related', 'medical_help',
        'medical_products', 'search_and_rescue', 'security', 'military',
        'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
        'missing_people', 'refugees', 'death', 'other_aid', 'infrastructure_related',
        'transport', 'buildings', 'electricity', 'tools', 'hospitals', 'shops',
        'aid_centers', 'other_infrastructure', 'weather_related', 'floods',
        'storm', 'fire', 'earthquake', 'cold', 'other_weather', 'direct_report']]

#define pipeline stage
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
rf_classifier = RandomForestClassifier()

#define pipeline to data
pipeline = Pipeline([
    ('vect', count_vect),
    ('tfidf', tfidf_transformer),
    ('clf', MultiOutputClassifier(rf_classifier))
])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...oob_score=False, random_state=None, verbose=0,
            warm_start=False),
           n_jobs=1))])

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [7]:
y_pred = pipeline.predict(X_test)

for i, col in enumerate(y_test.columns):
    print(f'Category: {col}\n')
    print(classification_report(y_test.iloc[:, i], y_pred[:, i]))

Category: related

             precision    recall  f1-score   support

          0       0.62      0.36      0.45      1510
          1       0.83      0.93      0.88      5000
          2       0.59      0.59      0.59        44

avg / total       0.78      0.80      0.78      6554

Category: request

             precision    recall  f1-score   support

          0       0.89      0.98      0.93      5444
          1       0.84      0.40      0.54      1110

avg / total       0.88      0.89      0.87      6554

Category: offer

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      6530
          1       0.00      0.00      0.00        24

avg / total       0.99      1.00      0.99      6554

Category: aid_related

             precision    recall  f1-score   support

          0       0.73      0.87      0.79      3824
          1       0.75      0.55      0.63      2730

avg / total       0.74      0.74      0.73      6554

Category:

  'precision', 'predicted', average, warn_for)


### 6. Improve your model
Use grid search to find better parameters. 

In [8]:
# Split the data into a subset for grid search
X_subset, _, y_subset, _ = train_test_split(X_train, y_train, test_size=0.01, random_state=42)

params = {
    'vect__max_df': [0.5, 1.0],
    'clf__estimator': [RandomForestClassifier(n_estimators=50), RandomForestClassifier(n_estimators=100)],
}

#params additional     'tfidf__use_idf': [True, False],    'clf__estimator__n_estimators': [50, 100],

grid_search = GridSearchCV(pipeline, param_grid=params, cv=2, n_jobs=-1)
grid_search.fit(X_subset, y_subset)

print(grid_search.best_params_)

{'clf__estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'vect__max_df': 1.0}


### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

count_vect = CountVectorizer(max_df=1.0)  # Set max_df to 1.0
tfidf_transformer = TfidfTransformer()
rf_classifier = RandomForestClassifier(n_estimators=100)  # Set n_estimators to 100

# Build the pipeline with the updated settings
pipeline2 = Pipeline([
    ('vect', count_vect),
    ('tfidf', tfidf_transformer),
    ('clf', rf_classifier),
])

# Get the best tuned model
best_model = pipeline2.fit(X_train, y_train)

# Make predictions on the test data
y_pred = y_pred.astype(int)
y_pred = best_model.predict(X_test)

# Calculate accuracy, precision, and recall
# accuracy = accuracy_score(y_test, y_pred)
# precision = precision_score(y_test, y_pred, average='weighted')
# recall = recall_score(y_test, y_pred, average='weighted')

# print("Accuracy:", accuracy)
# print("Precision:", precision)
# print("Recall:", recall)

report = classification_report(y_test, y_pred)
print(report)

ValueError: Unknown label type: (       related  request  offer  aid_related  medical_help  medical_products  \
20134        0        0      0            0             0                 0   
19356        0        0      0            0             0                 0   
23039        1        0      0            1             0                 0   
22644        1        0      0            1             0                 1   
11105        1        1      0            1             1                 1   
14640        1        0      0            0             0                 0   
20593        0        0      0            0             0                 0   
16829        1        0      0            1             0                 0   
2710         0        0      0            0             0                 0   
11243        1        0      0            0             0                 0   
12298        1        0      0            1             0                 0   
6857         1        0      0            0             0                 0   
20456        1        0      0            1             0                 0   
12062        0        0      0            0             0                 0   
26051        1        0      0            0             0                 0   
2638         0        0      0            0             0                 0   
12931        1        0      0            0             0                 0   
4478         0        0      0            0             0                 0   
25689        1        0      0            1             1                 0   
5127         0        0      0            0             0                 0   
23122        0        0      0            0             0                 0   
20008        1        0      0            0             0                 0   
13655        1        0      0            1             1                 0   
7445         1        0      0            1             0                 0   
21008        1        0      0            1             0                 0   
21124        0        0      0            0             0                 0   
12807        1        0      0            0             0                 0   
3758         1        1      0            1             0                 0   
14670        1        0      0            1             1                 0   
21977        1        0      0            0             0                 0   
...        ...      ...    ...          ...           ...               ...   
3453         0        0      0            0             0                 0   
11418        1        0      0            0             0                 0   
17           0        0      0            0             0                 0   
3092         1        0      0            0             0                 0   
17500        1        0      0            1             0                 0   
1025         0        0      0            0             0                 0   
9222         1        1      0            1             0                 0   
26164        0        0      0            0             0                 0   
24156        0        0      0            0             0                 0   
21643        1        0      0            1             0                 0   
6100         0        0      0            0             0                 0   
13743        1        0      0            0             0                 0   
1540         1        0      0            0             0                 0   
23125        1        1      0            1             1                 1   
8386         1        1      0            1             0                 0   
3373         0        0      0            0             0                 0   
15687        1        0      0            1             0                 0   
20779        0        0      0            0             0                 0   
21315        1        1      0            1             0                 0   
10156        1        0      0            1             0                 0   
24363        0        0      0            0             0                 0   
2716         1        0      0            1             0                 0   
3900         1        0      0            0             0                 0   
19927        1        0      0            1             0                 0   
9846         1        0      0            0             0                 0   
4589         1        1      0            0             0                 0   
6267         1        0      0            0             0                 0   
25187        1        0      0            0             0                 0   
8517         0        0      0            0             0                 0   
14501        1        0      0            1             1                 0   

       search_and_rescue  security  military  child_alone      ...        \
20134                  0         0         0            0      ...         
19356                  0         0         0            0      ...         
23039                  0         0         1            0      ...         
22644                  0         0         0            0      ...         
11105                  0         0         0            0      ...         
14640                  0         0         0            0      ...         
20593                  0         0         0            0      ...         
16829                  0         0         0            0      ...         
2710                   0         0         0            0      ...         
11243                  0         0         0            0      ...         
12298                  0         0         0            0      ...         
6857                   0         0         0            0      ...         
20456                  0         0         0            0      ...         
12062                  0         0         0            0      ...         
26051                  0         0         0            0      ...         
2638                   0         0         0            0      ...         
12931                  0         0         0            0      ...         
4478                   0         0         0            0      ...         
25689                  0         0         0            0      ...         
5127                   0         0         0            0      ...         
23122                  0         0         0            0      ...         
20008                  0         0         0            0      ...         
13655                  0         0         0            0      ...         
7445                   0         0         0            0      ...         
21008                  0         1         0            0      ...         
21124                  0         0         0            0      ...         
12807                  0         0         0            0      ...         
3758                   0         0         0            0      ...         
14670                  0         0         1            0      ...         
21977                  0         0         0            0      ...         
...                  ...       ...       ...          ...      ...         
3453                   0         0         0            0      ...         
11418                  0         0         0            0      ...         
17                     0         0         0            0      ...         
3092                   0         0         0            0      ...         
17500                  0         0         0            0      ...         
1025                   0         0         0            0      ...         
9222                   0         0         0            0      ...         
26164                  0         0         0            0      ...         
24156                  0         0         0            0      ...         
21643                  0         0         0            0      ...         
6100                   0         0         0            0      ...         
13743                  0         0         0            0      ...         
1540                   0         0         0            0      ...         
23125                  0         0         0            0      ...         
8386                   0         0         0            0      ...         
3373                   0         0         0            0      ...         
15687                  0         0         0            0      ...         
20779                  0         0         0            0      ...         
21315                  0         0         0            0      ...         
10156                  0         0         0            0      ...         
24363                  0         0         0            0      ...         
2716                   0         0         0            0      ...         
3900                   0         0         0            0      ...         
19927                  0         0         0            0      ...         
9846                   0         0         0            0      ...         
4589                   0         0         0            0      ...         
6267                   0         0         0            0      ...         
25187                  0         0         0            0      ...         
8517                   0         0         0            0      ...         
14501                  0         0         0            0      ...         

       aid_centers  other_infrastructure  weather_related  floods  storm  \
20134            0                     0                0       0      0   
19356            0                     0                0       0      0   
23039            0                     0                0       0      0   
22644            0                     0                1       1      0   
11105            0                     0                1       1      0   
14640            0                     0                0       0      0   
20593            0                     0                0       0      0   
16829            0                     0                1       0      0   
2710             0                     0                0       0      0   
11243            0                     0                1       0      1   
12298            0                     0                0       0      0   
6857             0                     0                0       0      0   
20456            0                     0                1       0      0   
12062            0                     0                0       0      0   
26051            0                     0                0       0      0   
2638             0                     0                0       0      0   
12931            0                     0                1       0      1   
4478             0                     0                0       0      0   
25689            0                     0                0       0      0   
5127             0                     0                0       0      0   
23122            0                     0                0       0      0   
20008            0                     0                0       0      0   
13655            0                     0                1       0      0   
7445             0                     0                0       0      0   
21008            0                     0                0       0      0   
21124            0                     0                0       0      0   
12807            0                     0                0       0      0   
3758             0                     0                0       0      0   
14670            0                     0                1       0      0   
21977            0                     1                0       0      0   
...            ...                   ...              ...     ...    ...   
3453             0                     0                0       0      0   
11418            0                     0                1       0      0   
17               0                     0                0       0      0   
3092             0                     0                0       0      0   
17500            0                     0                1       1      1   
1025             0                     0                0       0      0   
9222             0                     0                1       0      0   
26164            0                     0                0       0      0   
24156            0                     0                0       0      0   
21643            0                     0                0       0      0   
6100             0                     0                0       0      0   
13743            0                     0                1       0      1   
1540             0                     0                0       0      0   
23125            0                     0                0       0      0   
8386             0                     0                0       0      0   
3373             0                     0                0       0      0   
15687            0                     0                1       1      0   
20779            0                     0                0       0      0   
21315            0                     0                0       0      0   
10156            0                     0                1       0      0   
24363            0                     0                0       0      0   
2716             0                     0                0       0      0   
3900             0                     0                0       0      0   
19927            0                     0                0       0      0   
9846             0                     1                1       0      0   
4589             0                     0                0       0      0   
6267             0                     0                0       0      0   
25187            0                     0                0       0      0   
8517             0                     0                0       0      0   
14501            0                     0                0       0      0   

       fire  earthquake  cold  other_weather  direct_report  
20134     0           0     0              0              0  
19356     0           0     0              0              0  
23039     0           0     0              0              1  
22644     0           0     0              0              0  
11105     0           0     0              0              0  
14640     0           0     0              0              0  
20593     0           0     0              0              0  
16829     0           0     1              0              0  
2710      0           0     0              0              0  
11243     0           1     0              0              1  
12298     0           0     0              0              0  
6857      0           0     0              0              1  
20456     0           0     0              1              1  
12062     0           0     0              0              0  
26051     0           0     0              0              0  
2638      0           0     0              0              0  
12931     0           0     0              0              1  
4478      0           0     0              0              0  
25689     0           0     0              0              0  
5127      0           0     0              0              0  
23122     0           0     0              0              0  
20008     0           0     0              0              0  
13655     0           0     0              1              0  
7445      0           0     0              0              1  
21008     0           0     0              0              0  
21124     0           0     0              0              0  
12807     0           0     0              0              0  
3758      0           0     0              0              1  
14670     0           0     0              1              0  
21977     0           0     0              0              0  
...     ...         ...   ...            ...            ...  
3453      0           0     0              0              0  
11418     0           1     0              0              1  
17        0           0     0              0              0  
3092      0           0     0              0              0  
17500     1           1     1              1              0  
1025      0           0     0              0              0  
9222      0           1     0              1              1  
26164     0           0     0              0              0  
24156     0           0     0              0              0  
21643     0           0     0              0              0  
6100      0           0     0              0              0  
13743     0           0     0              0              0  
1540      0           0     0              0              0  
23125     0           0     0              0              0  
8386      0           0     0              0              1  
3373      0           0     0              0              0  
15687     0           0     0              1              0  
20779     0           0     0              0              0  
21315     0           0     0              0              0  
10156     0           1     0              0              0  
24363     0           0     0              0              0  
2716      0           0     0              0              1  
3900      0           0     0              0              0  
19927     0           0     0              0              0  
9846      0           1     0              0              0  
4589      0           0     0              0              0  
6267      0           0     0              0              0  
25187     0           0     0              0              0  
8517      0           0     0              0              0  
14501     0           0     0              0              0  

[6554 rows x 36 columns], array([[ 2.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.]]))

In [22]:
import numpy as np

print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)
print("Data type of y_train:", y_train.values.dtype)
print("Data type of y_test:", y_test.values.dtype)

y_pred = y_pred.astype(int)

unique_labels_test = np.unique(y_test)
unique_labels_pred = np.unique(y_pred)

print("Unique labels in y_test:", unique_labels_test)
print("Unique labels in y_pred:", unique_labels_pred)

Shape of y_train: (19662, 36)
Shape of y_test: (6554, 36)
Data type of y_train: int64
Data type of y_test: int64
Unique labels in y_test: [0 1 2]
Unique labels in y_pred: [0 1 2]


### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from gensim.models import Word2Vec

# Define Word2Vec model and vectorizer
word2vec_model = Word2Vec(X, size=100, window=5, min_count=1, workers=4)
vectorizer = CountVectorizer(analyzer=lambda x: x)

# Transform text data into word embeddings
X_train_embeddings = [word2vec_model.wv[vectorizer.transform([text]).indices] for text in X_train]
X_test_embeddings = [word2vec_model.wv[vectorizer.transform([text]).indices] for text in X_test]

# Define SVM classifier
svm_classifier = SVC()

# Define pipeline to data
pipeline3 = Pipeline([
    ('clf', MultiOutputClassifier(svm_classifier))
])

# Fit the pipeline to the transformed data
pipeline3.fit(X_train_embeddings, y_train)

### 9. Export your model as a pickle file

In [24]:
# create a filename for the pickle file
filename = 'model.pkl'

# save the model to disk using pickle
with open(filename, 'wb') as file:
    pickle.dump(best_model, file)

### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.

In [None]:
# import packages
import sys
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import sqlite3
import pickle

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score

def load_data(data_file):
    # read in file
    df = pd.read_csv(data_file)

    # clean data
    def tokenize(text):
        text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
        lemmatizer = WordNetLemmatizer()
        tokens = word_tokenize(text)
        clean_tokens = []
        for tok in tokens:
            clean_tok = lemmatizer.lemmatize(tok).strip()
            clean_tokens.append(clean_tok)
        return clean_tokens

    # load to database
    engine = create_engine('sqlite:///DisasterResponse.db')
    df.to_sql('DisasterResponse', engine, index=False)


    # define features and label arrays
    X = df['message']
    y = df[['related', 'request', 'offer', 'aid_related', 'medical_help',
        'medical_products', 'search_and_rescue', 'security', 'military',
        'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
        'missing_people', 'refugees', 'death', 'other_aid', 'infrastructure_related',
        'transport', 'buildings', 'electricity', 'tools', 'hospitals', 'shops',
        'aid_centers', 'other_infrastructure', 'weather_related', 'floods',
        'storm', 'fire', 'earthquake', 'cold', 'other_weather', 'direct_report']]

    return X, y


def build_model():
    # text processing and model pipeline
    count_vect = CountVectorizer(max_df=1.0)  # Set max_df to 1.0
    tfidf_transformer = TfidfTransformer()
    rf_classifier = RandomForestClassifier(n_estimators=100)  # Set n_estimators to 100
    
    model_pipeline = Pipeline([
    ('vect', count_vect),
    ('tfidf', tfidf_transformer),
    ('clf', MultiOutputClassifier(rf_classifier))
    ])
    
    # define parameters for GridSearchCV
    params = {
    'vect__max_df': [0.5, 1.0],
    'clf__estimator': [RandomForestClassifier(n_estimators=50), RandomForestClassifier(n_estimators=100)],
    }
    

    # create gridsearch object and return as final model pipeline
    grid_search = GridSearchCV(pipeline, param_grid=params, cv=2, n_jobs=-1)
    grid_search.fit(X_subset, y_subset)

    return model_pipeline


def train(X, y, model):
    # train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # fit model
    model = model_pipeline.fit(X_train, y_train)

    # output model test results
    y_pred = y_pred.astype(int)
    y_pred = model_pipeline.predict(X_test)
    
    report = classification_report(y_test, y_pred)
    print(report)
    
    return model


def export_model(model):
    # Export model as a pickle file
    # create a filename for the pickle file
    filename = 'model.pkl'
    
    # save the model to disk using pickle
    with open(filename, 'wb') as file:
        pickle.dump(model, file)


def run_pipeline(data_file):
    X, y = load_data(data_file)  # run ETL pipeline
    model = build_model()  # build model pipeline
    model = train(X, y, model)  # train model pipeline
    export_model(model)  # save model


if __name__ == '__main__':
    data_file = sys.argv[1]  # get filename of dataset
    run_pipeline(data_file)  # run data pipeline