# ML Pipeline Preparation

#### 1) importing libs and creating dataframes

In [36]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import pickle

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ogzpython\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ogzpython\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ogzpython\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
engine= create_engine('sqlite:///C:\\Users\\ogzpython\\Desktop\\ml\\response_ml\\Disaster_Response_Project\\data\\dis_res.db')
# uncoment next if you need to!
# engine= create_engine('sqlite:///dis_res.db')

In [4]:
# loading Data
q = '''select * from dis_res'''
df = pd.read_sql(q,engine)
X = df['message']
y_cols = df.drop(labels=['id','message','original','genre'],axis=1).columns
y = df[y_cols]

#### 2) function to tokenization of 'message' 

In [5]:
def tokenize(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text) 
    text = word_tokenize(text) 
    text = [w for w in text if w not in stopwords.words("english")]
    text = [WordNetLemmatizer().lemmatize(w) for w in text]
    
    return text

#### 3) Building ML Pipeline

In [6]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier()))
])

#### 4) split data and train pipeline

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [8]:
pipeline.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                                                                        ccp_alpha=0.0,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                   

In [9]:
y_pred = pipeline.predict(X_test)

#### 5) Testing Results

In [10]:
print(classification_report(y_test,y_pred,target_names = y_cols))

                        precision    recall  f1-score   support

               related       0.84      0.93      0.88      4934
               request       0.84      0.50      0.63      1093
                 offer       0.00      0.00      0.00        32
           aid_related       0.75      0.70      0.72      2669
          medical_help       0.67      0.07      0.13       539
      medical_products       0.83      0.07      0.13       351
     search_and_rescue       0.56      0.05      0.09       182
              security       0.33      0.01      0.02       121
              military       0.48      0.07      0.11       215
           child_alone       0.00      0.00      0.00         0
                 water       0.91      0.33      0.49       431
                  food       0.84      0.60      0.70       693
               shelter       0.81      0.34      0.48       555
              clothing       0.50      0.06      0.10       109
                 money       0.71      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### 6) improving model with grid search  

In [31]:
parameters2 = {
        'clf__estimator__criterion':['gini','entropy'],  
        'clf__estimator__min_samples_split':[10,110],
        'clf__estimator__max_depth':[None,100,500]
              }
#run time greatly increases with these paramaters for the sake of completion of the project skipped to less demanding parameters

In [25]:
parameters = {
        'clf__estimator__n_estimators': [50]
        }

In [26]:
cv = GridSearchCV(pipeline, param_grid=parameters)

In [27]:
best_model = cv.fit(X_train,y_train)

In [29]:
y_pred = best_model.predict(X_test)

In [30]:
print(classification_report(y_test,y_pred,target_names = y_cols))

                        precision    recall  f1-score   support

               related       0.84      0.93      0.88      4934
               request       0.83      0.48      0.61      1093
                 offer       0.00      0.00      0.00        32
           aid_related       0.76      0.68      0.72      2669
          medical_help       0.67      0.08      0.15       539
      medical_products       0.79      0.07      0.14       351
     search_and_rescue       0.64      0.05      0.09       182
              security       0.25      0.01      0.02       121
              military       0.55      0.08      0.15       215
           child_alone       0.00      0.00      0.00         0
                 water       0.87      0.35      0.50       431
                  food       0.83      0.59      0.69       693
               shelter       0.80      0.37      0.50       555
              clothing       0.62      0.09      0.16       109
                 money       0.75      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### 7) Creating a Pickle dump

In [48]:
filename = r'C:\Users\ogzpython\Desktop\ml\pkls\dis_res\model.pkl'

In [47]:
pickle.dump(best_model,open(filename,'wb'))

In [40]:
# load a model
# loaded_model= pickle.load(open(filename,'rb'))
# result = loaded_model.score(X_test,Y_test)
# print(result)