### 1. Import libraries and load data from database.

In [1]:
# import libraries
#https://view6914b2f4-3001.udacity-student-workspaces.com/

import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import re

import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer , TfidfTransformer
from sklearn.metrics import confusion_matrix, average_precision_score ,roc_auc_score, accuracy_score,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# load data from database
engine = create_engine('sqlite:///disaster_messages.db')
df = pd.read_sql_table('disaster_messages', con=engine)
df.drop(columns=['child_alone'], inplace=True) # drop child_alone because it only has 0 values
df=df.head(4000)
X = df['message']
y = df.iloc[:,4:]
y['related'].replace(2, 1, inplace=True)  # to fix for multiple classes needed for ROC_AUC parameter
pd.set_option('display.max_colwidth', -1)

#Classes are very unbalanced!
#y.sum()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


### 2. Process data

In [4]:
def tokenize(text):
    
    text = re.sub(r"[^a-zA-Z0-9]", ' ', text.lower())
    tokens = word_tokenize(text)  #tokenize
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        if tok not in stopwords.words("english"):                  # take out stop-words
            clean_tok = lemmatizer.lemmatize(tok).lower().strip()  # lematize, normalize
            clean_tokens.append(clean_tok)
        
        

    return clean_tokens
    pass

for message in X[:1]:
    tokens = tokenize(message)
    print(message)
    print(tokens, '\n')

Weather update - a cold front from Cuba that could pass over Haiti
['weather', 'update', 'cold', 'front', 'cuba', 'could', 'pas', 'haiti'] 



### 3. Train Test Split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

### 3. Build a machine learning pipeline

In [5]:
# pipeline with RF

pipeline_rf = Pipeline([
    ('vect',CountVectorizer(tokenizer=tokenize)),
    ('tfidef', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier(n_estimators=10))) #RandomForestClassifier()  SVC()
])

pipeline_rf.fit(X_train, y_train)

y_pred =pipeline_rf.predict(X_test)
#y_pred

In [6]:
for i in range(0, len(y_test.columns)):
    cr=classification_report(y_test.iloc[:,i], y_pred[:,i])
    #print(y_test.columns[i],cr)

  'precision', 'predicted', average, warn_for)


### 5. Test your model (f1 score, precision and recall )

In [None]:
def test_function(pred):
    target_names = y.columns
    pred=pred[:,1]


    category=[]
    accuracy=[]
    pos_precis=[]
    neg_precis=[]
    pos_recall=[]
    neg_recall=[]
    f_one_pos=[]
    f_one_neg=[]
    roc_auc=[]


    for i in range(0, len(y_test.columns)):
        cr=classification_report(y_test.iloc[:,i],pred ) #avg / total

        neg=cr.partition(' 0 ')[2]
        pos=cr.partition(' 1 ')[2]

        tn=target_names[i]   
        category.append(tn)

        accuracy.append(accuracy_score(y_test.iloc[:,i], pred))

        pos_precis.append(pos.split()[0])
        neg_precis.append(neg.split()[1])

        pos_recall.append(pos.split()[1])
        neg_recall.append(neg.split()[1])

        f_one_pos.append(pos.split()[2])
        f_one_neg.append(neg.split()[2])

        try:
            roc=roc_auc_score(y_test.iloc[:,i], y_pred[:,i])
        except:
            roc=0

        roc_auc.append(roc)

    performance = pd.DataFrame(list(zip(category, accuracy, pos_precis, neg_precis, pos_recall, neg_recall, f_one_pos, f_one_neg,roc_auc)), \
                               columns =['category','accuracy', 'precision_pos','precision_neg','recall_pos','recall_neg','f_one_pos','f_one_neg', 'roc_auc']) 

    cols = performance.columns.drop('category')
    performance[cols] = performance[cols].apply(pd.to_numeric, errors='coerce')
    performance['balanced_accuracy'] = (performance['recall_pos'] + performance['recall_neg'])/2
    return performance.mean()



print(test_function(y_pred))

### 6. Improve your model (check other models and use grid search to find better parameters)

# RANDOM FOREST

In [8]:
########################################## RF

pipeline_rf = Pipeline([

('vect', CountVectorizer(tokenizer=tokenize)),
('tfidf', TfidfTransformer()),
('clf', MultiOutputClassifier(RandomForestClassifier()))

])


parameters = {
#'vect__max_features': (None, 5000), 
'clf__estimator__n_estimators': [10, 30],
'clf__estimator__min_samples_split': [2, 3, 4],
'clf__estimator__criterion': ['entropy', 'gini']
#'tfidf__use_idf': (True, False),
#'vect__max_n': (1, 2),
#'vect__max_df': (0.5, 0.75, 1.0),
#'vect__max_features': (None, 5000, 10000, 50000)}
}

cv_rf = GridSearchCV(pipeline_rf, param_grid=parameters,n_jobs=4) #scoring = 'roc_auc' , verbose=2
cv_rf.fit(X_train, y_train)
y_pred_rf = cv_rf.predict(X_test)

print("\nBest Parameters:", cv_rf.best_params_)

print(test_function(y_pred_rf))

KeyboardInterrupt: 

# LOGISTIC REGRESSION

In [6]:
from sklearn.linear_model import LogisticRegression
#################################LR
pipeline_lr = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(LogisticRegression()))
     ])

parameters = {

    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_n': (1, 2),
    'vect__max_features': (None, 5000),
    "clf__estimator__penalty":("l1","l2")
}

cv_lr = GridSearchCV(pipeline_lr, param_grid=parameters, n_jobs=-1) #, verbose=2

cv_lr.fit(X_train, y_train)
y_pred_lr = cv_lr.predict(X_test)

print("\nBest Parameters:", cv_lr.best_params_)

print(test_function(y_pred_lr))


Best Parameters: {'clf__estimator__penalty': 'l1', 'vect__max_df': 0.5, 'vect__max_features': None}
accuracy             0.553661
precision_pos        0.154286
precision_neg        0.554286
recall_pos           0.648571
recall_neg           0.554286
f_one_pos            0.185143
f_one_neg            0.681429
roc_auc              0.000000
balanced_accuracy    0.601429
dtype: float64


# SVC

In [20]:
################################## SVC
pipeline_svc = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(SVC()))
     ])

parameters = {
    'clf__estimator__kernel': ['linear', 'rbf', 'poly'],
    'clf__estimator__C':[1, 10, 100],
    'vect__max_features': (None, 5000)
}

cv_svc = GridSearchCV(pipeline_svc, param_grid=parameters, n_jobs=-1) #, verbose=2

cv_svc.fit(X_train, y_train)
y_pred_svc = cv_svc.predict(X_test)

print("\nBest Parameters:", cv_svc.best_params_)

print(test_function(y_pred_svc))


Best Parameters: {'clf__estimator__C': 1, 'clf__estimator__kernel': 'linear', 'vect__max_features': None}
accuracy             0.556696
precision_pos        0.155429
precision_neg        0.557143
recall_pos           0.726286
recall_neg           0.557143
f_one_pos            0.187143
f_one_neg            0.684000
roc_auc              0.578902
balanced_accuracy    0.641714
dtype: float64


## XG Boost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

pipeline_xgb = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(GradientBoostingClassifier()))
     ])

parameters = {
    'clf__estimator__max_depth': [3, 5, 10],
    'clf__estimator__n_estimators':[100, 300,500],
}

cv_xgb = GridSearchCV(pipeline_xgb, param_grid=parameters, n_jobs=-1) #, verbose=2

cv_xgb.fit(X_train, y_train)
y_pred_xgb = cv_xgb.predict(X_test)

print("\nBest Parameters:", cv_xgb.best_params_)

print(test_function(y_pred_xgb))