In [15]:
import os
import pandas as pd
import numpy as np
from itertools import chain

import sqlalchemy
from sqlalchemy import create_engine

import nltk
from nltk import pos_tag
nltk.download(['stopwords','wordnet','punkt','averaged_perceptron_tagger'])
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix, classification_report

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\manor\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\manor\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\manor\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [16]:
engine = create_engine('sqlite:///DisasterResponse_try3.db')
df = pd.read_sql_table("Message", engine)
print(f"{type(df)},\n{len(df)}")


<class 'pandas.core.frame.DataFrame'>,
26216


In [17]:
df.head(1)

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
df.columns

Index(['id', 'message', 'original', 'genre', 'related', 'request', 'offer',
       'aid_related', 'medical_help', 'medical_products', 'search_and_rescue',
       'security', 'military', 'child_alone', 'water', 'food', 'shelter',
       'clothing', 'money', 'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')

In [19]:
# extract data for ml model
X = df['message']
# drop additional cols
y = df.drop(['id', 'message', 'original', 'genre'], axis=1)
print(X.shape, y.shape)

(26216,) (26216, 36)


In [20]:
print(f"df type: {type(df)}\nLen of df: {len(df)}")
print(f"Shape of X: {X.shape},\nShape of Y: {y.shape}")
print(list(df.columns))

df type: <class 'pandas.core.frame.DataFrame'>
Len of df: 26216
Shape of X: (26216,),
Shape of Y: (26216, 36)
['id', 'message', 'original', 'genre', 'related', 'request', 'offer', 'aid_related', 'medical_help', 'medical_products', 'search_and_rescue', 'security', 'military', 'child_alone', 'water', 'food', 'shelter', 'clothing', 'money', 'missing_people', 'refugees', 'death', 'other_aid', 'infrastructure_related', 'transport', 'buildings', 'electricity', 'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure', 'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold', 'other_weather', 'direct_report']


In [21]:
def tokenize(text):
    # Tokenize text into words
    words = list(chain.from_iterable([word_tokenize(t) for t in text]))
    
    # Remove stopwords
    words = [word for word in words if word not in stopwords.words("english")]
    
    # Tag words with POS
    tagged_words = pos_tag(words)
    
    # Initiate lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    clean_tokens = []
    # Iterate through each tagged word
    for word, tag in tagged_words:
        # Lemmatize based on POS tag
        if tag.startswith('N'):
            clean_tok = lemmatizer.lemmatize(word, pos='n').lower().strip()
        elif tag.startswith('V'):
            clean_tok = lemmatizer.lemmatize(word, pos='v').lower().strip()
        elif tag.startswith('J'):
            clean_tok = lemmatizer.lemmatize(word, pos='a').lower().strip()
        else:
            clean_tok = lemmatizer.lemmatize(word).lower().strip()
        
        clean_tokens.append(clean_tok)
    
    return clean_tokens

In [22]:
pipeline = Pipeline([
    ('vect',CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier())
])


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)



In [24]:
# Iterate through each output category and calculate the F1 score, precision, and recall
for i, column in enumerate(y.columns):
    print(f"Category: {column}\n")
    print(classification_report(y_test[column], y_pred[:, i]))
    print("------------------------")
    

Category: related

              precision    recall  f1-score   support

           0       0.65      0.10      0.17      1563
           1       0.77      0.98      0.86      4944
           2       0.00      0.00      0.00        47

    accuracy                           0.77      6554
   macro avg       0.47      0.36      0.34      6554
weighted avg       0.74      0.77      0.69      6554

------------------------
Category: request

              precision    recall  f1-score   support

           0       0.85      0.99      0.91      5443
           1       0.75      0.13      0.22      1111

    accuracy                           0.85      6554
   macro avg       0.80      0.56      0.57      6554
weighted avg       0.83      0.85      0.80      6554

------------------------
Category: offer

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      6521
           1       0.00      0.00      0.00        33

    accuracy           

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.89      1.00      0.94      5809
           1       0.77      0.02      0.04       745

    accuracy                           0.89      6554
   macro avg       0.83      0.51      0.49      6554
weighted avg       0.88      0.89      0.84      6554

------------------------
Category: shelter

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      5973
           1       0.75      0.02      0.03       581

    accuracy                           0.91      6554
   macro avg       0.83      0.51      0.49      6554
weighted avg       0.90      0.91      0.87      6554

------------------------
Category: clothing

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      6456
           1       1.00      0.01      0.02        98

    accuracy                           0.99      6554
   macro avg       0.99      0.51      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      6498
           1       0.00      0.00      0.00        56

    accuracy                           0.99      6554
   macro avg       0.50      0.50      0.50      6554
weighted avg       0.98      0.99      0.99      6554

------------------------
Category: shops

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6530
           1       0.00      0.00      0.00        24

    accuracy                           1.00      6554
   macro avg       0.50      0.50      0.50      6554
weighted avg       0.99      1.00      0.99      6554

------------------------
Category: aid_centers

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      6473
           1       0.00      0.00      0.00        81

    accuracy                           0.99      6554
   macro avg       0.49      0.50     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
import pickle

In [26]:
# Export model to pickle file
with open('rfc_model_try3.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

#### trying to improve... the above model is really good based on accuracy matrix

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
def build_model():
    pipeline = Pipeline([
        # ('vect',CountVectorizer(tokenizer=tokenize)),
        # ('tfidf', TfidfTransformer()),
        ('vect', TfidfVectorizer(tokenizer=tokenize)),
        ('clf', RandomForestClassifier())
    ])
    # specify parameters for grid search
    parameters = {
        "vect__ngram_range":[(1,1),(1,2)],
        "clf__n_estimators": [10,25,50],
        "clf__min_samples_split":[2, 3, 4]
        # "clf__criterion":["gini", "entropy"]
    }

    # create grid search object
    cv = GridSearchCV(pipeline, param_grid=parameters)

    return cv

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

model = build_model()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

(19662,) (19662, 36) (6554,) (6554, 36)


Traceback (most recent call last):
  File "c:\Users\manor\Downloads\05 Project_Disaster Resoponse Pipelines\proj_uda_nlpdis_venv\lib\site-packages\sklearn\model_selection\_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\manor\Downloads\05 Project_Disaster Resoponse Pipelines\proj_uda_nlpdis_venv\lib\site-packages\sklearn\metrics\_scorer.py", line 527, in __call__
    return estimator.score(*args, **kwargs)
  File "c:\Users\manor\Downloads\05 Project_Disaster Resoponse Pipelines\proj_uda_nlpdis_venv\lib\site-packages\sklearn\pipeline.py", line 753, in score
    return self.steps[-1][1].score(Xt, y, **score_params)
  File "c:\Users\manor\Downloads\05 Project_Disaster Resoponse Pipelines\proj_uda_nlpdis_venv\lib\site-packages\sklearn\base.py", line 705, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
  File "c:\Users\manor\Downloads\05 Project_Disaster Resoponse Pipelines\proj_uda_nlpdis_venv\lib\site-

KeyboardInterrupt: 