In [1]:
# import libraries
import pandas as pd
import numpy as np
import sqlalchemy as db

# download necessary NLTK data
import nltk
nltk.download(['punkt', 'wordnet'])
nltk.download('words')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('vader_lexicon')

# import statements
import pandas as pd
import re
from nltk import word_tokenize, sent_tokenize, pos_tag, ne_chunk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
import pickle
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pongsakorn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pongsakorn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/pongsakorn/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pongsakorn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pongsakorn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/pongsakorn/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/pongsakorn/nl

In [4]:
# load data from database
engine = db.create_engine('sqlite:///../data/disaster_response_pete_proj2.db')
df = pd.read_sql_table('disaster_response_pete_proj2',engine)
df.head()
X = df['message']
y = df.drop(['id','message','original','genre'],axis=1)

In [5]:
df

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26023,30261,The training demonstrated how to enhance micro...,,news,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26024,30262,A suitable candidate has been selected and OCH...,,news,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26025,30263,"Proshika, operating in Cox's Bazar municipalit...",,news,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26026,30264,"Some 2,000 women protesting against the conduc...",,news,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# using nltk to case normalize, lemmatize, and tokenize text from breaking down words.
def tokenize(text):
    return [WordNetLemmatizer().lemmatize(w).strip() for w in word_tokenize(text.lower())]
    

In [7]:
pipeline = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),
        ])),
#         ('clf', MultiOutputClassifier(KNeighborsClassifier(n_neighbors = 3)))
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y)


In [9]:
start_time = time.time()
pipeline.fit(X_train, y_train)
print("--- %s seconds from start time ---" % (time.time() - start_time))
y_pred = pipeline.predict(X_test)
print("--- %s seconds from start time ---" % (time.time() - start_time))

--- 313.7391400337219 seconds from start time ---
--- 330.649906873703 seconds from start time ---


In [14]:
start_time = time.time()

vect = CountVectorizer(tokenizer=tokenize)
tfidf = TfidfTransformer()
# clf = (RandomForestClassifier())
clf = MultiOutputClassifier(RandomForestClassifier())
print("--- %s seconds from start time ---" % (time.time() - start_time))

# train classifier
X_train_counts = vect.fit_transform(X_train)
X_train_tfidf = tfidf.fit_transform(X_train_counts)
clf.fit(X_train_tfidf, y_train)
print("--- %s seconds from start time ---" % (time.time() - start_time))

# predict on test data
X_test_counts = vect.transform(X_test)
X_test_tfidf = tfidf.transform(X_test_counts)
y_pred = clf.predict(X_test_tfidf)

# X_train,y_train

# .predict(X_test_tfidf)
print("--- %s seconds from start time ---" % (time.time() - start_time))


--- 0.1823868751525879 seconds from start time ---
--- 419.79430079460144 seconds from start time ---
--- 440.53108310699463 seconds from start time ---


In [15]:
# Running randomforestclassifier by itself
# --- 0.0018010139465332031 seconds from start time ---
# --- 103.60312819480896 seconds from start time ---
# --- 108.07953429222107 seconds from start time ---

# Running multioutputclassifier
# --- 0.1823868751525879 seconds from start time ---
# --- 419.79430079460144 seconds from start time ---
# --- 440.53108310699463 seconds from start time ---

In [20]:
start_time = time.time()
clf.predict(X_test_tfidf[0])
print("--- %s seconds from start time ---" % (time.time() - start_time))

--- 0.33495497703552246 seconds from start time ---


In [27]:
(y_pred == y_test).mean()

related                   0.803750
request                   0.894882
offer                     0.994775
aid_related               0.759951
medical_help              0.916090
medical_products          0.949132
search_and_rescue         0.971108
security                  0.982173
military                  0.968034
child_alone               1.000000
water                     0.948978
food                      0.926080
shelter                   0.919932
clothing                  0.983710
money                     0.978331
missing_people            0.988474
refugees                  0.964807
death                     0.957430
other_aid                 0.873521
infrastructure_related    0.929461
transport                 0.953127
buildings                 0.949746
electricity               0.981866
tools                     0.993084
hospitals                 0.986630
shops                     0.995697
aid_centers               0.988167
other_infrastructure      0.951898
weather_related     

In [10]:
# Test LINE webhook structure

import json
json_object = json.loads('{"destination":"Uf798a60cb0b0cd0169463a26eed30cae","events":[{"type":"message","message":{"type":"text","id":"14237049330994","text":"Nzfmgsgs"},"timestamp":1623865155864,"source":{"type":"user","userId":"U5e2c5d78107955fc10a39890582713b6"},"replyToken":"ad27ff1441b340768c85fe0c23fdb34c","mode":"active"}]}')


                         

In [11]:
# {
#     "destination": "Uf798a60cb0b0cd0169463a26eed30cae",
#     "events": [
#         {
#             "type": "message",
#             "message": {
#                 "type": "text",
#                 "id": "14237049330994",
#                 "text": "Nzfmgsgs"
#             },
#             "timestamp": 1623865155864,
#             "source": {
#                 "type": "user",
#                 "userId": "U5e2c5d78107955fc10a39890582713b6"
#             },
#             "replyToken": "ad27ff1441b340768c85fe0c23fdb34c",
#             "mode": "active"
#         }
#     ]
# }

In [9]:
json_object['events'][0]['message']['text']

NameError: name 'json_object' is not defined

In [10]:
class HelpWordExtractor(BaseEstimator, TransformerMixin):
# To identify specific words for help and label them appropriately
    def contains_help(self, text):
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            if 'help' in sentence:
                return 1
            else:
                return 0
#             return 'help' in sentence
        return 0

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.contains_help)
        return pd.DataFrame(X_tagged)
    
class WordLengthExtractor(BaseEstimator, TransformerMixin):
# To see word length whether longer message have more distress in them or not
    def word_length(self, text):
        return len(text)

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.word_length)
        return pd.DataFrame(X_tagged)
    
class SentimentSentenceExtractor(BaseEstimator, TransformerMixin):
# To see word length whether longer message have more distress in them or not
    def sentiment_analyser(self, text):
        return SentimentIntensityAnalyzer().polarity_scores(text)['compound']

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.sentiment_analyser)
        return pd.DataFrame(X_tagged)

In [118]:
pkl_filename = "classifier-knn.pkl"  
# Load the Model back from file
with open(pkl_filename, 'rb') as file:  
    loaded_model = pickle.load(file)

In [119]:
loaded_model.best_params_

{'clf__estimator__leaf_size': 30, 'clf__estimator__n_neighbors': 10}

In [120]:
loaded_model.best_score_

0.181250390646172

In [121]:
X_train, X_test, y_train, y_test = train_test_split(X, y)


In [122]:
X_test.head()

25749    "The European Commission is providing vital re...
10711    So done with zombies for tonight now a little ...
20287    Cold weather, and the crowding which results, ...
9147     I would like to find a job. can you lead me wh...
5071     Tell the ministry of health in La goave we hav...
Name: message, dtype: object

In [123]:
loaded_model.predict(X_test[200:220])

array([[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0],


In [None]:
print(classification_report(y_test[col_name],y_pred_df[col_name]))

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
5725,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3706,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25049,1,0,1,1,0,0,1,0,0,0,...,0,0,1,1,0,0,0,0,0,0
10061,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
7971,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24676,1,0,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
8685,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13675,1,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0
20614,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9643,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [110]:
loaded_model.predict(["There's nothing to eat and water, we starving and thirsty."])

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [111]:
loaded_model.predict(df.iloc[:20]['message'])

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],


In [69]:
df.iloc[3]

id                                                                        9
message                   UN reports Leogane 80-90 destroyed. Only Hospi...
original                  UN reports Leogane 80-90 destroyed. Only Hospi...
genre                                                                direct
related                                                                   1
request                                                                   1
offer                                                                     0
aid_related                                                               1
medical_help                                                              0
medical_products                                                          1
search_and_rescue                                                         0
security                                                                  0
military                                                                  0
child_alone 

In [22]:
df.related.sum()/df.shape[0]

0.7647917627170739

In [35]:
col_df = df.drop(['id','message','original','genre'],axis=1)
x_data = list((col_df).mean())
y_data = col_df.columns

Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')

In [18]:
df.shape[0]

26028

In [49]:
col_df = df.drop(['id','message','original','genre'],axis=1)
y_data = list((col_df).mean())
x_data = col_df.columns
middle_index = int(len(y_data)/2)

In [48]:
len(y_data)

36

In [41]:
int(middle_index)

13014

In [45]:
x_data[:int(middle_index)]

Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')

In [51]:
table_col_names = df.drop(['id','original','genre'],axis=1).columns[:5]
table_col_data = df.drop(['id','original','genre'],axis=1)[:10]

In [52]:
table_col_names

Index(['message', 'related', 'request', 'offer', 'aid_related'], dtype='object')

In [57]:
[table_col_data[data_1] for data_1 in table_col_names]

[0    Weather update - a cold front from Cuba that c...
 1              Is the Hurricane over or is it not over
 2                      Looking for someone but no name
 3    UN reports Leogane 80-90 destroyed. Only Hospi...
 4    says: west side of Haiti, rest of the country ...
 5               Information about the National Palace-
 6                       Storm at sacred heart of jesus
 7    Please, we need tents and water. We are in Sil...
 8      I would like to receive the messages, thank you
 9    I am in Croix-des-Bouquets. We have health iss...
 Name: message, dtype: object,
 0    1
 1    1
 2    1
 3    1
 4    1
 5    0
 6    1
 7    1
 8    0
 9    1
 Name: related, dtype: int64,
 0    0
 1    0
 2    0
 3    1
 4    0
 5    0
 6    0
 7    1
 8    0
 9    1
 Name: request, dtype: int64,
 0    0
 1    0
 2    0
 3    0
 4    0
 5    0
 6    0
 7    0
 8    0
 9    0
 Name: offer, dtype: int64,
 0    0
 1    1
 2    0
 3    1
 4    0
 5    0
 6    0
 7    1
 8    0
 9    1
 

In [56]:
table_col_data[table_col_names]

Unnamed: 0,message,related,request,offer,aid_related
0,Weather update - a cold front from Cuba that c...,1,0,0,0
1,Is the Hurricane over or is it not over,1,0,0,1
2,Looking for someone but no name,1,0,0,0
3,UN reports Leogane 80-90 destroyed. Only Hospi...,1,1,0,1
4,"says: west side of Haiti, rest of the country ...",1,0,0,0
5,Information about the National Palace-,0,0,0,0
6,Storm at sacred heart of jesus,1,0,0,0
7,"Please, we need tents and water. We are in Sil...",1,1,0,1
8,"I would like to receive the messages, thank you",0,0,0,0
9,I am in Croix-des-Bouquets. We have health iss...,1,1,0,1
