In [34]:
# download necessary NLTK data
import nltk
nltk.download(['punkt', 'wordnet'])

import sqlalchemy
from sqlalchemy import create_engine

# import statements
import re
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import pickle
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\manor\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\manor\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [28]:
def load_data():
    engine = create_engine('sqlite:///DisasterResponse_try3.db')
    df = pd.read_sql_table("Message", engine)
    X = df['message']
    y = df.drop(['id', 'message', 'original', 'genre'], axis=1)
    return X, y

def tokenize(text):

    text = re.sub(pattern=r"[^a-zA-Z0-9]",repl=" ", string=str(text))

    words = text.lower().split()
    stops = set(stopwords.words("english"))      
    meaningful_words = [w for w in words if not w in stops]      
    text = " ".join(meaningful_words)

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

def display_results(y_test, y_pred):
    for i, column in enumerate(y.columns):
        print(f"Category: {column}\n")
        print(classification_report(y_test[column], y_pred[:, i]))
        print("------------------------")

def main():
    X, y = load_data()

    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25, random_state=42)

    # instantiate transformers and classifiers
    # note: we can also use vectorizer intead of the above two
    # also the below can be used with pipeline, this is the simple classic version
    vect = CountVectorizer(tokenizer=tokenize)
    tfidf = TfidfTransformer()
    clf = RandomForestClassifier()

    # fit and transform the training data
    X_train_counts = vect.fit_transform(X_train)
    X_train_tfidf = tfidf.fit_transform(X_train_counts)

    # train classifier
    clf.fit(X_train_tfidf, y_train)

    # transform (no fitting) the test data
    X_test_counts = vect.transform(X_test)
    X_test_tfidf = tfidf.transform(X_test_counts)
    # predict on test data
    y_pred = clf.predict(X_test_tfidf)

    # display results
    display_results(y_test, y_pred)

In [29]:
main()



Category: related

              precision    recall  f1-score   support

           0       0.69      0.47      0.56      1563
           1       0.85      0.93      0.89      4944
           2       0.41      0.36      0.39        47

    accuracy                           0.82      6554
   macro avg       0.65      0.59      0.61      6554
weighted avg       0.81      0.82      0.80      6554

------------------------
Category: request

              precision    recall  f1-score   support

           0       0.90      0.98      0.94      5443
           1       0.84      0.49      0.62      1111

    accuracy                           0.90      6554
   macro avg       0.87      0.74      0.78      6554
weighted avg       0.89      0.90      0.89      6554

------------------------
Category: offer

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      6521
           1       0.00      0.00      0.00        33

    accuracy           

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.98      1.00      0.99      6421
           1       0.78      0.05      0.10       133

    accuracy                           0.98      6554
   macro avg       0.88      0.53      0.54      6554
weighted avg       0.98      0.98      0.97      6554

------------------------
Category: missing_people

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      6481
           1       1.00      0.01      0.03        73

    accuracy                           0.99      6554
   macro avg       0.99      0.51      0.51      6554
weighted avg       0.99      0.99      0.98      6554

------------------------
Category: refugees

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      6339
           1       0.38      0.01      0.03       215

    accuracy                           0.97      6554
   macro avg       0.67      0.5

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6530
           1       0.00      0.00      0.00        24

    accuracy                           1.00      6554
   macro avg       0.50      0.50      0.50      6554
weighted avg       0.99      1.00      0.99      6554

------------------------
Category: aid_centers

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      6473
           1       0.00      0.00      0.00        81

    accuracy                           0.99      6554
   macro avg       0.49      0.50      0.50      6554
weighted avg       0.98      0.99      0.98      6554

------------------------
Category: other_infrastructure

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      6271
           1       0.00      0.00      0.00       283

    accuracy                           0.96      6554
   macro avg       0.48

In [30]:
# # Save the trained model as a pickle file
# with open('rcf_simple_model.pkl', 'wb') as f:
#     pickle.dump(clf, f)

NameError: name 'clf' is not defined

In [39]:
def load_data():
    engine = create_engine('sqlite:///DisasterResponse_try3.db')
    df = pd.read_sql_table("Message", engine)
    X = df['message']
    y = df.drop(['id', 'message', 'original', 'genre'], axis=1)
    return X, y

def tokenize(text):

    text = re.sub(pattern=r"[^a-zA-Z0-9]",repl=" ", string=str(text))

    words = text.lower().split()
    stops = set(stopwords.words("english"))      
    meaningful_words = [w for w in words if not w in stops]      
    text = " ".join(meaningful_words)

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

def display_results(y_test, y_pred):
    for i, column in enumerate(y.columns):
        print(f"Category: {column}\n")
        print(classification_report(y_test[column], y_pred[:, i]))
        print("------------------------")

def save_model(model, file_path):
    with open(file_path, 'wb') as f:
        pickle.dump(model, f)

def load_model(file_path):
    with open(file_path, 'rb') as f:
        model = pickle.load(f)
    return model

def get_acc(y_true, y_pred):
    # Assuming y_true is the true labels and y_pred is the predicted labels
    exact_match_ratio = accuracy_score(y_true, y_pred)
    print("Exact Match Ratio:", exact_match_ratio)
    return f"Exact Match Ratio: {exact_match_ratio}"


def main2():
    X, y = load_data()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    # instantiate transformers and classifiers
    vect = CountVectorizer(tokenizer=tokenize)
    tfidf = TfidfTransformer()
    clf = RandomForestClassifier()

    # fit and transform the training data
    X_train_counts = vect.fit_transform(X_train)
    X_train_tfidf = tfidf.fit_transform(X_train_counts)

    # train classifier
    clf.fit(X_train_tfidf, y_train)

    # transform (no fitting) the test data
    X_test_counts = vect.transform(X_test)
    X_test_tfidf = tfidf.transform(X_test_counts)
    # predict on test data
    y_pred = clf.predict(X_test_tfidf)

    # # Evaluate the model's performance
    # accuracy = accuracy_score(y_test, y_pred)
    # print(f"Accuracy: {accuracy:.2f}")
    get_acc(y_test, y_pred)
    # Decide if the model's performance is good enough
    # if accuracy >= 0.85:  # You can adjust this threshold as needed
    #     # Save the trained classifier using pickle
    #     save_model(clf, 'rfc_simple_model.pkl')

    # # Save the trained classifier using pickle
    # save_model(clf, 'rfc_simple_model.pkl')

    # display results
    display_results(y_test, y_pred)

In [40]:
main2()

