In [26]:
# download necessary NLTK data
# punkt     -> used by word_tokenize
# wordnet   -> used by WordNetLemmatizer
# stopwords -> used by stopwords.words("english") in tokenize(); without it a
#              fresh environment fails with LookupError on the first call
import nltk
nltk.download(['punkt', 'wordnet', 'stopwords'])

import sqlalchemy
from sqlalchemy import create_engine

# import statements
import re
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\manor\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\manor\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [28]:
def load_data(database_url='sqlite:///DisasterResponse_try3.db', table_name='Message'):
    """Load the messages table and split it into features and targets.

    Args:
        database_url: SQLAlchemy connection URL of the database to read.
            Defaults to the SQLite file this notebook was written against.
        table_name: Name of the table to load. Defaults to "Message".

    Returns:
        tuple: ``(X, y)`` where ``X`` is the ``message`` column and ``y`` is
        the table with the ``id``, ``message``, ``original`` and ``genre``
        columns dropped (every remaining column is treated as a target).
    """
    engine = create_engine(database_url)
    try:
        df = pd.read_sql_table(table_name, engine)
    finally:
        # Release pooled connections even if the read fails, so the database
        # file is not kept open for the life of the kernel.
        engine.dispose()

    X = df['message']
    y = df.drop(['id', 'message', 'original', 'genre'], axis=1)
    return X, y

# Filled on the first tokenize() call and reused afterwards, so the stop-word
# corpus is not re-read for every document.
_TOKENIZE_CACHE = {}


def _tokenize_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _TOKENIZE_CACHE:
        stops = frozenset(stopwords.words("english"))
        lemmatizer = WordNetLemmatizer()
        # Populate in one step so a failed lookup never leaves a half-filled cache.
        _TOKENIZE_CACHE.update(stops=stops, lemmatizer=lemmatizer)
    return _TOKENIZE_CACHE["stops"], _TOKENIZE_CACHE["lemmatizer"]


def tokenize(text):
    """Turn raw text into a list of normalized, lemmatized tokens.

    Steps, in order:
      1. Convert ``text`` to ``str`` and replace every character that is not
         an ASCII letter or digit with a space.
      2. Lower-case, split on whitespace and drop English stop words.
      3. Re-join the remaining words and run ``word_tokenize`` on them.
      4. Lemmatize each token, then lower-case and strip it.

    Args:
        text: The text to tokenize; any object is accepted because it is
            passed through ``str()`` first.

    Returns:
        list[str]: The cleaned tokens.
    """
    stops, lemmatizer = _tokenize_resources()

    text = re.sub(pattern=r"[^a-zA-Z0-9]", repl=" ", string=str(text))

    meaningful_words = [w for w in text.lower().split() if w not in stops]
    tokens = word_tokenize(" ".join(meaningful_words))

    return [lemmatizer.lemmatize(tok).lower().strip() for tok in tokens]

def display_results(y_test, y_pred):
    """Print a classification report for every target category.

    Args:
        y_test: DataFrame of true labels, one column per category.
        y_pred: 2-D array of predictions; column ``i`` is compared against
            the ``i``-th column of ``y_test``.
    """
    # Read the category names from y_test itself. The previous version used a
    # global `y`, which only exists as a local inside main() and therefore
    # raised NameError on a fresh kernel.
    for i, column in enumerate(y_test.columns):
        print(f"Category: {column}\n")
        print(classification_report(y_test[column], y_pred[:, i]))
        print("------------------------")

def main():
    """Train a random-forest classifier on the messages and print its reports.

    Loads the data, holds out 25% of it for testing, builds TF-IDF features
    from the training messages, fits the classifier, and prints one
    classification report per target category for the held-out set.
    """
    X, y = load_data()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    # instantiate transformers and classifier
    # note: a single TfidfVectorizer could replace the CountVectorizer +
    # TfidfTransformer pair, and all steps could be chained in a Pipeline;
    # this is the simple, explicit version
    vect = CountVectorizer(tokenizer=tokenize)
    tfidf = TfidfTransformer()
    # Seed the forest as well as the split; otherwise the reported metrics
    # change from one run to the next.
    clf = RandomForestClassifier(random_state=42)

    # fit and transform the training data
    X_train_counts = vect.fit_transform(X_train)
    X_train_tfidf = tfidf.fit_transform(X_train_counts)

    # train classifier
    clf.fit(X_train_tfidf, y_train)

    # transform (no fitting) the test data so no test information leaks
    # into the vocabulary or the IDF weights
    X_test_counts = vect.transform(X_test)
    X_test_tfidf = tfidf.transform(X_test_counts)

    # predict on test data
    y_pred = clf.predict(X_test_tfidf)

    # display results
    display_results(y_test, y_pred)

In [29]:
# Run the whole workflow: load the data, train the classifier and print the
# per-category classification reports.
main()

