In [1]:
# import libraries
import sys
import re
import pandas as pd
from sqlalchemy import create_engine
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
import pickle

# Fetch the NLTK resources needed by word_tokenize and WordNetLemmatizer.
# Already-installed packages are reported as up-to-date and skipped.
nltk.download(['punkt', 'wordnet', 'omw-1.4'])
# Recent NLTK releases (3.9+) make word_tokenize load 'punkt_tab' instead of
# the pickled 'punkt' models. It is requested in a separate call so that a
# failure here cannot abort the downloads above.
nltk.download('punkt_tab')

# Pattern for http/https URLs; tokenize() uses it to find links in a message.
# Written as a raw string so that `\(` and `\)` reach the regex engine as-is
# instead of being invalid string escapes (a SyntaxWarning on Python 3.12+).
# The resulting pattern text is unchanged.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tomaslorincfpt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tomaslorincfpt/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/tomaslorincfpt/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
def tokenize(text):
    """
    Turn a raw message into a list of normalized word tokens.

    Every URL matched by the module-level ``url_regex`` is replaced with the
    literal ``"urlplaceholder"``. The text is then split with NLTK's
    ``word_tokenize`` and each token is lower-cased, stripped of surrounding
    whitespace and lemmatized.

    :param text: String containing message
    :return: clean_tokens: list of words containing tokenized and cleaned message
    """
    # Replace each detected URL with a fixed placeholder so that links do not
    # explode into many one-off tokens.
    for url in re.findall(url_regex, text):
        text = text.replace(url, "urlplaceholder")

    lemmatizer = WordNetLemmatizer()
    # Normalize case *before* lemmatizing: the WordNet lookup is done on the
    # token exactly as given, so a capitalized plural such as "Cats" would
    # otherwise come out as "cats" rather than "cat".
    return [
        lemmatizer.lemmatize(tok.lower().strip())
        for tok in word_tokenize(text)
    ]

def display_results(cv, y_test, y_pred, labels):
    """
    Print a short evaluation report: labels, accuracy and best parameters.

    :param cv: search object exposing a ``best_params_`` attribute
    :param y_test: true target values
    :param y_pred: predicted values, compared element-wise against y_test
    :param labels: label names, printed as given
    :return: none
    """
    # Element-wise match mask; its mean is the fraction of matching entries
    # (pandas computes that per column when the mask is a DataFrame).
    matches = y_pred == y_test
    score = matches.mean()

    print("Labels:", labels)
    print("Accuracy:", score)
    print("\nBest Parameters:", cv.best_params_)

In [3]:
# load data
# NOTE(review): the recorded run of this cell failed with
# "ValueError: Table name not found". The table_name below has to be replaced
# with a table that actually exists in moviesdata.db before this can run.
engine = create_engine('sqlite:///moviesdata.db')
df = pd.read_sql_table(table_name='name', con=engine)

# Model input: the `description` column.
X = df['description']
# Targets: every other column, cast to int in a single step. Dropping via
# `columns=` avoids the redundant full-frame copy that `df[df.columns]` made.
Y = df.drop(columns=['description']).astype(int)
categories = Y.columns

ValueError: Table name not found

In [None]:
# split data
# Hold out 20% of the rows for evaluation. random_state is pinned so that a
# Restart & Run All produces the same split, keeping scores comparable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
# build model
pipeline = Pipeline([
    # A custom tokenizer is supplied, so the default token_pattern is never
    # used; passing None says so explicitly and avoids scikit-learn's
    # "token_pattern will not be used" warning.
    ('vect', CountVectorizer(tokenizer=tokenize, token_pattern=None)),
    ('Tfidf', TfidfTransformer()),
    # Seed the forest to remove its own run-to-run randomness.
    ('clf', MultiOutputClassifier(RandomForestClassifier(random_state=42)))])

# Grid searched over the forest size; keys address the estimator wrapped by
# the 'clf' step.
parameters = {'clf__estimator__n_estimators': [3]}

model = GridSearchCV(pipeline, param_grid=parameters, verbose=3, n_jobs=-1)

In [None]:
# Run the grid search on the training split (features as X, targets as y).
model.fit(X=X_train, y=Y_train)

In [None]:
# Predict on the held-out split, then print accuracy and the best parameters.
y_pred = model.predict(X_test)
display_results(cv=model, y_test=Y_test, y_pred=y_pred, labels=categories)

In [None]:
# save model
# Serialize the search object to disk with pickle.
# NOTE(review): the pipeline holds a reference to the notebook-level
# `tokenize` function, and pickle stores functions by reference. Presumably
# whatever loads this file must have `tokenize` importable under the same
# name; confirm.
with open('netflix_model.pkl', 'wb') as f:
    pickle.dump(model, f)