# Notebook 4
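This notebook contains the code for deployment: it trains the final classification pipeline and saves it, together with a fitted TF-IDF vectorizer, as pickle files.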

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle

# Import preprocessing libraries
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer

# Import modeling libraries
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

## Import Data

In [2]:
# Read the modelling dataset (transcripts plus the three rating labels) from disk
ted_model = pd.read_csv(filepath_or_buffer='../data/ted_model_final.csv')

In [3]:
# Preview the first five rows to confirm the file loaded as expected
ted_model.head(n=5)

Unnamed: 0,comments,views,transcript,persuasive_label,inspiring_label,unconvincing_label
0,4553,47227110,good morning great ive blown away whole thing ...,1,1,1
1,265,3200520,thank much chris truly great honor opportunity...,1,1,1
2,124,1636292,hello voice mail old friend ive called tech su...,1,0,1
3,200,1697550,today im happy heard sustainable development s...,1,1,1
4,593,12005869,10 year ago took task teach global development...,1,1,1


## Model Preparation

In [5]:
# The three label columns serve as the multi-label targets
labels = ['persuasive_label', 'inspiring_label', 'unconvincing_label']

# Features: the transcript text column; targets: the label columns above
X = ted_model['transcript']
y = ted_model.loc[:, labels]

In [7]:
# Multi-label text classifier: TF-IDF features (unigrams + bigrams) feed one
# logistic-regression model per label through a one-vs-rest wrapper.
logreg_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('clf', OneVsRestClassifier(
        # The 'sag' solver shuffles the data while fitting, so pin random_state
        # to make the saved model reproducible across re-runs.
        LogisticRegression(solver='sag', C=0.1, random_state=42),
        n_jobs=-1,
    )),
])

# Fit on the full dataset (no hold-out split is made in this notebook).
logreg_pipeline.fit(X, y)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_patt...+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 OneVsRestClassifier(estimator=LogisticRegression(C=0.1,
                                                                  class_weight=None,

In [14]:
# Save the model
# Use a context manager so the file handle is flushed and closed even if
# pickling fails part-way through.
with open('../data/pipeline.pkl', 'wb') as model_file:
    pickle.dump(logreg_pipeline, model_file)

In [15]:
# Load the model
# NOTE: pickle.load can execute arbitrary code, so only load files from a
# trusted source. The context manager closes the file handle after reading.
with open('../data/pipeline.pkl', 'rb') as model_file:
    obj = pickle.load(model_file)

In [19]:
# Smoke-test the reloaded pipeline on a single raw text string
sample_text = ["apple, dog"]
obj.predict(sample_text)

array([[0, 0, 0]])

## Text Input Preparation

In [20]:
# Instantiate TfidfVectorizer in the same way as the pipeline
tvec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))

# Fit on the training transcripts. The result is kept as the sparse matrix
# returned by fit_transform: converting a unigram+bigram TF-IDF matrix to a
# dense array can use a very large amount of memory, and only the fitted
# vectorizer itself is saved for deployment.
X_tvec = tvec.fit_transform(X)

In [None]:
# Save TfidfVectorizer into pickle file for processing text input
# Use a context manager so the file handle is flushed and closed reliably.
with open('../data/tvec.pkl', 'wb') as tvec_file:
    pickle.dump(tvec, tvec_file)