# Notebook 4
This notebook contains the code used to prepare the model for deployment.

In [2]:
# Import preprocessing libraries

import re
from functools import lru_cache

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

## Preparation for Deployment

In [None]:
@lru_cache(maxsize=1)
def _english_stopwords():
    """Return the NLTK English stop-word list as a frozenset, built once."""
    # stopwords.words() re-reads the corpus and returns a list on every call;
    # caching it as a set makes each membership test O(1).
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _shared_lemmatizer():
    """Return one WordNetLemmatizer instance shared by every call."""
    return WordNetLemmatizer()


def lemmastop(word):
    """Filter out unwanted tokens and lemmatize the rest.

    Args:
        word: A single token. The stop-word comparison is an exact string
            match, so callers are expected to normalise case first
            (clean_data lowercases its input before tokenizing).

    Returns:
        An empty string when the token is an English stop word or contains
        '/'; otherwise the lemmatized token with every run of non-word
        characters removed.
    """
    # Drop stop words and any token containing '/'. Note that only '/' is
    # checked here; there is no separate test for "http".
    if word in _english_stopwords() or '/' in word:
        return ''

    # Lemmatize, then strip non-word characters not caught by earlier steps.
    return re.sub(r'\W+', '', _shared_lemmatizer().lemmatize(word))

# A token is two or more word characters, optionally with a single apostrophe
# inside (e.g. "don't"). The negative lookahead ends the token at a word
# boundary and, unlike a positive lookahead for a non-word character, also
# succeeds at the very end of the string.
_TOKEN_PATTERN = r"\w+'?\w+(?!\w)"


def clean_data(raw_string):
    """Turn raw text into a space-separated string of processed tokens.

    The text is lowercased, tokenized with ``_TOKEN_PATTERN``, and every
    token is passed through ``lemmastop``; tokens that come back empty are
    discarded.

    Args:
        raw_string: Raw, unprocessed text.

    Returns:
        The preprocessed text: surviving tokens joined by single spaces
        (an empty string when nothing survives).
    """
    tokenizer = RegexpTokenizer(_TOKEN_PATTERN)

    # Tokenize the lowercased text so stop-word matching is case-insensitive.
    tokens = tokenizer.tokenize(raw_string.lower())

    # Remove stop-list words and lemmatize; keep only non-empty results.
    processed_tokens = (lemmastop(token) for token in tokens)
    return ' '.join(token for token in processed_tokens if token != '')

In [None]:
# Replace every entry of the 'transcript' column with its cleaned form by
# running clean_data on it.
# NOTE(review): ted_model is not defined in this part of the notebook --
# presumably a pandas DataFrame loaded earlier; confirm before running.
ted_model['transcript'] = ted_model['transcript'].apply(clean_data)

In [None]:
# Instantiate model for deployment: TF-IDF features over unigrams and bigrams
# feeding a one-vs-rest logistic regression.
logreg_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag', C=0.1), n_jobs=-1)),
])

labels = ['persuasive_label', 'inspiring_label', 'unconvincing_label']

# Predicted probabilities on X_test, keyed by label, so that earlier labels'
# results are not lost when the loop moves on.
test_preds = {}

for label in labels:
    # Train on one label at a time. A Pipeline is not callable, so training
    # must go through .fit().
    # NOTE(review): assumes X_train rows line up with ted_model[label], and
    # that X_train / X_test are defined earlier in the notebook -- confirm.
    logreg_pipeline.fit(X_train, ted_model[label])

    # Compute probabilities for the label just fitted.
    test_pred = logreg_pipeline.predict_proba(X_test)
    test_preds[label] = test_pred

# The same pipeline object is refitted on every iteration, so after the loop
# logreg_pipeline holds the model for the last label only.