## Part 2: Training your own ML Model

<a href="https://colab.research.google.com/github/peckjon/hosting-ml-as-microservice/blob/master/part2/train_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from nltk import download

# fetch the NLTK data packages we'll need for tokenization and training
download('punkt')          # tokenizer data (presumably the Punkt models; not referenced directly below)
download('movie_reviews')  # labeled reviews read via nltk.corpus.movie_reviews
download('stopwords')      # stopword lists read via nltk.corpus.stopwords

In [None]:
from nltk.corpus import stopwords
from string import punctuation

# remove stopwords and punctuation
def clean_words(words):
    """Return the tokens of *words* that are neither stopwords nor punctuation.

    Args:
        words: iterable of string tokens.

    Returns:
        A list of the kept tokens, in their original order.
    """
    # Load the stopword list once per call and keep it in a set: evaluating
    # stopwords.words("english") inside the comprehension would re-run it
    # (and scan the resulting list) for every single token.
    english_stopwords = set(stopwords.words("english"))
    # `punctuation` is the string.punctuation *string*, so `in` is a substring
    # test: it drops single punctuation characters as well as tokens that
    # happen to be a contiguous run of it (e.g. "()").
    return [
        w for w in words
        if w not in english_stopwords and w not in punctuation
    ]

# reformat list as bag of words
def bag_of_words(words):
    """Map every distinct token in *words* to True.

    Repeated tokens collapse into a single key; keys keep first-seen order.
    """
    return dict.fromkeys(words, True)

In [None]:
from nltk.corpus import movie_reviews

# for each review: read its words, clean them, turn them into a bag of words,
# and pair that feature dict with the review's label
reviews_pos = [
    (bag_of_words(clean_words(movie_reviews.words(review_id))), 'pos')
    for review_id in movie_reviews.fileids('pos')
]
reviews_neg = [
    (bag_of_words(clean_words(movie_reviews.words(review_id))), 'neg')
    for review_id in movie_reviews.fileids('neg')
]

In [None]:
# split into training and test sets
def split_set(review_set):
    """Cut *review_set* at 80% of its length.

    Returns a (first_part, remainder) tuple; the cut index is rounded down.
    """
    cutoff = int(.80 * len(review_set))
    first_part = review_set[:cutoff]
    remainder = review_set[cutoff:]
    return first_part, remainder

# split each class separately so both labels appear in train and test
pos_train, pos_test = split_set(reviews_pos)
neg_train, neg_test = split_set(reviews_neg)

# pool the per-class slices: positives first, then negatives
train_set = [*pos_train, *neg_train]
test_set = [*pos_test, *neg_test]

In [None]:
from nltk.classify import NaiveBayesClassifier

# train the model: fit a Naive Bayes classifier on the labeled
# (bag-of-words, label) pairs collected in train_set
model = NaiveBayesClassifier.train(train_set)

In [None]:
from nltk.classify.util import accuracy

# test the model: score it against the held-out test_set, then print the
# score scaled by 100 (a percentage, assuming accuracy() returns a 0-1 fraction)
correct = accuracy(model, test_set)
print(correct * 100)

In [None]:
import pickle

# save the model file; the context manager flushes and closes the handle
# even if pickling raises, so a failed dump does not leak an open file
with open("sa_classifier.pickle", "wb") as model_file:
    pickle.dump(model, model_file)



In [None]:
# Colab only: save to Google Drive
import sys
import os
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/gdrive')
    !mkdir -p '/content/gdrive/My Drive/Colab Output'
    model_file = open('/content/gdrive/My Drive/Colab Output/sa_classifier.pickle',"wb")
    pickle.dump(model, model_file)
    model_file.flush()
    # os.fsync(model_file.fileno())
    print('Model saved in /content/gdrive/My Drive/Colab Output')
    !ls '/content/gdrive/My Drive/Colab Output'
    drive.flush_and_unmount()
    print('Re-run this cell if you cannot find it in https://drive.google.com')