# Training & Saving a model

In [1]:
import datetime
import os
import shutil
from urllib import request
from zipfile import ZipFile

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

def download_file(file_name, url):
    """Download ``url`` and store the raw response body in ``file_name``.

    Parameters
    ----------
    file_name : str or path-like
        Destination path; opened in binary write mode, so an existing file
        is overwritten.
    url : str
        Address passed to ``urllib.request.urlopen``.

    Returns
    -------
    None

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when the resource cannot be opened.
    OSError
        Propagated when the destination cannot be opened or written.
    """
    # Both the response and the output file are context-managed so the
    # connection is released even if the copy fails, and the body is streamed
    # in chunks instead of being materialised in memory by a single read().
    with request.urlopen(url) as res, open(file_name, 'wb') as file:
        shutil.copyfileobj(res, file)
        
def unzip(file_name, path='./'):
    """Print the listing of a zip archive and extract every member.

    Parameters
    ----------
    file_name : str or path-like
        Path of the zip archive to read.
    path : str, default './'
        Directory the members are extracted into.

    Returns
    -------
    None
    """
    with ZipFile(file_name, mode='r') as archive:
        # Show the table of contents before touching the file system.
        archive.printdir()

        print('Extracting all the files now...')
        archive.extractall(path=path)
        print('Done!')
        

file_name = 'imdb.csv'
# The download is a zip archive, so it gets its own path. Saving it as
# `file_name` (as before) made the archive and the CSV share one name:
# extraction would write over the archive while it was being read, and
# read_csv would otherwise be handed raw zip bytes.
archive_name = file_name + '.zip'

# Only hit the network when the extracted CSV is not already present.
if not os.path.isfile(file_name):
    print('downloading')
    download_file(archive_name, 'https://github.com/msaricaumbc/DS_data/blob/master/ds602/imdb2.zip?raw=true')
    print('extracting')
    # NOTE(review): assumes the archive contains a member named 'imdb.csv'
    # that lands in the working directory -- confirm against the archive.
    unzip(archive_name)


df = pd.read_csv(file_name, encoding='utf-8')
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [2]:
# Features are the raw review texts; the target is the sentiment column.
X, y = df['review'], df['sentiment']

In [3]:
# Two-stage text classifier: TF-IDF features feeding a logistic regression.
# Keeping both stages in one Pipeline lets raw strings be passed to predict().
p = Pipeline(steps=[
    (
        'vect',
        TfidfVectorizer(
            analyzer='word',
            stop_words='english',
            lowercase=True,
            strip_accents='ascii',
        ),
    ),
    (
        'lm',
        LogisticRegression(solver='liblinear', C=10),
    ),
])

# The result of fit() is bound to `model`, the name the following cells use.
model = p.fit(X, y)
model

In [8]:
# Sanity-check the fitted pipeline on a single raw review string.
# NOTE: this cell needs `model` in scope, so it must run before the `del model`
# cell further down. Its execution count (8) and the recorded NameError are
# consistent with it having been run after that deletion.
model.predict(['this is a good movie'])

NameError: name 'model' is not defined

# Save model

In [5]:
# Persist the whole fitted pipeline (vectorizer + classifier) to a single file
# in the working directory. `joblib` is imported in the notebook's import cell.
joblib.dump(model, 'pipeline.pkl')

['pipeline.pkl']

In [6]:
# Shell escape: list the working directory to confirm pipeline.pkl was written.
!ls

diagrams                      online-learning-and-sgd.ipynb
homework                      pipeline.pkl
imdb.csv                      saving-model.ipynb
imdb.csv.zip                  using-saved-model.ipynb
imdb.zip


In [7]:
# Remove the in-memory pipeline so the prediction further down can only come
# from the copy reloaded from pipeline.pkl.
del model

In [None]:
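# model  # left commented out: `model` was deleted in the previous cell, so evaluating it would raise NameError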

In [9]:
# Restore the pipeline saved earlier with joblib.dump.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load files from a trusted source (here, the one this notebook wrote).
pipeline = joblib.load('pipeline.pkl')
# Bare expression so the notebook displays the restored pipeline.
pipeline

In [10]:
# Predict on raw review strings with the reloaded pipeline.
# NOTE(review): labels presumably mean 1 = positive, 0 = negative (matches the sample output) -- confirm.
pipeline.predict(['this is an awesome movie', 'this is a terrible movie', 'I really enjoyed this movie!'])

array([1, 0, 1])