# Training & Saving a model

In [None]:
import datetime
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import os

from urllib import request
from zipfile import ZipFile

def download_file(file_name, url, chunk_size=64 * 1024):
    """Download *url* and write the response body to *file_name*.

    The response is streamed to disk in fixed-size chunks, so the whole
    payload is never held in memory, and the response is closed as soon
    as the copy finishes (or fails).

    Args:
        file_name: Destination path; opened in ``'wb'`` mode, so an
            existing file is overwritten.
        url: Location to fetch; handed unchanged to ``request.urlopen``.
        chunk_size: Number of bytes requested from the response per read.

    Raises:
        Any exception raised by ``request.urlopen`` or by opening/writing
        the destination file is propagated to the caller.
    """
    # The URL is opened first so that no destination file is created when
    # the request itself fails.
    with request.urlopen(url) as res, open(file_name, 'wb') as file:
        # read() returns b'' at end of stream, which ends the iteration.
        for chunk in iter(lambda: res.read(chunk_size), b''):
            file.write(chunk)
        
def unzip(file_name, path='./'):
    """Extract every member of the zip archive *file_name* into *path*.

    The archive's table of contents and two progress messages are printed
    to standard output.

    Args:
        file_name: Path of the zip archive to read.
        path: Directory that receives the extracted files; defaults to the
            current directory.
    """
    # Named ``archive`` (not ``zip``) so the builtin zip() is not shadowed.
    with ZipFile(file_name, 'r') as archive:
        # Print the listing of everything stored in the archive.
        archive.printdir()

        # Unpack all members below the target directory.
        print('Extracting all the files now...')
        archive.extractall(path=path)
        print('Done!')
        

file_name = 'imdb.csv'

# Download and unpack the dataset only when the CSV is not already on disk,
# so re-running the cell does not fetch it again.
if not os.path.isfile(file_name):
    # The archive is stored next to the CSV as '<csv name>.zip'.
    # NOTE(review): assumes the archive contains a member named 'imdb.csv'
    # - confirm, otherwise read_csv below will not find the file.
    archive_name = file_name + '.zip'
    print('downloading')
    download_file(archive_name, 'https://github.com/msaricaumbc/DS_data/blob/master/ds602/imdb2.zip?raw=true')
    print('extracting')
    unzip(archive_name)


# Load the CSV into a DataFrame and preview its first rows.
df = pd.read_csv(file_name, encoding='utf-8')
df.head()

In [None]:
# Model input is the 'review' column; the target is the 'sentiment' column.
X, y = df['review'], df['sentiment']

In [None]:
# Step 1: turn raw text into TF-IDF word features (ASCII-folded accents,
# lowercased, English stop words removed).
vectorizer = TfidfVectorizer(
    strip_accents='ascii',
    lowercase=True,
    stop_words='english',
    analyzer='word',
)

# Step 2: logistic regression on the TF-IDF features.
classifier = LogisticRegression(C=10, solver='liblinear')

# Chain both steps so fitting and predicting accept raw strings.
p = Pipeline([
    ('vect', vectorizer),
    ('lm', classifier),
])

# fit() returns the pipeline itself, so `model` and `p` are the same object.
model = p.fit(X, y)
model

In [None]:
# Quick check: predict on a single hand-written review.
sample_reviews = ['this is a great movie']
model.predict(sample_reviews)

# Save model

In [None]:
import joblib

# Serialize the fitted pipeline to 'pipeline.pkl' in the working directory.
joblib.dump(value=model, filename='pipeline.pkl')

In [None]:
# Shell escape: list the working directory (e.g. to check that
# 'pipeline.pkl' was written by the previous cell).
!ls

In [None]:
# Remove the in-memory name `model`; the notebook reloads the pipeline
# from 'pipeline.pkl' further down instead of reusing this object.
del model

In [None]:
# model  # left commented out: evaluating `model` after `del model` raises NameError

In [None]:
# Restore the pipeline from the file written by joblib.dump and display it.
model_path = 'pipeline.pkl'
pipeline = joblib.load(model_path)
pipeline

In [None]:
# Predict with the reloaded pipeline on a few hand-written reviews.
reviews = [
    'this is an awesome movie',
    'this is a terrible movie',
    'I really enjoyed this movie!',
]
pipeline.predict(reviews)