https://hackernoon.com/logistic-regression-train-model-in-python-and-use-it-on-angular-front-end-u1s3u95

https://stackblitz.com/edit/ms-ng-pub-movie-sentiment-analysis-logistic-regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt

from sklearn import __version__

__version__

In [None]:
# dataset can be found at: https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [None]:
from urllib import request
from zipfile import ZipFile

def download_file(file_name, url):
    """Download the resource at ``url`` and store it as ``file_name``.

    The response body is streamed to disk in chunks, and both the response
    and the output file are closed even when the transfer fails part-way.

    Args:
        file_name: Path of the local file to create or overwrite.
        url: Address handed to ``urllib.request.urlopen``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        OSError: If the destination file cannot be written.
    """
    import shutil  # local import so the cell does not depend on other cells

    # Open the URL first (as before) so a failed request leaves no empty file.
    with request.urlopen(url) as res, open(file_name, 'wb') as file:
        # Chunked copy: avoids holding the whole archive in memory at once.
        shutil.copyfileobj(res, file)
        
def unzip(file_name, path='./'):
    """List the contents of a zip archive and extract all of it.

    Args:
        file_name: Path of the zip archive to open for reading.
        path: Directory that receives the extracted files; defaults to
            the current directory.
    """
    with ZipFile(file_name, mode='r') as archive:
        # Show the archive's table of contents before unpacking.
        archive.printdir()

        print('Extracting all the files now...')
        archive.extractall(path)
        print('Done!')
        
# Fetch the zipped dataset from GitHub, save it as 'imdb.zip', and unpack it
# into the current directory (unzip's default target).
download_file('imdb.zip', 'https://github.com/msaricaumbc/DS_data/blob/master/ds602/imdb2.zip?raw=true')
unzip('imdb.zip')

In [None]:
def prepare_dataset(file_name='imdb.csv'):
    """Load the movie-review dataset from a CSV file.

    Args:
        file_name: Path of the CSV file to read. Defaults to 'imdb.csv'
            in the current working directory, the location this function
            always read from before the parameter existed.

    Returns:
        pandas.DataFrame holding the parsed contents of the file.

    Raises:
        FileNotFoundError: If ``file_name`` does not exist.
    """
    return pd.read_csv(file_name)

In [None]:
# Load the dataset into the notebook-level DataFrame `df` and preview its
# first rows.
df = prepare_dataset()
df.head()

In [None]:
# Preview the last rows of the dataset.
df.tail()

In [None]:
# Histogram of the 'sentiment' column to inspect how the labels are distributed.
df['sentiment'].hist()

In [None]:
def assertions(model):
    """Print a quick sanity check of ``model`` on four fixed sentences.

    Each sentence is passed to ``model.predict`` as a one-element list and
    the first prediction is compared with the label expected for it
    (1 for the positive sentences, 0 for the negative ones). One line per
    sentence is printed, ending in 'correct' or 'incorrect'. Nothing is
    raised or returned.

    Args:
        model: Object whose ``predict`` accepts a list of strings and
            returns an indexable sequence of labels.
    """
    # (expected label, sentences); positives are reported before negatives.
    expectations = (
        (1, ('this is a great movie', 'awesome movie')),
        (0, ('did not like it', 'worst movie ever')),
    )

    for expected_label, sentences in expectations:
        for text in sentences:
            predicted = model.predict([text])[0]
            verdict = 'correct' if predicted == expected_label else 'incorrect'
            print(text, '-->', verdict)

In [None]:
def create_train_test(df=None, random_state=None):
    """Split the review data into training and test portions.

    Args:
        df: DataFrame with a 'review' column (used as features) and a
            'sentiment' column (used as labels). When omitted, the
            notebook-level ``df`` is looked up at call time.
        random_state: Optional seed forwarded to ``train_test_split`` so a
            split can be reproduced. The default ``None`` keeps the
            original unseeded behavior.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        KeyError: If ``df`` is omitted and no global ``df`` exists, or if
            the frame lacks the 'review' or 'sentiment' column.
    """
    if df is None:
        # Resolve the global lazily: a ``df=df`` default would freeze whichever
        # frame existed when this cell ran, so re-loading the data later
        # would silently keep splitting the stale frame.
        df = globals()['df']

    X = df['review']
    y = df['sentiment']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [None]:
# Split once at notebook level; train_and_test() reads these four globals.
X_train, X_test, y_train, y_test = create_train_test()

# Histogram the labels of the training split and of the test split.
y_train.hist()
y_test.hist()

In [None]:
def train_and_test(model):
    """Fit ``model`` and print its test-set metrics and sanity checks.

    The model is fitted on the notebook-level ``X_train``/``y_train`` and
    evaluated on ``X_test``/``y_test``. Accuracy, precision, recall and F1
    are printed (the latter three with label 1 as the positive class),
    followed by a separator line and the output of ``assertions(model)``.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted
            in place.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print('Accuracy:', accuracy_score(y_test, predictions))

    # These three metrics all take the positive label explicitly.
    positive_class_metrics = (
        ('Precision:', precision_score),
        ('Recall:', recall_score),
        ('F1 Score:', f1_score),
    )
    for title, metric in positive_class_metrics:
        print(title, metric(y_test, predictions, pos_label=1))

    print('-' * 60)
    assertions(model)

In [None]:
# Text-classification pipeline: TF-IDF features (at most 3000 features,
# English stop words removed) feeding a logistic-regression classifier
# that uses the liblinear solver.
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=3000, stop_words='english')), 
    ('model', LogisticRegression(solver='liblinear'))
])


# Fit on the training split, then print the test metrics and sanity checks.
train_and_test(pipeline)

In [None]:
# Spot-check the fitted pipeline on two raw sentences (one praising, one
# panning); the pipeline vectorizes the text itself.
pipeline.predict(['this is a great movie', 'worst movie ever'])

In [None]:
import joblib

def save_model(model, file_name='movie_model.pkl'):
    """Serialize ``model`` to disk with joblib.

    Args:
        model: Object to persist.
        file_name: Destination path; defaults to 'movie_model.pkl'.
    """
    joblib.dump(value=model, filename=file_name)

In [None]:
# Persist the pipeline under save_model's default file name, 'movie_model.pkl'.
save_model(pipeline)