# Creating a connection with SQL

In [1]:
import pymysql
from sqlalchemy import create_engine
import pandas as pd
import getpass
password = getpass.getpass()

········


In [2]:
connection_string = "mysql+pymysql://root:"+password+"@localhost/sakila"
engine = create_engine(connection_string)

# Querying the description and the rating for each movie

In [3]:
query = """
SELECT 
    description, rating
FROM
    film;
"""
df = pd.read_sql_query(query, engine)

In [4]:
df.head()

Unnamed: 0,description,rating
0,A Epic Drama of a Feminist And a Mad Scientist...,PG
1,A Astounding Epistle of a Database Administrat...,G
2,A Astounding Reflection of a Lumberjack And a ...,NC-17
3,A Fanciful Documentary of a Frisbee And a Lumb...,G
4,A Fast-Paced Documentary of a Pastry Chef And ...,G


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  1000 non-null   object
 1   rating       1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB


# Creating a function named 'binary_rating' which will replace the rating values by 'Yes' or 'No'

In [6]:
def binary_rating(x):
    for rating in x:
        if rating in ['G','PG','PG-13']:
            return 'Yes'
        else:
            return 'No'
df['rating'] = df['rating'].apply(binary_rating)
df.head()

Unnamed: 0,description,rating
0,A Epic Drama of a Feminist And a Mad Scientist...,No
1,A Astounding Epistle of a Database Administrat...,Yes
2,A Astounding Reflection of a Lumberjack And a ...,No
3,A Fanciful Documentary of a Frisbee And a Lumb...,Yes
4,A Fast-Paced Documentary of a Pastry Chef And ...,Yes


# Creating a function named 'get_df_corpus' that, given the dataframe,will return a list in which each element will be a movie description

In [7]:
def get_df_corpus(x):
    return x['description'].tolist()

corpus = get_df_corpus(df)

# Train/test split

In [8]:
from sklearn.model_selection import train_test_split

y = df['rating']
X = df['description']

X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.3, random_state=40)

# Dummifying the words in each description removing stop-words

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = CountVectorizer(stop_words = 'english', analyzer='word')
vectorizer.fit(corpus)

X_train_counts = vectorizer.transform(X_train)
X_test_counts  = vectorizer.transform(X_test)

tf_transformer = TfidfTransformer()
tf_transformer.fit(X_train_counts)
X_train_tfidf = tf_transformer.transform(X_train_counts)
X_test_tfidf  = tf_transformer.transform(X_test_counts)

# Logistic regression model

In [10]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(max_iter=400).fit(X_train_tfidf, y_train)

# Predicting ratings

In [11]:
preds_X_train = LR.predict(X_train_tfidf)
preds_X_test = LR.predict(X_test_tfidf)

In [12]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, preds_X_test)

0.8266666666666667

# Using pickle to save the vectorizer, the tf_transformer and the model

In [13]:
import pickle
import os

In [14]:
if(not os.path.exists('transformers')):
    os.makedirs('transformers')
    pickle.dump(vectorizer, open('transformers/vectorizer.pkl', 'wb'))
    pickle.dump(tf_transformer, open('transformers/tf_transformer.pkl', 'wb'))

if(not os.path.exists('models')):
    os.makedirs('models')
    pickle.dump(LR, open('models/logistic_model.pkl', 'wb'))