# Import Dependencies

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter
import sqlalchemy as db
from sqlalchemy import create_engine
import io
import psycopg2

# Import Data Preparation Dependency for Algorithms
from sklearn.model_selection import train_test_split

# Import Tree Regressor
from sklearn.tree import DecisionTreeRegressor

In [2]:
from config import db_password, username

# Connect To PostgreSQL Database With SQLAlchemy

In [3]:
host = 'movies-fp.cpige012zhtw.us-east-1.rds.amazonaws.com'
port = 5432
passw = db_password
database = "postgres"
port=5432

   
db_string = "postgresql://" + username + ":" + passw + "@" + host + ":" + "5432/" + database
engine = db.create_engine(db_string)
connection = engine.connect()
metadata = db.MetaData()

# Query Database and Select Tables To Import and Create Dataframes

In [4]:
movie_genres = db.Table('movie_genre', metadata, autoload=True, autoload_with=engine)
query = db.select([movie_genres]) 
ResultProxy = connection.execute(query)
ResultSet = ResultProxy.fetchall()
mov_gen_df = pd.DataFrame(ResultSet)
mov_gen_df.columns = ResultSet[0].keys()
mov_gen_df = mov_gen_df.loc[:, ['movie_id', 'mg_id', 'genre_id']]

mov_gen_df.head()

Unnamed: 0,movie_id,mg_id,genre_id
0,1,0,1
1,1,1,3
2,1,2,4
3,1,3,5
4,1,4,9


In [5]:
movies = db.Table('movies', metadata, autoload=True, autoload_with=engine)
query = db.select([movies]) 
ResultProxy = connection.execute(query)
ResultSet = ResultProxy.fetchall()
movies_df = pd.DataFrame(ResultSet)
movies_df.columns = ResultSet[0].keys()
movies_df = movies_df.loc[:, ["movie_id", "name", "year", "runtime"]]

movies_df.head()

Unnamed: 0,movie_id,name,year,runtime
0,1,Toy Story,1995,81
1,2,Jumanji,1995,104
2,3,Grumpier Old Men,1995,101
3,4,Waiting to Exhale,1995,127
4,5,Father of the Bride Part II,1995,106


In [6]:
ratings = db.Table('ratings', metadata, autoload=True, autoload_with=engine)
query = db.select([ratings]) 
ResultProxy = connection.execute(query)
ResultSet = ResultProxy.fetchall()
ratings_df = pd.DataFrame(ResultSet)
ratings_df.columns = ResultSet[0].keys()
ratings_df = ratings_df.loc[:, ['movie_id', 'user_id', 'rating']]

ratings_df.head()

Unnamed: 0,movie_id,user_id,rating
0,2076,47452,4.0
1,2077,47452,2.0
2,2078,47452,3.0
3,2080,47452,3.0
4,2081,47452,4.0


In [7]:
cast = db.Table('movie_cast', metadata, autoload=True, autoload_with=engine)
query = db.select([cast])
ResultProxy = connection.execute(query)
ResultSet = ResultProxy.fetchall()
cast_df = pd.DataFrame(ResultSet)
cast_df.columns = ResultSet[0].keys()
cast_df = cast_df.loc[:, ['movie_id', 'cast_id']]

cast_df.head()

Unnamed: 0,movie_id,cast_id
0,1,31
1,2,2157
2,3,6837
3,4,8851
4,5,67773


In [8]:
connection.close()

In [9]:
def drop_no_genres(column):
    i = -1
    na = float("Nan")
    mov_gen_df["clean_genres"] = ""
    for row in column:
        i += 1
        if row == 20:
            mov_gen_df.at[i, "clean_genres"] = na
        else:
            mov_gen_df.at[i, "clean_genres"] = row
    return mov_gen_df.head()

In [10]:
drop_no_genres(mov_gen_df["genre_id"])

Unnamed: 0,movie_id,mg_id,genre_id,clean_genres
0,1,0,1,1
1,1,1,3,3
2,1,2,4,4
3,1,3,5,5
4,1,4,9,9


In [11]:
mov_gen_df.count()

movie_id        106104
mg_id           106104
genre_id        106104
clean_genres    101838
dtype: int64

In [12]:
mov_gen_df = mov_gen_df.drop_duplicates(keep="first")
mov_gen_df = mov_gen_df.dropna()

In [13]:
mov_gen_df.count()

movie_id        101838
mg_id           101838
genre_id        101838
clean_genres    101838
dtype: int64

In [14]:
mov_gen_df = mov_gen_df.drop(columns=["clean_genres"])
mov_gen_df.head()

Unnamed: 0,movie_id,mg_id,genre_id
0,1,0,1
1,1,1,3
2,1,2,4
3,1,3,5
4,1,4,9


In [15]:
# Set Index to Common Column of movie_id For Easier Join
movies_df = movies_df.set_index("movie_id")
ratings_df = ratings_df.set_index("movie_id")
mov_gen_df = mov_gen_df.set_index("movie_id")
cast_df = cast_df.set_index("movie_id")

In [16]:
data_df = ratings_df.join(movies_df, on="movie_id", how="left", sort=True)
data_df = data_df.join(mov_gen_df, on="movie_id", how="left", sort=True)
data_df = data_df.join(cast_df, on="movie_id", how="left", sort=True)

data_df.head()

Unnamed: 0_level_0,user_id,rating,name,year,runtime,mg_id,genre_id,cast_id
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,47453,4.0,Toy Story,1995.0,81.0,0.0,1.0,31.0
1,47453,4.0,Toy Story,1995.0,81.0,1.0,3.0,31.0
1,47453,4.0,Toy Story,1995.0,81.0,2.0,4.0,31.0
1,47453,4.0,Toy Story,1995.0,81.0,3.0,5.0,31.0
1,47453,4.0,Toy Story,1995.0,81.0,4.0,9.0,31.0


In [17]:
data_df["movie_id"] = data_df.index
data_df.head()

Unnamed: 0_level_0,user_id,rating,name,year,runtime,mg_id,genre_id,cast_id,movie_id
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,47453,4.0,Toy Story,1995.0,81.0,0.0,1.0,31.0,1
1,47453,4.0,Toy Story,1995.0,81.0,1.0,3.0,31.0,1
1,47453,4.0,Toy Story,1995.0,81.0,2.0,4.0,31.0,1
1,47453,4.0,Toy Story,1995.0,81.0,3.0,5.0,31.0,1
1,47453,4.0,Toy Story,1995.0,81.0,4.0,9.0,31.0,1


In [18]:
data_df = data_df.reset_index(drop=True)
data_df.head()

Unnamed: 0,user_id,rating,name,year,runtime,mg_id,genre_id,cast_id,movie_id
0,47453,4.0,Toy Story,1995.0,81.0,0.0,1.0,31.0,1
1,47453,4.0,Toy Story,1995.0,81.0,1.0,3.0,31.0,1
2,47453,4.0,Toy Story,1995.0,81.0,2.0,4.0,31.0,1
3,47453,4.0,Toy Story,1995.0,81.0,3.0,5.0,31.0,1
4,47453,4.0,Toy Story,1995.0,81.0,4.0,9.0,31.0,1


In [19]:
data_df.count()

user_id     53797871
rating      53797871
name        53638017
year        53638017
runtime     53638017
mg_id       49759845
genre_id    49759845
cast_id     53261317
movie_id    53797871
dtype: int64

In [20]:
data_df["runtime"].describe()

count    5.363802e+07
mean     1.152789e+02
std      2.510561e+01
min      0.000000e+00
25%      9.900000e+01
50%      1.120000e+02
75%      1.270000e+02
max      1.256000e+03
Name: runtime, dtype: float64

In [21]:
counts = data_df["movie_id"].value_counts()
counts.describe()

count     53889.000000
mean        998.308950
std        7198.073055
min           1.000000
25%           2.000000
50%           7.000000
75%          60.000000
max      342345.000000
Name: movie_id, dtype: float64

In [22]:
print(data_df["movie_id"].value_counts())

1         342345
589       321290
1198      317525
590       311262
356       291120
           ...  
135466         1
173471         1
135416         1
135414         1
176747         1
Name: movie_id, Length: 53889, dtype: int64


In [23]:
movie_ids = data_df["movie_id"].unique()

movie_counts = pd.DataFrame(data=counts, index=movie_ids)
movie_counts.head()

Unnamed: 0,movie_id
1,342345
2,81429
3,31170
4,8967
5,15474


In [24]:
clean_movie_ids = movie_counts.loc[movie_counts["movie_id"] > 1000]
clean_movie_ids["counts"] = clean_movie_ids["movie_id"]
clean_movie_ids["movie_id"] = clean_movie_ids.index
clean_movie_ids = clean_movie_ids.reset_index(drop=True)
clean_movie_ids

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,movie_id,counts
0,1,342345
1,2,81429
2,3,31170
3,4,8967
4,5,15474
...,...,...
4859,177615,1464
4860,177765,2701
4861,179819,2273
4862,180031,1473


In [25]:
clean_movie_ids.head()

Unnamed: 0,movie_id,counts
0,1,342345
1,2,81429
2,3,31170
3,4,8967
4,5,15474


In [26]:
len(clean_movie_ids["movie_id"])

4864

In [27]:
na = float("Nan")
data_df["new_movie_id"] = ""

def limit_ratings(column, column_2):
    i = -1
    for entry in column:
        i += 1
        if entry in column_2:
            data_df.at[i, "new_movie_id"] = entry
        else:
            data_df.at[i, "new_movie_id"] = na
    
    return data_df.head()
            


In [28]:
limit_ratings(data_df["movie_id"], clean_movie_ids["movie_id"])

Unnamed: 0,user_id,rating,name,year,runtime,mg_id,genre_id,cast_id,movie_id,new_movie_id
0,47453,4.0,Toy Story,1995.0,81.0,0.0,1.0,31.0,1,1
1,47453,4.0,Toy Story,1995.0,81.0,1.0,3.0,31.0,1,1
2,47453,4.0,Toy Story,1995.0,81.0,2.0,4.0,31.0,1,1
3,47453,4.0,Toy Story,1995.0,81.0,3.0,5.0,31.0,1,1
4,47453,4.0,Toy Story,1995.0,81.0,4.0,9.0,31.0,1,1


In [29]:
data_df.count()

user_id         53797871
rating          53797871
name            53638017
year            53638017
runtime         53638017
mg_id           49759845
genre_id        49759845
cast_id         53261317
movie_id        53797871
new_movie_id    38846840
dtype: int64

In [30]:
data_df = data_df.drop_duplicates(keep="first")
data_df = data_df.dropna()

In [31]:
data_df.count()

user_id         38635675
rating          38635675
name            38635675
year            38635675
runtime         38635675
mg_id           38635675
genre_id        38635675
cast_id         38635675
movie_id        38635675
new_movie_id    38635675
dtype: int64

In [32]:
data_df["year"] = data_df["year"].astype(int)
# data_df["mg_id"] = data_df["mg_id"].astype(int)
data_df["genre_id"] = data_df["genre_id"].astype(int)
data_df["cast_id"] = data_df["cast_id"].astype(int)

data_df.head()

Unnamed: 0,user_id,rating,name,year,runtime,mg_id,genre_id,cast_id,movie_id,new_movie_id
0,47453,4.0,Toy Story,1995,81.0,0.0,1,31,1,1
1,47453,4.0,Toy Story,1995,81.0,1.0,3,31,1,1
2,47453,4.0,Toy Story,1995,81.0,2.0,4,31,1,1
3,47453,4.0,Toy Story,1995,81.0,3.0,5,31,1,1
4,47453,4.0,Toy Story,1995,81.0,4.0,9,31,1,1


In [33]:
data_df["trimmed_runtime"] = data_df["runtime"].loc[data_df["runtime"] > 59]

In [34]:
data_df.count()

user_id            38635675
rating             38635675
name               38635675
year               38635675
runtime            38635675
mg_id              38635675
genre_id           38635675
cast_id            38635675
movie_id           38635675
new_movie_id       38635675
trimmed_runtime    38549840
dtype: int64

In [35]:
data_df = data_df.dropna()

In [36]:
data_df.count()

user_id            38549840
rating             38549840
name               38549840
year               38549840
runtime            38549840
mg_id              38549840
genre_id           38549840
cast_id            38549840
movie_id           38549840
new_movie_id       38549840
trimmed_runtime    38549840
dtype: int64

In [37]:
data_df = data_df.drop(columns=["new_movie_id", "trimmed_runtime"])
data_df.head()

Unnamed: 0,user_id,rating,name,year,runtime,mg_id,genre_id,cast_id,movie_id
0,47453,4.0,Toy Story,1995,81.0,0.0,1,31,1
1,47453,4.0,Toy Story,1995,81.0,1.0,3,31,1
2,47453,4.0,Toy Story,1995,81.0,2.0,4,31,1
3,47453,4.0,Toy Story,1995,81.0,3.0,5,31,1
4,47453,4.0,Toy Story,1995,81.0,4.0,9,31,1


In [38]:
# data_df.to_csv("ML_Dataframe.csv", index=False)

# Machine Learning Model Creation and Testing

In [39]:
# Declare X and y Variable Values From Dataframe
X = data_df.drop(columns=["user_id", "rating", "name", "mg_id"])
y = data_df["rating"]

X.head()

Unnamed: 0,year,runtime,genre_id,cast_id,movie_id
0,1995,81.0,1,31,1
1,1995,81.0,3,31,1
2,1995,81.0,4,31,1
3,1995,81.0,5,31,1
4,1995,81.0,9,31,1


In [40]:
# Create Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Decision Tree Regressor

In [41]:
# Create and Fit Model With Features
model = DecisionTreeRegressor(random_state=0)
model.fit(X_train, y_train)

DecisionTreeRegressor(random_state=0)

In [42]:
# Determine Coefficient of Determination (R^2)
model.score(X_test, y_test)

0.1896050371860699

In [43]:
# Predict Rating Values Based on Initial Datatable Features, Populate Dataframe Column
dcsntree_preds = model.predict(X)
data_df["Predicted Rating"] = dcsntree_preds
data_df.head()

Unnamed: 0,user_id,rating,name,year,runtime,mg_id,genre_id,cast_id,movie_id,Predicted Rating
0,47453,4.0,Toy Story,1995,81.0,0.0,1,31,1,3.886184
1,47453,4.0,Toy Story,1995,81.0,1.0,3,31,1,3.884246
2,47453,4.0,Toy Story,1995,81.0,2.0,4,31,1,3.884522
3,47453,4.0,Toy Story,1995,81.0,3.0,5,31,1,3.889121
4,47453,4.0,Toy Story,1995,81.0,4.0,9,31,1,3.887248


# Export to CSV File For Visualization Software Access

In [44]:
data_df.to_csv("full_df_prediction_dataframe.csv", index=False)