# Import Packages

In [1]:
# keep track of run time of the code
import time
start_time = time.time()

In [2]:
# Import Packages
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score

In [3]:
def CountFreq(li):
    """
    function to get frequency
    """
    freq = {}
    for item in li:
        if (item in freq):
            freq[item] += 1
        else:
            freq[item] = 1
    freq.pop(np.NaN, 0)
    return freq

# Import Datasets & Clean Data

In [4]:
# read the csv file containing movie data
df_movie = pd.read_csv("/Users/maelys/tensorflow-test/CS4050/FINAL PROJECT - Movies/data_movie.csv")

In [5]:
genre = list(df_movie['genre_1']) + list(df_movie['genre_2']) + list(df_movie['genre_3']) + list(df_movie['genre_4'])
genre_dict = CountFreq(genre)

genre_df = pd.DataFrame.from_dict(genre_dict, orient= 'index', columns=['count'])
genre_df = genre_df.sort_values(by = "count", ascending=False) 
# genre_df 

# create new list of column names
temp_cols = list(genre_df.index)

# Add multiple columns with NaN using columns param
df_movie = df_movie.reindex(columns = df_movie.columns.tolist() + temp_cols)

# add boolean values under each genre column 
for col in temp_cols:
    condition = (df_movie['genre_1'] == col) | (df_movie['genre_2'] ==  col) | (df_movie['genre_3'] ==  col )|( df_movie['genre_4'] ==  col)
    df_movie[col] = np.where(condition, 1,0)

In [6]:
actors = list(df_movie['main_actor_1']) + list(df_movie['main_actor_2']) + list(df_movie['main_actor_3']) + list(df_movie['main_actor_4'])
actor_dict = CountFreq(actors)

actor_df = pd.DataFrame.from_dict(actor_dict, orient= 'index', columns=['count'])
actor_df = actor_df.sort_values(by = "count", ascending=False)

# get smaller set of actors with only top actors
actor_small = actor_df.head(400)

# create new list of column names
temp_cols = list(actor_small.index)
len(temp_cols)

# Add multiple columns with NaN using columns param
df_movie = df_movie.reindex(columns = df_movie.columns.tolist() + temp_cols)

# add boolean values under each actor column 
for col in temp_cols:
    condition = (df_movie['main_actor_1'] == col) | (df_movie['main_actor_2'] ==  col) | (df_movie['main_actor_3'] ==  col )|( df_movie['main_actor_4'] ==  col)
    df_movie[col] = np.where(condition, 1,0)

In [7]:
# dropped features
df_movie = df_movie.drop(columns =['main_actor_1', 'main_actor_2', 'main_actor_3', 'main_actor_4', "genre_1", "genre_2", "genre_3", "genre_4"])
df_movie = df_movie.drop(columns =['trivia', 'domestic', 'worldwide'])
df_movie = df_movie.drop(columns =['title', 'release', 'mpaa', 'director', 'writer', 'producer', 'composer'])

In [8]:
# read the csv file containing rating data
df_rating = pd.read_csv("/Users/maelys/tensorflow-test/CS4050/FINAL PROJECT - Movies/data_rating.csv")

# turn rating variable into a binary
df_rating['rating_binary'] = np.where(df_rating['rating'] < 4, 0, 1)

In [9]:
# read questions dataset and get list of unique user ids
df_users = pd.read_csv("/Users/maelys/tensorflow-test/CS4050/FINAL PROJECT - Movies/question_movie.csv")
all_users = df_users['user_id'].unique()
df_rating = df_rating.loc[df_rating['user_id'].isin(all_users)]

In [10]:
users = list(df_rating['user_id'])
user_dict = CountFreq(users)

user_df = pd.DataFrame.from_dict(user_dict, orient= 'index', columns=['count'])
user_df = user_df.sort_values(by = "count", ascending=False)

user_df['user_id'] = user_df.index

In [11]:
# create new list of column names
temp_cols = list(user_df.index)
len(temp_cols)

# Add multiple columns with NaN using columns param
user_bools = user_df.reindex(columns = user_df.columns.tolist() + temp_cols)

# add boolean values under each column 
for col in temp_cols:
    condition = (user_bools['user_id'] == col)
    user_bools[col] = np.where(condition, 1,0)

user_bools = user_bools.drop(columns=['count'])

In [12]:
# merge users with ratings & movie
df_rating = pd.merge(df_rating, user_bools)
df = pd.merge(df_rating, df_movie)

In [13]:
# drop unnecessary columns
df_final = df.drop(columns =['movie_id', 'user_id', 'rating'])

# Build Models 

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# split train/test set 
X = df_final.drop(columns =['rating_binary']) 
y = df_final['rating_binary']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8, random_state=0)

In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

In [16]:
# build model
model = LogisticRegression(random_state=0).fit(X_train_scale, y_train)

# make a prediction
pred = model.predict(X_test_scale)

# computer p, r, f score
p = precision_score(y_test, pred, zero_division=1, average=None, labels = [0,1])
r = recall_score(y_test, pred, zero_division=1, average=None, labels = [0,1])
f = f1_score(y_test, pred, zero_division=1, average=None, labels = [0,1])

print(p, r, f)

[0.75321131 0.67674573] [0.84405612 0.54139179] [0.79605031 0.6015488 ]


In [17]:
from sklearn.metrics import accuracy_score
# get accuracy
a = accuracy_score(y_test, pred)
print("test accuracy", a)

test accuracy 0.7301996050595079


In [18]:
pred_train = model.predict(X_train_scale)
a_train = accuracy_score(y_train, pred_train)
print("train accuracy", a_train)

train accuracy 0.7351412655525534


# Predictions for Question Movie Data

In [19]:
df_question = pd.read_csv("/Users/maelys/tensorflow-test/CS4050/FINAL PROJECT - Movies/question_movie.csv")

# merge questions & ratings
df_qr = pd.merge(user_bools, df_question)

# merge question & ratings with movie
df_qrm = pd.merge(df_qr, df_movie)
df_qrm = df_qrm.sort_values(by ="order")

# split X & y 
X_q = df_qrm.drop(columns =['recommend', 'user_id', 'movie_id', 'order']) 
y_q = df_qrm[['order', 'user_id', 'movie_id', 'recommend']] # recommend

In [20]:
# scale X 
X_q_scale = scaler.transform(X_q)

# predict y
y_pred = model.predict(X_q_scale)

# append results to dataframe & save csv file
y_q['recommend'] = y_pred
y_q.recommend.replace((1,0), ('TRUE', 'FALSE'), inplace=True)
y_q.to_csv('CS4050_RESULTS.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_q['recommend'] = y_pred
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_q.recommend.replace((1,0), ('TRUE', 'FALSE'), inplace=True)


In [21]:
# get final run time
end_time = time.time()
run_time = end_time - start_time
print("Total Run Time", run_time, "seconds")

Total Run Time 88.39608907699585 seconds
