# IMDB Score Predictor

## Load the Dataframe

In [None]:
import pandas as pd

In [None]:
# Relative path of the cleaned movie dataset.
DATA_PATH = '../data/cleaned/indomovie-data.csv'

# Read the CSV into the DataFrame used by the rest of the notebook.
movies = pd.read_csv(DATA_PATH)

In [None]:
movies.info()

In [None]:
movies.shape

## Drop Unnecessary Columns

In [None]:
# Columns that are removed before modelling.
UNUSED_COLUMNS = ['url', 'title', 'description', 'year', 'imdb_votes', 'metascore', 'gross']

# Drop them from the frame in place.
movies.drop(columns=UNUSED_COLUMNS, inplace=True)

## Remove Movies without IMDB Score

In [None]:
# Keep only movies that have a label: rows whose 'imdb_score' is missing are
# removed in place. The subset is passed as a list because a bare string label
# is only accepted by pandas >= 1.5, while a list works on every version.
movies.dropna(subset=['imdb_score'], inplace=True)

In [None]:
movies.shape

In [None]:
movies.info()

## Fill Null Values

In [None]:
# Placeholder written into each column wherever the value is missing.
# NOTE(review): a missing runtime becomes 0, which the model will treat as a
# real duration - confirm this is intended.
filler = dict(
    director='No director',
    stars='No actors',
    runtime=0,
    genre='No genre',
    rating='No rating',
)

# Apply the per-column placeholders in place.
movies.fillna(filler, inplace=True)

In [None]:
movies.info()

In [None]:
movies.head()

## Convert Runtime Values to Integers

In [None]:
# Cast the runtime column to 64-bit integers; every other column keeps its dtype.
movies = movies.astype({'runtime': 'int64'})

# Show the resulting column dtypes.
movies.dtypes

## Split Columns with Multiple Values

In [None]:
from itertools import product

# Expand every movie into one record per (director, star, genre) combination,
# so that each of those columns holds a single value per record. The other
# columns (runtime, rating, imdb_score) are copied unchanged onto each record.
#
# NOTE(review): one movie becomes several records that share the same label;
# if the records are later split at random into train and test sets, records
# from the same movie can end up on both sides - confirm this is acceptable.
splitted_rows = []

# Iterate over the needed columns in lockstep instead of DataFrame.iterrows(),
# which builds a full Series object for every row.
movie_fields = zip(
    movies['director'],
    movies['stars'],
    movies['genre'],
    movies['runtime'],
    movies['rating'],
    movies['imdb_score'],
)

for director_list, star_list, genre_list, runtime, rating, imdb_score in movie_fields:
    # product() varies the last iterable fastest, which gives the same
    # ordering as nesting director -> star -> genre loops.
    combinations = product(
        director_list.split(', '),
        star_list.split(', '),
        genre_list.split(', '),
    )

    splitted_rows.extend(
        {
            'director': director,
            'stars': star,
            'genre': genre,
            'runtime': runtime,
            'rating': rating,
            'imdb_score': imdb_score,
        }
        for director, star, genre in combinations
    )

In [None]:
# Turn the expanded records into a DataFrame (one column per dict key).
splitted_movies = pd.DataFrame(data=splitted_rows)

# Preview the first five rows.
splitted_movies.head(n=5)

In [None]:
splitted_movies.info()

## Set the Features and the Label

In [None]:
# Separate the features from the label without mutating splitted_movies.
# Popping the column from an alias of splitted_movies would remove it from
# that frame as well, so re-running this cell would fail with a KeyError.
X = splitted_movies.drop(columns=['imdb_score'])
y = splitted_movies['imdb_score']

## Set the Training and Test Dataset

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
# NOTE(review): the split is made per row of X. If several rows originate from
# the same movie they can land in both partitions - confirm this is acceptable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts

## Transform Columns into Feature Columns

In [None]:
import tensorflow as tf

In [None]:
# from tensorflow.keras.layers import Input
# from tensorflow.keras.layers.experimental.preprocessing import StringLookup, CategoryEncoding, Normalization

In [None]:
# 'runtime' is the only numeric feature; every other column of X_train is
# treated as categorical.
NUMERICAL_COLUMNS = ['runtime']
CATEGORICAL_COLUMNS = [
    name for name in X_train.columns if name not in NUMERICAL_COLUMNS
]

In [None]:
# NOTE(review): the TensorFlow docs mark tf.feature_column as not recommended
# for new code - confirm the TensorFlow version this notebook targets.

# One vocabulary-list column per categorical feature; the vocabulary is the set
# of distinct values seen in the training data.
categorical_features = [
    tf.feature_column.categorical_column_with_vocabulary_list(
        name, X_train[name].unique()
    )
    for name in CATEGORICAL_COLUMNS
]

# One int64 numeric column per numerical feature.
numerical_features = [
    tf.feature_column.numeric_column(name, dtype=tf.int64)
    for name in NUMERICAL_COLUMNS
]

# Categorical columns first, then numerical ones.
feature_columns = categorical_features + numerical_features

feature_columns

In [None]:
    # # Create a StringLookup layer
    # input_layer = Input(shape=(), dtype=tf.string)
    # embedding_layer = StringLookup(vocabulary=vocabulary)(input_layer)
    
    # # Create a CategoryEncoding layer
    # encoded_layer = CategoryEncoding(num_tokens=len(vocabulary))(embedding_layer)
    
    # feature_columns.append(encoded_layer)

    # # Handle numerical features using Normalization
    # input_layer = Input(shape=(1,), dtype=tf.int64)  # Change dtype to tf.int64
    # normalization_layer = Normalization()(input_layer)

    # # Append the normalization layer to the feature_columns list
    # feature_columns.append(normalization_layer)

## Make Input Functions

In [None]:
def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32, shuffle_buffer_size=1000):
    """Build an input function that creates a batched ``tf.data.Dataset``.

    Args:
        data_df: Feature table; it is converted with ``dict(data_df)`` into a
            mapping from column name to column values.
        label_df: Labels, sliced together with ``data_df`` along the first axis.
        num_epochs: Number of times the batched dataset is repeated.
        shuffle: Whether to shuffle the examples before batching.
        batch_size: Number of examples per batch.
        shuffle_buffer_size: Size of the shuffle buffer passed to
            ``Dataset.shuffle``; only used when ``shuffle`` is true. The
            default of 1000 matches the previously hard-coded value.

    Returns:
        A zero-argument callable that builds and returns the dataset each time
        it is called.
    """
    def input_function():
        # Pair each row of features (as a dict of columns) with its label.
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))

        if shuffle:
            # Shuffle individual examples before they are grouped into batches.
            ds = ds.shuffle(shuffle_buffer_size)

        # Batch first, then repeat the batched dataset num_epochs times.
        return ds.batch(batch_size).repeat(num_epochs)

    return input_function

In [None]:
# Training input: default settings of make_input_fn.
train_input_fn = make_input_fn(data_df=X_train, label_df=y_train)

# Evaluation input: a single pass over the test data, in its original order.
test_input_fn = make_input_fn(
    data_df=X_test,
    label_df=y_test,
    shuffle=False,
    num_epochs=1,
)

## Create Linear Estimator

In [None]:
# Linear regression estimator over the feature columns built above.
# NOTE(review): the TensorFlow release notes state that the tf.estimator API
# was removed in TensorFlow 2.16 - confirm the pinned TensorFlow version.
linear_est = tf.estimator.LinearRegressor(feature_columns=feature_columns)

## Training the Model

In [None]:
# Fit the estimator on the training input function. No `steps` limit is given,
# so training presumably runs until the input dataset is exhausted - confirm
# against the Estimator.train documentation.
linear_est.train(train_input_fn)

In [None]:
# lin_reg = LinearRegression()

# lin_reg.fit(X_train,y_train)

## Evaluate the model

In [None]:
# Evaluate the trained estimator on the held-out test input; the returned
# mapping of metrics is indexed by name below.
result = linear_est.evaluate(test_input_fn)

In [None]:
# Print the 'average_loss' metric from the evaluation result.
# NOTE(review): for a regressor this is presumably the mean squared error per
# example - confirm against the TensorFlow docs.
print(result['average_loss'])

## Visualize the Predictions by the Model

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Run the estimator over the test input and collect every yielded prediction
# into a list (each element is indexed by 'predictions' further below).
pred_dicts = list(linear_est.predict(test_input_fn))

In [None]:
# Take the first value of each prediction entry.
predicted_scores = [entry['predictions'][0] for entry in pred_dicts]

# Wrap the predictions in a Series for plotting and metric computation.
scores = pd.Series(predicted_scores)

# Histogram of the predicted scores, using 20 bins.
scores.plot.hist(bins=20, title='Predicted IMDB Score')

plt.show()

In [None]:
# import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Mean squared error between the true test labels and the predictions.
# test_input_fn is built with shuffle=False, so `scores` should be in the same
# row order as y_test.
# NOTE(review): the two Series have different indexes; the metric is presumably
# computed positionally - confirm.
mse = mean_squared_error(y_test, scores)

print("Mean Squared Error (MSE):", mse)

In [None]:
# Coefficient of determination (R^2) of the predictions against the true test
# labels; relies on `scores` being in the same row order as y_test.
r2 = r2_score(y_test, scores)

print("R-squared score (Coefficient of Determination):", r2)