# Big Data Analytics

## add names here:




### The aim of this project is to use the IMDB Dataset: (Link) and use data such as actors, director, genre, isAdult and runtime to predict how well a movie will perform. We use the boxoffice and userrating as measures of performance. We will train our model with 80% of the data and test it on the remaining 20%. We will also give our predicitons for upcoming movies that do not have any ratings/boxoffice yet.

### Our hypothesis is that actors & director will positively influence the performance while the variable runtime will negatively influence it. 



In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.linear_model import Ridge,LinearRegression,Lasso
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
%matplotlib inline



# Load Data and Preprocessing

## title basics 

In [7]:
df = pd.read_csv("data/title.basics.tsv",sep="\t")
# Select all non tv Series
df = df[(df['endYear']=='\\N')]
df.drop(["endYear"],axis=1,inplace=True)
df = df[(df['titleType']=="movie")]
# replace all \N with NAN and subsequently drop them
df.replace("\\N",np.nan,inplace=True)
df.dropna(inplace=True)
# Convert data types to int where feasible
df['startYear'] = df['startYear'].astype(int)
df['isAdult'] = df['isAdult'].astype(int)
df['runtimeMinutes'] = df['runtimeMinutes'].astype(int)

df = df[(df['startYear']>=2000) & (df['startYear']<=2020)]

df.reset_index(inplace=True,drop=True)
df.to_csv("data/title.basics_preprocessed.csv",index=False)
df

FileNotFoundError: [Errno 2] No such file or directory: 'data/title.basics.tsv'

## name basics 

In [8]:
df = pd.read_csv("data/name.basics.tsv",sep="\t")
# Drop unnecessary columns
df.drop(["birthYear", "deathYear","primaryProfession","knownForTitles"],axis=1,inplace=True)
# replace all \N with NAN and subsequently drop them
df.replace("\\N",np.nan,inplace=True)
df.dropna(inplace=True)

df.reset_index(inplace=True,drop=True)
df.to_csv("data/name.basics_preprocessed.csv",index=False)
df

FileNotFoundError: [Errno 2] No such file or directory: 'data/name.basics.tsv'

## title principals

In [9]:
df = pd.read_csv("data/title.principals.tsv",sep="\t")
# Drop unnecessary columns
df.drop(["ordering", "category","job","characters"],axis=1,inplace=True)
# replace all \N with NAN and subsequently drop them
df.replace("\\N",np.nan,inplace=True)
df.dropna(inplace=True)

df.reset_index(inplace=True,drop=True)
df.to_csv("data/title.principals_preprocessed.csv",index=False)
df

FileNotFoundError: [Errno 2] No such file or directory: 'data/title.principals.tsv'

## title ratings

In [10]:
df = pd.read_csv("data/title.ratings.tsv", sep ="\t")
# Drop unnecessary columns
df.drop(["numVotes"], axis = 1, inplace = True)
df.dropna(inplace = True)

df.reset_index(inplace=True, drop=True)
df.to_csv("data/title.ratings.csv", index=False)
df

Unnamed: 0,tconst,averageRating
0,tt0000001,5.6
1,tt0000002,6.1
2,tt0000003,6.5
3,tt0000004,6.2
4,tt0000005,6.1
...,...,...
993148,tt9916576,5.9
993149,tt9916578,9.1
993150,tt9916720,5.1
993151,tt9916766,6.7


## Box Office Data

In [None]:
df = pd.read_csv("data/boxOffice.csv")
df.dropna(how="all",subset=["domestic","international","worldwide"],inplace=True)
df.reset_index(inplace=True,drop=True)
df.to_csv("data/boxOffice_preprocessed.csv",index=False)

In [None]:




df = pd.read_csv("filename")


# extract train and test variables
data = df.to_numpy()
X = df.loc[:, df.columns != ''].to_numpy()   # enter column name of test variable
y = np.log(df['']).to_numpy()                # enter column name of test variable

print(X.shape)
print(y.shape)

In [36]:
# title.crew preprocessing
import os

print(os.listdir('.\data'))
df = pd.read_csv('.\data\data.tsv', sep='\t', header=0)
df.drop(['writers'], axis = 1, inplace=True)
print(df['directors'].isnull().values.any())
df

['data.tsv', 'README.md', 'title.crew.tsv']
False


Unnamed: 0,tconst,directors
0,tt0000001,nm0005690
1,tt0000002,nm0721526
2,tt0000003,nm0721526
3,tt0000004,nm0721526
4,tt0000005,nm0005690
...,...,...
7869181,tt9916848,"nm5519454,nm5519375"
7869182,tt9916850,"nm5519375,nm5519454"
7869183,tt9916852,"nm5519454,nm5519375"
7869184,tt9916856,nm10538645


# Data Preprocessing

## Steps performed

### dropping columns, imputing missing values, categorical datapoints


# Kfold to decrease potential overfitting

In [None]:
from sklearn.model_selection import KFold

def kfold_validation(X,y,model):
    start = time.time()
    kf = KFold(n_splits=5,shuffle=True, random_state=69420) 
    mses = []
    models = []
    count = 0
    for trainIndices,testIndices in kf.split(X):
        print(f"iteration: {count}")
        Xtrain,Xval = X[trainIndices,:],X[testIndices,:]
        ytrain,yval = y[trainIndices],y[testIndices]
        model.fit(Xtrain,ytrain)
        yhat = model.predict(Xval)
        mse = np.sum(np.square(yval-yhat))/yval.size # mean squared error
        mses.append(mse)
        models.append(model)
        count+=1
    print(f"time used (seconds): {time.time()-start}")
    return mses, models


# Standardise to make data more comparable

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

Xtrain,Xtest, ytrain,ytest = train_test_split(X,y,shuffle=True,train_size=0.8)

def standardise_data(X):
    mean = np.mean(X)
    std = np.std(X)
    X_std = (X-mean)/std
    return X_std, mean, std

#kf = KFold(n_splits=1,shuffle=True, random_state=69420) 

Xtrain_std, X_train_mean, X_train_std_div = standardise_data(Xtrain)
Xtest_std = (Xtest-X_train_mean)/X_train_std_div


# Linear Regression

In [None]:
model = LinearRegression()
mses,models = kfold_validation(Xtrain,ytrain,model)
yhat=models[np.argmin(mses)].predict(Xtest)
np.mean(np.square(yhat-ytest))

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor 

model = RandomForestRegressor()
mses,models = kfold_validation(Xtrain,ytrain,model)
yhat=models[np.argmin(mses)].predict(Xtest)
np.mean(np.square(yhat-ytest))


# Data Visualisation