## Problem 8
#### Train a regression model and predict the ratings of the 3 movies mentioned.

In [81]:
import io
import pickle
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import sklearn.metrics as mcs
from sklearn import linear_model
from tqdm import tqdm as timer

#### Movie to rating dictionary

In [26]:
def clean_string(s):
    """Return ``s`` without bracketed annotations, quotes and outer whitespace.

    Removed from the string:
      * everything from the first "(" to the last ")" (the match is greedy),
      * everything from the first "{" to the last "}",
      * every single and double quote character.
    Leading and trailing whitespace is then trimmed from what is left.
    """
    without_noise = re.sub(r'\(.*\)|\{.*\}|\'|\"', "", s)
    return without_noise.strip()

In [27]:
# Map each cleaned movie title to its rating. The rating is stored exactly as
# read from the file (a string); later cells convert it with float().
M2R = {}
printable = set(string.printable)
unwanted = set("&$ ")  # characters removed from every line before parsing

# latin-1 maps every byte to exactly one character, so decoding cannot fail;
# non-ASCII characters are discarded by the printable filter below anyway.
with io.open('./project_2_data/movie_rating.txt', encoding='latin-1') as infile:
    for line in timer(infile, total=348547, desc='movie_rating.txt'):
        # Version-neutral replacement for str.translate(None, ...) + filter(...),
        # which only work on Python 2 byte strings.
        line = "".join(ch for ch in line.strip()
                       if ch in printable and ch not in unwanted)
        arr = line.split('\t\t')
        if len(arr) < 2:
            # No "title<TAB><TAB>rating" pair on this line (e.g. a blank line).
            continue
        M2R[clean_string(arr[0])] = arr[1]

movie_rating.txt: 100%|██████████| 348547/348547 [00:03<00:00, 97923.85it/s] 


In [85]:
# Number of distinct cleaned titles that have a rating.
len(M2R)

310876

#### Movie to director dictionary

In [52]:
# Map each cleaned movie title to a director. Every line holds a director
# followed by that director's movies, all separated by double tabs.
M2D = {}
printable = set(string.printable)
unwanted = set("&$ ")  # characters removed from every line before parsing

# latin-1 maps every byte to exactly one character, so decoding cannot fail;
# non-ASCII characters are discarded by the printable filter below anyway.
with io.open('./project_2_data/director_movies.txt', encoding='latin-1') as infile:
    # No `total=` here: the value previously passed (348547) was the line count
    # of the ratings file, which is smaller than this file's and broke the bar.
    for line in timer(infile, desc='director_movies.txt'):
        # Version-neutral replacement for str.translate(None, ...) + filter(...),
        # which only work on Python 2 byte strings.
        line = "".join(ch for ch in line.strip()
                       if ch in printable and ch not in unwanted)
        arr = line.split('\t\t')
        director = arr[0]
        for movie in arr[1:]:
            # A title listed under several directors keeps the last one read.
            M2D[clean_string(movie)] = director

director_movies.txt: 389663it [00:07, 50002.31it/s]                            


In [53]:
# Number of distinct cleaned titles that have a director entry.
len(M2D)

621002

#### Load data from previous questions

In [18]:
def _load_pickle(path):
    """Return the object unpickled from ``path``, closing the file afterwards."""
    with open(path, 'rb') as handle:
        # NOTE: unpickling can execute arbitrary code -- only load files that
        # were produced by a trusted source (here: the earlier problems).
        return pickle.load(handle)


# Lookup tables saved by the previous questions. The descriptions below are
# inferred from how the tables are used later in this notebook -- confirm
# against the cells that created them.
A2PR = _load_pickle('A2PR.pkl')  # indexed by actor; values are used as page ranks
A2M = _load_pickle('A2M.pkl')    # presumably actor -> movies; unused in this section
M2A = _load_pickle('M2A.pkl')    # indexed by movie; values are iterated as actors

In [29]:
# Training needs a target value, so only movies that appear both in the cast
# table (M2A) and in the ratings table (M2R) are kept.
movies_with_cast = set(M2A)
available_movies = movies_with_cast.intersection(M2R)

In [31]:
# Number of movies that appear in both M2A and M2R.
len(available_movies)

75234

#### Sort movies by rating to get top 250 movies

In [50]:
# top 250 movies (games) by rating, best first.
# Ratings tie frequently, so the title is used as a secondary sort key; without
# it the movies picked at the cut-off would depend on the iteration order of
# the `available_movies` set and could change from run to run.
sorted_movies = sorted(available_movies, key=lambda k: (-float(M2R[k]), k))[:250]

In [55]:
# Collect the directors of the best-rated movies, walking the ranking from the
# top until 100 distinct directors are found or the ranking is exhausted.
top_directors = set()
for movie in sorted_movies:
    if len(top_directors) >= 100:
        break
    # Direct dict lookup: `movie in M2D.keys()` builds and scans a full key
    # list on Python 2 for every single movie.
    director = M2D.get(movie)
    if director is not None:
        top_directors.add(director)


In [56]:
# Number of directors collected (the loop above stops adding once it reaches 100).
len(top_directors)

100

#### Build the feature matrix for training the linear regression model

In [59]:
def build_feature(movie):
    """Build the 6-element feature vector of one movie.

    Layout of the returned array:
      [0:5]  the five largest page ranks among the movie's actors, in
             descending order; padded with zeros when the movie has fewer
             than five actors so that every vector has the same length,
      [5]    1 if the movie's director is in ``top_directors``, else 0.

    Reads the module-level tables ``M2A``, ``A2PR``, ``M2D`` and
    ``top_directors``.

    Raises:
        KeyError: if ``movie`` is not in ``M2A`` or one of its actors is not
            in ``A2PR``.
    """
    n_rank_features = 5
    best_ranks = sorted((A2PR[actor] for actor in M2A[movie]),
                        reverse=True)[:n_rank_features]
    # Pad explicitly: slice-assigning a shorter list into a fixed-size list
    # would shrink it and produce rows of inconsistent length.
    features = best_ranks + [0] * (n_rank_features - len(best_ranks))

    # Direct dict lookup; a movie without a known director yields None, which
    # is never a member of top_directors.
    features.append(1 if M2D.get(movie) in top_directors else 0)
    return np.array(features)

# One row per movie, in the iteration order of `available_movies`; the target
# vector must be built by iterating the same, unmodified set.
# A list is passed because np.vstack needs a sequence of arrays -- on Python 3
# `map` returns a lazy iterator instead.
feature_vector = np.vstack([build_feature(movie) for movie in timer(available_movies)])

100%|██████████| 75234/75234 [1:16:07<00:00, 16.01it/s]


In [87]:
# Ratings as floats, visited in the iteration order of `available_movies`.
target = [float(M2R[movie]) for movie in timer(available_movies)]
target_vector = np.array(target)

100%|██████████| 75234/75234 [00:00<00:00, 206850.98it/s]


#### Training model

In [66]:
# Fit scikit-learn's LinearRegression on all available movies; fit() returns
# the estimator itself, which is then displayed as the cell output.
model = linear_model.LinearRegression().fit(feature_vector, target_vector)
model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [77]:
# The three movies whose ratings are to be predicted.
test_movies = [
    "Batman v Superman: Dawn of Justice (2016)",
    "Mission: Impossible - Rogue Nation (2015)",
    "Minions (2015)"
]
# Normalise the titles the same way the loader cells do: drop "&", "$" and
# spaces, then apply clean_string. The join is a version-neutral replacement
# for str.translate(None, ...), which exists only on Python 2.
clean_test_movies = [
    clean_string("".join(ch for ch in movie if ch not in "&$ "))
    for movie in test_movies
]

In [78]:
# Feature vectors of the test movies, in the order of `clean_test_movies`.
# Built as a list because `map` is a lazy iterator on Python 3, which
# model.predict cannot consume.
test_feature_vector = [build_feature(movie) for movie in clean_test_movies]

In [79]:
# Predicted ratings, one per entry of test_feature_vector.
model.predict(test_feature_vector)

array([ 6.11802137,  6.04641428,  6.13936857])

#### Goodness of fit of the regression model (evaluated on the training data)

In [80]:
# Predictions for the same rows the model was fitted on (in-sample).
prediction = model.predict(feature_vector)

In [84]:
# Mean squared error between the true ratings and the predictions.
mcs.mean_squared_error(target_vector, prediction)

1.4411438060362047

In [83]:
# Mean absolute error between the true ratings and the predictions.
mcs.mean_absolute_error(target_vector, prediction)

0.94690717321048978