In [163]:
import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn import preprocessing
from sklearn import feature_extraction
from sklearn import metrics
from sklearn import pipeline
from sklearn import svm

df = pd.read_csv("movie_metadata.csv")
print(df.shape)


(5043, 28)


In [149]:
# Some preliminary analysis to see which features (only numerical features considered) correspond most strongly to imdb_score.
# Num_voted_users, num_user_for_reviews, duration, and movie_facebook_likes appear to be most important.
corr = df.select_dtypes(include = ['float64', 'int64']).iloc[:, 1:].corr()
corr = corr['imdb_score'].to_dict()
print("Features corresponding to IMDB score (ordered from most important to least):\n")
for feat in sorted(corr.items(), key = lambda x: -abs(float(x[1]))):
    print(feat)



Features corresponding to IMDB score (ordered from most important to least):

('imdb_score', 1.0)
('num_voted_users', 0.41096520027034994)
('num_user_for_reviews', 0.29247543745185667)
('duration', 0.26166152097667805)
('movie_facebook_likes', 0.24704851902724911)
('title_year', -0.20916700456475851)
('gross', 0.19802122755403362)
('director_facebook_likes', 0.17080222836186262)
('cast_total_facebook_likes', 0.085787347548007695)
('actor_2_facebook_likes', 0.083808112171109797)
('actor_1_facebook_likes', 0.076098817087680751)
('facenumber_in_poster', -0.062957524485050834)
('aspect_ratio', 0.059445227874183416)
('actor_3_facebook_likes', 0.052632729576778253)
('budget', 0.030687727112958172)


In [161]:
#Function currently in ModelDB
#Splits dataframe according to the weights provided. If the optional y dataframe is provided, it will also be split accordingly.
def randomSplit(X, weights, seed, y=None):
    result = []
    yresult = []
    np.random.seed(seed)
    df = X
    s = float(sum(weights))
    cweights = [0.0]
    for w in weights:
        cweights.append(cweights[-1] + w / s)
    zipped = zip(cweights,cweights[1:])
    for i in range(0,len(zipped)-1):
        lower_bound, higher_bound = zipped[i]
        #generating the correct mask for the dataframe, based on the weights array
        msk = np.logical_and(np.random.rand(len(df)) <= higher_bound, np.random.rand(len(df)) >= lower_bound)
        #when mask is applied to dataframe, it splits the frame randomly
        result.append(df[msk])
        df = df[~msk]
        if y is not None:
            yresult.append(y[msk])
            y = y[~msk]
        result.append(df)
        yresult.append(y)
    return result, yresult



In [164]:
## Current features included:
## 'content_rating', 'color', 'movie_facebook_likes', 'duration', 'num_critic_for_reviews', 
## 'gross', 'num_user_for_reviews', 'title_year', 'num_voted_users', 'director_facebook_likes'

columnsOfInterest = ['imdb_score','content_rating', 'color', 'duration', 'num_critic_for_reviews', 'gross', 
                     'num_user_for_reviews', 'title_year', 'num_voted_users', 'movie_facebook_likes', 
                     'director_facebook_likes']
df = df[columnsOfInterest]
columnsToEncode = list(df.select_dtypes(include=['category','object']))

# We need to encode the categorial columns (content_rating and color).
label_enc = preprocessing.LabelEncoder()
hot_enc = preprocessing.OneHotEncoder(sparse=False)
for feature in columnsToEncode:    
    df[feature] = label_enc.fit_transform(df[feature])
    hot_encoded = hot_enc.fit_transform(df[feature].reshape(-1, 1))
    hot_encoded_df = pd.DataFrame(hot_encoded)
    # Don't need the original column anymore
    del df[feature]
    # Merge the hot-encoded columns to the original df dataframe
    df = df.merge(hot_encoded_df, left_index=True, right_index=True)

#Drop any dataframe rows that have missing values.
df = df.dropna(how='any')
print(df.head(5))

#We extract the labels for model. 
label = df['imdb_score']
del df['imdb_score']

# Model DB will store randomSplit event.
data, labels = randomSplit(df, [0.7, 0.3], 1000, label)
xtrain, ytrain = data
xtest, ytest = labels

model = linear_model.LinearRegression()

## Will eventually use Model DB for fit / predict
model.fit(xtrain, xtest)
ypred = model.predict(ytrain)
##Model DB for score metrics
RMSE = metrics.mean_squared_error(ytest, ypred)**0.5
predictions = pd.DataFrame([ytest, ypred])

print("RMSE", RMSE)

   imdb_score  duration  num_critic_for_reviews      gross  \
0         7.9       178                     723  760505847   
1         7.1       169                     302  309404152   
2         6.8       148                     602  200074175   
3         8.5       164                     813  448130642   
5         6.6       132                     462   73058679   

   num_user_for_reviews  title_year  num_voted_users  movie_facebook_likes  \
0                  3054        2009           886204                 33000   
1                  1238        2007           471220                     0   
2                   994        2015           275868                 85000   
3                  2701        2012          1144337                164000   
5                   738        2012           212204                 24000   

   director_facebook_likes  0_x ...   12  13  14  15  16  17  18  0_y  1_y  \
0                        0    0 ...    0   0   0   0   0   0   0    0    0   
1 