In [1]:
from sklearn import linear_model
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

## Read, Clean Data

In [2]:
df = pd.read_csv('../data/tmdb_5000_movies.csv.gz') #https://www.kaggle.com/tmdb/tmdb-movie-metadata
df.dropna(inplace=True)
df.budget = df.budget/1000000
df.revenue = df.revenue/1000000
df.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237.0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787.965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300.0,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961.0,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


## Identify Features, Rescale data and split train/test

In [6]:
predictors = ['budget','popularity','runtime','vote_average','vote_count','id']
m = len(df)
n = len(predictors) +1
X_orig = df[predictors].values
y_orig = df['revenue'].values.reshape(-1,1)
print('X_orig.shape:',X_orig.shape, 'y_orig.shape', y_orig.shape)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_orig)
y_scaled = scaler.fit_transform(y_orig)

#Split the data into train and validation (use random state for ability to reproduce results)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2,random_state=42)

X_orig.shape: (1493, 6) y_orig.shape (1493, 1)


## Define and Train the model, Predict 

In [5]:
# Model to try to predict the Revenue 
reg = linear_model.ElasticNet(alpha=0.5,l1_ratio=0.2)
# alpha: Constants that multiplies the penalty
# l1_ratio = 0 => use only L2 norm(diminishes the size of the params) [=Ridge]
# l1_ratio = 1=> use only L1 norm(tends to drop less useful parameters) [=Lasso]
# choose some value in between to mix the penalties

reg.fit(X_train,y_train)
y_pred = reg.predict(X_test)

## Inspect regression

In [5]:
print('Coefficients:')
for (p,v) in zip(predictors,np.round(reg.coef_,3)):
    print(p,v)
print('Intercept:', np.round(reg.intercept_,3))

# The mean squared error
print("\nMean squared error: %.2f" % np.sqrt(mean_squared_error(y_test, y_pred)))

# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))

Coefficients:
budget 0.283
popularity 0.121
runtime 0.008
vote_average 0.0
vote_count 0.319
id -0.0
Intercept: [0.003]

Mean squared error: 0.56
Variance score: 0.68
