In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold


In [2]:
#Load in the data
X = pd.read_csv("../Data/trainX.csv")
y = pd.read_csv("../Data/trainY.csv")

In [3]:
# create a 75/25 stratified split of the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7, test_size=0.25)

In [4]:
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (2249, 41) (2249, 1)
Test set: (750, 41) (750, 1)


In [5]:
# This is the started point you need and can input into models from here, 
# let me know if you have any questions or need any explanation/help!

## Random Forest

In [6]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 500, random_state = 42)
# Train the model on training data
rf.fit(X_train, y_train.values.ravel());

In [7]:
# Use the forest's predict method on the test data
predictions = rf.predict(X_test)
# Calculate the absolute errors
errors = abs(predictions - y_test.values)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))

Mean Absolute Error: 2.84


In [8]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test.values)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 73.18 %.


In [11]:
# Get numerical feature importances
importances = list(rf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(X_train, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: log_budget_processed Importance: 0.4
Variable: log_num_male_crew    Importance: 0.07
Variable: log_year             Importance: 0.06
Variable: log_overview_len     Importance: 0.05
Variable: log_runtime_processed Importance: 0.05
Variable: log_num_cast         Importance: 0.05
Variable: log_week_of_year     Importance: 0.04
Variable: log_genre_rank       Importance: 0.03
Variable: log_title_len        Importance: 0.03
Variable: log_num_crew         Importance: 0.03
Variable: topStudio            Importance: 0.02
Variable: log_num_studios      Importance: 0.02
Variable: log_day_of_week      Importance: 0.02
Variable: has_tagline          Importance: 0.01
Variable: has_keywords         Importance: 0.01
Variable: log_num_genres       Importance: 0.01
Variable: log_numTopStudios    Importance: 0.01
Variable: log_num_production_countries Importance: 0.01
Variable: log_month            Importance: 0.01
Variable: log_season           Importance: 0.01
Variable: log_num_languages    I

## Extra Trees

In [19]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from numpy import mean
from numpy import std

In [20]:
model = ExtraTreesRegressor()
# evaluate the model
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

MAE: -1.492 (0.112)


In [24]:
model.fit(X_train, y_train.values.ravel());

In [25]:
# Use the forest's predict method on the test data
predictions = model.predict(X_test)
# Calculate the absolute errors
errors = abs(predictions - y_test.values)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))

Mean Absolute Error: 2.8


In [26]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test.values)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 73.42 %.
