In [49]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics
from pandas.api.types import is_numeric_dtype
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder

In [122]:
class Data:
    '''Load, clean and encode the movie data used for revenue modelling.

    Label encoders fitted while preparing the training data are kept on the
    instance so that `create_test_df` can encode the test data with the same
    codes the model was trained on.
    '''

    # Columns reduced to a 0/1 flag telling whether a value is present.
    _PRESENCE_COLUMNS = ('homepage', 'belongs_to_collection')
    # Columns holding a Python-literal list as text; reduced to the list length.
    _LIST_COLUMNS = ('genres', 'spoken_languages', 'Keywords',
                     'production_countries', 'production_companies')
    # Columns removed by `clean_data`.
    _DROPPED_COLUMNS = ('imdb_id', 'cast', 'crew', 'status', 'poster_path',
                        'tagline', 'title', 'overview', 'original_title')
    # Categorical columns turned into integer codes by `encode_features`.
    _ENCODED_COLUMNS = ('original_language', 'release_date')

    def __init__(self):
        # column name -> encoder fitted on the training data
        self._encoders = {}

    def get_target_df(self, df, target):
        '''Return the `target` column of `df`.'''
        return df[target]

    def get_non_numeric_columns(self, df):
        '''Return the names of the columns of `df` whose dtype is not numeric.'''
        return [col for col in df.columns if not is_numeric_dtype(df[col])]

    def get_numeric_columns(self, df):
        '''Return the names of the columns of `df` whose dtype is numeric.'''
        return [col for col in df.columns if is_numeric_dtype(df[col])]

    @staticmethod
    def _count_items(value):
        '''Return the length of the literal list stored in `value`, 0 if missing.'''
        return 0 if pd.isnull(value) else len(ast.literal_eval(value))

    def clean_data(self, df, drop_na):
        '''Return a cleaned copy of `df`; the input frame is left untouched.

        - presence columns become 0/1 flags (1 = value present)
        - list-like text columns become the number of items in the list
        - `original_language` becomes a categorical column
        - `release_date` becomes the release year as a categorical column,
          missing years being filled with the mean year
        - missing `runtime` values are filled with the mean runtime
        - free-text / identifier columns are dropped

        `drop_na` is currently unused; it is kept so existing callers keep
        working.

        Raises KeyError when an expected column is missing.
        '''
        # Work on a copy so the caller's frame is not modified as a side effect.
        df = df.copy()

        for col in self._PRESENCE_COLUMNS:
            df[col] = df[col].notna().astype(int)
        for col in self._LIST_COLUMNS:
            df[col] = df[col].map(self._count_items).astype(int)

        df['original_language'] = df['original_language'].astype('category')

        # Assign the filled series back instead of `fillna(inplace=True)` on a
        # column selection: the chained in-place form does not reliably update
        # the frame (it is a no-op under pandas Copy-on-Write).
        release_year = pd.to_datetime(df['release_date']).dt.year
        release_year = release_year.fillna(release_year.mean())
        df['release_date'] = release_year.astype('category')
        df['runtime'] = df['runtime'].fillna(df['runtime'].mean())

        return df.drop(columns=list(self._DROPPED_COLUMNS))

    def encode_features(self, df, fit=True):
        '''Replace the categorical columns of `df` with integer codes.

        `df` is modified in place and also returned.

        fit=True (default): fit a new LabelEncoder per column and remember it
        on the instance. Use this for training data.

        fit=False: reuse the remembered encoders so the codes match the
        training data; labels that were not seen during fitting get the code
        -1. When no encoder has been remembered for a column, a throw-away
        encoder is fitted on `df` instead.
        '''
        for col in self._ENCODED_COLUMNS:
            encoder = self._encoders.get(col)
            if fit or encoder is None:
                encoder = LabelEncoder()
                df[col] = encoder.fit_transform(df[col])
                if fit:
                    self._encoders[col] = encoder
            else:
                # Position of each value in the fitted classes; -1 when absent.
                known_labels = pd.Index(encoder.classes_)
                codes = known_labels.get_indexer(np.asarray(df[col]))
                df[col] = codes.astype('int64')
        return df

    def create_train_df(self, train_feature_file, target, preprocessing):
        '''Load the training file and split it into features and target.

        When `preprocessing` is true the data is cleaned and encoded first.
        Returns `(train_df, target_df)`, where `train_df` has the `id` and
        `target` columns removed.
        '''
        train_feature_df = self.load_data(train_feature_file)
        if preprocessing:
            train_feature_df = self.clean_data(train_feature_df, drop_na=True)
            train_feature_df = self.encode_features(train_feature_df)
        target_df = self.get_target_df(train_feature_df, target)
        train_df = train_feature_df.drop(['id', target], axis=1)
        return train_df, target_df

    def create_test_df(self, test_file):
        '''Load, clean and encode the test file.

        Encoders remembered from `create_train_df` are reused when available.
        Returns `(test_df, test_rowids)`, where `test_df` has the `id` column
        removed and `test_rowids` is that `id` column.
        '''
        test_df = self.load_data(test_file)
        test_df = self.clean_data(test_df, drop_na=False)
        test_df = self.encode_features(test_df, fit=False)
        test_rowids = test_df['id']
        test_df = test_df.drop(['id'], axis=1)
        return test_df, test_rowids

    def load_data(self, file):
        '''Read the CSV `file` into a DataFrame.'''
        return pd.read_csv(file)


In [123]:
# Helper object that loads, cleans and encodes the train/test CSV files.
data = Data()

In [124]:
# Build the preprocessed training features and the revenue target.
train_df, target_df = data.create_train_df(
    train_feature_file='train.csv',
    target='revenue',
    preprocessing=True,
)

In [125]:
# Build the preprocessed test features and keep the row ids for the results file.
test_df, test_rowids = data.create_test_df(test_file='test.csv')

In [126]:
# Hold out 20% of the training rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_df,
    target_df,
    test_size=0.2,
    random_state=42,
)

<h3> Linear Regression </h3>

In [127]:
# Fit the linear-regression baseline on the training split.
reg = LinearRegression().fit(
    X=X_train,
    y=y_train,
)

In [128]:
# Predict revenue for the held-out split with the fitted linear model.
movie_revenue_y_preds = reg.predict(X_test)

In [129]:
# Report the fitted coefficients and the held-out error of the linear model.
# (The former bare `reg.score(X_test, y_test)` call was dropped: its result was
# discarded, and the same R^2 value is printed below via `r2_score`.)
print('Coefficients: \n', reg.coef_)
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, movie_revenue_y_preds))
# R^2 (coefficient of determination) on the held-out split
print('Variance score: %.2f' % r2_score(y_test, movie_revenue_y_preds))

Coefficients: 
 [ 5.50704393e+07  2.41196481e+00 -2.92633837e+06  2.04183588e+07
  2.28899414e+05  2.24814316e+06 -3.93514465e+06 -3.27512208e+06
 -2.29647598e+05  3.04345234e+05 -3.06980608e+06  7.81170271e+05]
Mean squared error: 6006561556819613.00
Variance score: 0.64


<h3> Random Forest </h3>

In [108]:
# Search space for the randomized hyper-parameter search of the random forest.

# Number of trees: 100, 200, ..., 1000
n_estimators = list(range(100, 1001, 100))
# Features considered at each split. 1.0 means "all features", which is what
# the former 'auto' option meant for regressors; 'auto' itself is rejected by
# current scikit-learn releases.
max_features = [1.0, 'sqrt']
# Maximum tree depth: 10, 20, ..., 100, plus None for unlimited depth
max_depth = list(range(10, 101, 10))
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
# parameters for random forest
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# List the scorer names accepted by the `scoring=` argument.
# Relies on `import sklearn.metrics` in the imports cell.
try:
    scorer_names = sklearn.metrics.get_scorer_names()
except AttributeError:
    # older scikit-learn releases only expose the SCORERS dict
    scorer_names = sklearn.metrics.SCORERS.keys()
sorted(scorer_names)

In [109]:
# Seed the forest itself as well: `random_state` on the search only fixes which
# parameter combinations are sampled, not the randomness inside each forest.
rfc = RandomForestRegressor(random_state=42)
# Try 15 random parameter combinations with 3-fold cross-validation, scored by
# negative mean squared error, using all available cores.
rf_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 15, cv = 3, 
                               scoring='neg_mean_squared_error', verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 15 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   17.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   17.1s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_sta...


In [110]:
# Show the parameter combination selected by the randomized search.
rf_random.best_params_

{'n_estimators': 500,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 70,
 'bootstrap': False}

In [111]:
# Predict revenue for the held-out split with the fitted search object.
y_preds = rf_random.predict(X_test)

In [112]:
# Held-out error of the random-forest search result.
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_preds))
# R^2 (coefficient of determination) on the held-out split
print('Variance score: %.2f' % r2_score(y_test, y_preds))

Mean squared error: 4461092132013514.00
Variance score: 0.73


In [119]:
# make predictions on test data (the frame returned by `create_test_df`, without the `id` column)
preds = rf_random.predict(test_df)

In [120]:
# Pair each test row id with its predicted revenue.
d = dict(id=test_rowids, revenue=preds)
results = pd.DataFrame(data=d)
# Write the table to disk with a header row and without the index column.
results.to_csv("results.csv", index=False, header=True)