In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
df = pd.read_csv("cleaned_data.csv")

### Prediction model

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import Ridge

from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn import metrics

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import ColumnTransformer
import numpy as np

In [4]:
df

Unnamed: 0,name,rating,genre,year,score,votes,director,writer,star,country,budget,gross,runtime
0,The Shining,R,Drama,1980,8.4,927000.0,other,Stephen King,other,United Kingdom,19000000.0,4.699877e+07,146.0
1,The Blue Lagoon,R,Adventure,1980,5.8,65000.0,other,other,other,United States,4500000.0,5.885311e+07,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,8.7,1200000.0,other,other,other,United States,18000000.0,5.383751e+08,124.0
3,Airplane!,PG,Comedy,1980,7.7,221000.0,other,other,other,United States,3500000.0,8.345354e+07,88.0
4,Caddyshack,R,Comedy,1980,7.3,108000.0,other,other,other,United States,6000000.0,3.984634e+07,98.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5482,The Call of the Wild,PG,Adventure,2020,6.8,42000.0,other,other,Harrison Ford,Canada,135000000.0,1.111055e+08,100.0
5483,The Eight Hundred,Not Rated,Action,2020,6.8,3700.0,other,other,other,China,80000000.0,4.614216e+08,149.0
5484,Star Trek First Frontier,unrated,Sci-Fi,2020,5.7,165.0,other,other,other,United States,370000.0,7.850054e+07,85.0
5485,Black Wall Street Burning,R,Drama,2020,6.6,35.0,other,other,other,United States,5000.0,7.850054e+07,78.0


In [5]:
X, y = df.drop(['name','rating'], axis = 1), df['rating']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [7]:
num = list(X_train.select_dtypes('number').columns)
cat = list(X_train.select_dtypes('category').columns)

model_scale = MinMaxScaler()
model_encoder = OneHotEncoder()
transformer = ColumnTransformer([('num',model_scale,num),
                                 ('cat',model_encoder,cat)])

#### KNeighborsClassifier

In [8]:
model_class = KNeighborsClassifier()
model_combined = make_pipeline(transformer, model_class)
model_combined_optimal = GridSearchCV(model_combined,
          param_grid = {'kneighborsclassifier__n_neighbors': np.arange(1,51,2)},
                                     n_jobs = -1, cv = 10, return_train_score=True)
model_combined_optimal.fit(X_train,y_train)
model_combined_optimal

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         MinMaxScaler(copy=True,
                                                                                      feature_range=(0,
                                                                                                     1)),
                                                                         ['year',
                                                                          'sc

In [9]:
model_combined_optimal.best_params_

{'kneighborsclassifier__n_neighbors': 25}

In [10]:
model_combined_optimal.score(X_train,y_train)

0.5905673274094326

In [11]:
model_combined_optimal.score(X_test,y_test)

0.5537340619307832

In [12]:
Y_predict = model_combined_optimal.predict(X_train)
print(classification_report(y_train, Y_predict))

              precision    recall  f1-score   support

    Approved       0.00      0.00      0.00         1
           G       0.75      0.03      0.07        86
       NC-17       0.00      0.00      0.00         7
   Not Rated       0.00      0.00      0.00        40
          PG       0.61      0.23      0.33       751
       PG-13       0.56      0.48      0.52      1390
           R       0.60      0.84      0.70      2087
       TV-MA       0.00      0.00      0.00         2
     Unrated       0.00      0.00      0.00        12
           X       0.00      0.00      0.00         1
     unrated       0.00      0.00      0.00        12

    accuracy                           0.59      4389
   macro avg       0.23      0.14      0.15      4389
weighted avg       0.58      0.59      0.55      4389



In [13]:
Y_predict = model_combined_optimal.predict(X_test)
print(classification_report(y_test, Y_predict))

              precision    recall  f1-score   support

           G       0.00      0.00      0.00        25
       NC-17       0.00      0.00      0.00         5
   Not Rated       0.00      0.00      0.00         8
          PG       0.51      0.22      0.30       167
       PG-13       0.48      0.42      0.45       341
           R       0.59      0.79      0.67       541
     Unrated       0.00      0.00      0.00         6
     unrated       0.00      0.00      0.00         5

    accuracy                           0.55      1098
   macro avg       0.20      0.18      0.18      1098
weighted avg       0.52      0.55      0.52      1098



#### Tree

In [14]:
model_tree = DecisionTreeClassifier(random_state=42)
model_combined = make_pipeline(transformer, model_tree)
parameters = {'decisiontreeclassifier__criterion': ('gini','entropy'),
              'decisiontreeclassifier__max_depth': [5, 10, 20, 30, 50, 100,1000,2000], 
              'decisiontreeclassifier__min_samples_leaf': [1,2,4,8,20,40,80,100,200]}
scorer = metrics.make_scorer(balanced_accuracy_score)
model_combined_optimal = GridSearchCV(model_combined,
          param_grid = parameters, n_jobs = -1, cv = 10,  scoring=scorer, return_train_score=True)
model_combined_optimal = model_combined_optimal.fit(X_train, y_train)
best_model_tree = model_combined_optimal.best_estimator_
best_model_tree

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  MinMaxScaler(copy=True,
                                                               feature_range=(0,
                                                                              1)),
                                                  ['year', 'score', 'votes',
                                                   'budget', 'gross',
                                                   'runtime']),
                                                 ('cat',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                   

In [15]:
Y_predict = best_model_tree.predict(X_train)
print(classification_report(y_train, Y_predict))

              precision    recall  f1-score   support

    Approved       0.50      1.00      0.67         1
           G       0.75      0.92      0.82        86
       NC-17       0.62      0.71      0.67         7
   Not Rated       0.61      0.95      0.75        40
          PG       0.82      0.96      0.88       751
       PG-13       0.89      0.92      0.91      1390
           R       0.97      0.88      0.92      2087
       TV-MA       0.00      0.00      0.00         2
     Unrated       1.00      0.33      0.50        12
           X       0.00      0.00      0.00         1
     unrated       0.83      0.42      0.56        12

    accuracy                           0.90      4389
   macro avg       0.64      0.64      0.61      4389
weighted avg       0.91      0.90      0.90      4389



In [16]:
Y_predict = best_model_tree.predict(X_test)
print(classification_report(y_test, Y_predict))

              precision    recall  f1-score   support

           G       0.24      0.32      0.28        25
       NC-17       0.00      0.00      0.00         5
   Not Rated       0.11      0.25      0.15         8
          PG       0.28      0.37      0.32       167
       PG-13       0.41      0.41      0.41       341
           R       0.61      0.54      0.57       541
     Unrated       0.00      0.00      0.00         6
     unrated       0.00      0.00      0.00         5

    accuracy                           0.46      1098
   macro avg       0.21      0.24      0.22      1098
weighted avg       0.48      0.46      0.47      1098



#### Logistic Regression

In [17]:
lr = LogisticRegression(penalty='none', max_iter=5000)
model_combined = make_pipeline(transformer, lr)
grid={"logisticregression__C":np.logspace(-3,3,7)}
scorer = metrics.make_scorer(balanced_accuracy_score)
clf = GridSearchCV(model_combined,
          param_grid = grid, n_jobs = -1, cv = 10,  scoring=scorer, return_train_score=True)
best_lr = clf.fit(X_train, y_train)

In [18]:
Y_predict = best_lr.predict(X_train)
print(classification_report(y_train, Y_predict))

              precision    recall  f1-score   support

    Approved       1.00      1.00      1.00         1
           G       0.54      0.16      0.25        86
       NC-17       0.00      0.00      0.00         7
   Not Rated       0.00      0.00      0.00        40
          PG       0.50      0.14      0.21       751
       PG-13       0.53      0.37      0.43      1390
           R       0.56      0.86      0.68      2087
       TV-MA       1.00      0.50      0.67         2
     Unrated       0.00      0.00      0.00        12
           X       0.00      0.00      0.00         1
     unrated       0.00      0.00      0.00        12

    accuracy                           0.55      4389
   macro avg       0.38      0.28      0.30      4389
weighted avg       0.53      0.55      0.50      4389



In [19]:
Y_predict = best_lr.predict(X_test)
print(classification_report(y_test, Y_predict))

              precision    recall  f1-score   support

           G       0.70      0.28      0.40        25
       NC-17       0.00      0.00      0.00         5
   Not Rated       0.00      0.00      0.00         8
          PG       0.42      0.12      0.19       167
       PG-13       0.56      0.37      0.44       341
           R       0.58      0.87      0.70       541
     Unrated       0.00      0.00      0.00         6
     unrated       0.00      0.00      0.00         5

    accuracy                           0.57      1098
   macro avg       0.28      0.21      0.22      1098
weighted avg       0.54      0.57      0.52      1098



#### AdaBoostClassifier

In [20]:
mod_ada = AdaBoostClassifier(DecisionTreeClassifier())
model_combined = make_pipeline(transformer, mod_ada)
n_estimators = [int(x) for x in np.linspace(start = 1, stop = 30, num = 1)]
parameters = {'adaboostclassifier__n_estimators': n_estimators
             }
model_combined_optimal = GridSearchCV(model_combined,
          param_grid = parameters, n_jobs = -1, cv = 10,  scoring=scorer, return_train_score=True)
model_combined_optimal = model_combined_optimal.fit(X_train, y_train)
best_model_tree = model_combined_optimal.best_estimator_
best_model_tree

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  MinMaxScaler(copy=True,
                                                               feature_range=(0,
                                                                              1)),
                                                  ['year', 'score', 'votes',
                                                   'budget', 'gross',
                                                   'runtime']),
                                                 ('cat',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                   

In [21]:
Y_predict = best_model_tree.predict(X_train)
print(classification_report(y_train, Y_predict))

              precision    recall  f1-score   support

    Approved       1.00      1.00      1.00         1
           G       1.00      1.00      1.00        86
       NC-17       1.00      1.00      1.00         7
   Not Rated       1.00      1.00      1.00        40
          PG       1.00      1.00      1.00       751
       PG-13       1.00      1.00      1.00      1390
           R       1.00      1.00      1.00      2087
       TV-MA       1.00      1.00      1.00         2
     Unrated       1.00      1.00      1.00        12
           X       1.00      1.00      1.00         1
     unrated       1.00      1.00      1.00        12

    accuracy                           1.00      4389
   macro avg       1.00      1.00      1.00      4389
weighted avg       1.00      1.00      1.00      4389



In [22]:
Y_predict = best_model_tree.predict(X_test)
print(classification_report(y_test, Y_predict))

              precision    recall  f1-score   support

           G       0.25      0.28      0.26        25
       NC-17       0.00      0.00      0.00         5
   Not Rated       0.00      0.00      0.00         8
          PG       0.32      0.33      0.32       167
       PG-13       0.46      0.48      0.47       341
           R       0.62      0.59      0.60       541
     Unrated       0.25      0.17      0.20         6
     unrated       0.00      0.00      0.00         5

    accuracy                           0.49      1098
   macro avg       0.24      0.23      0.23      1098
weighted avg       0.50      0.49      0.50      1098



#### Keeping only good features by intuition

In [23]:
X, y = df.drop(['name','rating', 'writer','star', 'director', 'country'], axis = 1), df['rating']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [25]:
num = list(X_train.select_dtypes('number').columns)
cat = list(X_train.select_dtypes('category').columns)

model_scale = MinMaxScaler()
model_encoder = OneHotEncoder()
transformer = ColumnTransformer([('num',model_scale,num),
                                 ('cat',model_encoder,cat)])

lr = LogisticRegression(penalty='none', max_iter=5000)
model_combined = make_pipeline(transformer, lr)
grid={"logisticregression__C":np.logspace(-3,3,7)}
scorer = metrics.make_scorer(balanced_accuracy_score)
clf = GridSearchCV(model_combined,
          param_grid = grid, n_jobs = -1, cv = 10,  scoring=scorer, return_train_score=True)
best_lr = clf.fit(X_train, y_train)

In [26]:
Y_predict = best_lr.predict(X_train)
print(classification_report(y_train, Y_predict))

              precision    recall  f1-score   support

    Approved       1.00      1.00      1.00         1
           G       0.53      0.21      0.30        92
       NC-17       0.00      0.00      0.00         9
   Not Rated       0.00      0.00      0.00        42
          PG       0.48      0.12      0.19       739
       PG-13       0.54      0.37      0.44      1387
           R       0.57      0.87      0.69      2092
       TV-MA       1.00      0.50      0.67         2
     Unrated       0.00      0.00      0.00        12
           X       0.00      0.00      0.00         1
     unrated       0.00      0.00      0.00        12

    accuracy                           0.56      4389
   macro avg       0.37      0.28      0.30      4389
weighted avg       0.53      0.56      0.51      4389



In [27]:
Y_predict = best_lr.predict(X_test)
print(classification_report(y_test, Y_predict))

              precision    recall  f1-score   support

           G       0.45      0.26      0.33        19
       NC-17       0.00      0.00      0.00         3
   Not Rated       0.00      0.00      0.00         6
          PG       0.55      0.12      0.19       179
       PG-13       0.50      0.39      0.44       344
           R       0.56      0.82      0.67       536
     Unrated       0.00      0.00      0.00         6
     unrated       0.00      0.00      0.00         5

    accuracy                           0.55      1098
   macro avg       0.26      0.20      0.20      1098
weighted avg       0.53      0.55      0.50      1098



In [28]:
## 2

In [29]:
mod_ada = AdaBoostClassifier(DecisionTreeClassifier())
model_combined = make_pipeline(transformer, mod_ada)
n_estimators = [int(x) for x in np.linspace(start = 1, stop = 30, num = 1)]
parameters = {'adaboostclassifier__n_estimators': n_estimators
             }
model_combined_optimal = GridSearchCV(model_combined,
          param_grid = parameters, n_jobs = -1, cv = 10,  scoring=scorer, return_train_score=True)
model_combined_optimal = model_combined_optimal.fit(X_train, y_train)
best_model_tree = model_combined_optimal.best_estimator_
best_model_tree

Y_predict = best_model_tree.predict(X_train)
print(classification_report(y_train, Y_predict))

Y_predict = best_model_tree.predict(X_test)
print(classification_report(y_test, Y_predict))

              precision    recall  f1-score   support

    Approved       1.00      1.00      1.00         1
           G       1.00      1.00      1.00        92
       NC-17       1.00      1.00      1.00         9
   Not Rated       1.00      1.00      1.00        42
          PG       1.00      1.00      1.00       739
       PG-13       1.00      1.00      1.00      1387
           R       1.00      1.00      1.00      2092
       TV-MA       1.00      1.00      1.00         2
     Unrated       1.00      1.00      1.00        12
           X       1.00      1.00      1.00         1
     unrated       1.00      1.00      1.00        12

    accuracy                           1.00      4389
   macro avg       1.00      1.00      1.00      4389
weighted avg       1.00      1.00      1.00      4389

              precision    recall  f1-score   support

           G       0.32      0.42      0.36        19
       NC-17       0.00      0.00      0.00         3
   Not Rated       0.00 

#### Regression

In [30]:
X, y = df.drop(['name','gross', 'year', 'score', 'runtime'], axis = 1), df['gross'] # removed low correlated ones

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [32]:
num = list(X_train.select_dtypes('number').columns)
cat = list(X_train.select_dtypes('category').columns)

model_scale = MinMaxScaler()
model_encoder = OneHotEncoder()
transformer = ColumnTransformer([('num',model_scale,num),
                                 ('cat',model_encoder,cat)])

mod_ada_2 = AdaBoostRegressor(ExtraTreeRegressor()) # base_estimator=None by default
model_combined = make_pipeline(transformer, mod_ada_2)

n_estimators = [int(x) for x in np.linspace(start = 1, stop = 30, num = 1)]
parameters = {'adaboostregressor__n_estimators': n_estimators
             }
gridCV= GridSearchCV(mod_ada_2, parameters, cv=10)
model_combined_optimal = GridSearchCV(model_combined,
          param_grid = parameters, n_jobs = -1, cv = 10, return_train_score=True)
model_combined_optimal = model_combined_optimal.fit(X_train, y_train)
best_model_extra = model_combined_optimal.best_estimator_
best_model_extra

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  MinMaxScaler(copy=True,
                                                               feature_range=(0,
                                                                              1)),
                                                  ['votes', 'budget']),
                                                 ('cat',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown=

In [33]:
best_model_extra.score(X_train,y_train)

0.7822950744891555

In [34]:
best_model_extra.score(X_test,y_test)

0.4652897896689281

#### Ridge regression

In [35]:
model_scale = MinMaxScaler()
model_ridge = Ridge()
model_encoder = OneHotEncoder()
num = list(X_train.select_dtypes('number').columns)
cat = list(X_train.select_dtypes('category').columns)
transformer = ColumnTransformer([('num',model_scale,num),
                                 ('cat',model_encoder,cat)])
model_combined = make_pipeline(transformer, model_ridge)
alpha_grid = np.linspace(0.0001, 50, 50)
model_combined_optimal = GridSearchCV(model_combined, 
                             param_grid = {'ridge__alpha': np.linspace(0.0001, 50, 50)})
model_combined_optimal.fit(X_train,y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         MinMaxScaler(copy=True,
                                                                                      feature_range=(0,
                                                                                                     1)),
                                                                         ['votes',
                                                                          

In [36]:
model_combined_optimal.best_params_

{'ridge__alpha': 0.0001}

In [37]:
model_combined_optimal.score(X_train,y_train)

0.6651766630546422

In [38]:
model_combined_optimal.score(X_test,y_test)

0.5945967337938072

#### Ridge regression reduced params

In [4]:
X, y = df.drop(['name','gross', 'year', 'score', 'runtime', 'writer'], axis = 1), df['gross'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [5]:
model_scale = MinMaxScaler()
model_ridge = Ridge()
model_encoder = OneHotEncoder()
num = list(X_train.select_dtypes('number').columns)
cat = list(X_train.select_dtypes('category').columns)
transformer = ColumnTransformer([('num',model_scale,num),
                                 ('cat',model_encoder,cat)])
model_combined = make_pipeline(transformer, model_ridge)
alpha_grid = np.linspace(0.0001, 50, 50)
model_combined_optimal = GridSearchCV(model_combined, 
                             param_grid = {'ridge__alpha': np.linspace(0.0001, 1, 50)})
model_combined_optimal.fit(X_train,y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         MinMaxScaler(copy=True,
                                                                                      feature_range=(0,
                                                                                                     1)),
                                                                         ['votes',
                                                                          

In [6]:
model_combined_optimal.best_params_

{'ridge__alpha': 0.8367510204081633}

In [7]:
model_combined_optimal.score(X_train,y_train)

0.6602228106048932

In [8]:
model_combined_optimal.score(X_test,y_test)

0.6238786684810974

In [11]:
model_combined_optimal.best_estimator_.named_steps

{'columntransformer': ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                   transformer_weights=None,
                   transformers=[('num',
                                  MinMaxScaler(copy=True, feature_range=(0, 1)),
                                  ['votes', 'budget']),
                                 ('cat',
                                  OneHotEncoder(categories='auto', drop=None,
                                                dtype=<class 'numpy.float64'>,
                                                handle_unknown='error',
                                                sparse=True),
                                  [])],
                   verbose=False),
 'ridge': Ridge(alpha=0.8367510204081633, copy_X=True, fit_intercept=True, max_iter=None,
       normalize=False, random_state=None, solver='auto', tol=0.001)}

# Save final results

In [9]:
import pickle

In [12]:
with open('model.pkl','wb') as f:
    pickle.dump(model_combined_optimal.best_estimator_,f)