In [1]:
print('starting')

starting


In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
from pycaret.regression import *

In [3]:
def open_col(df, col_name):
    col_list = (df[col_name]
                .apply(lambda x: x if x == x else '[]')
                .apply(eval)
                .explode()
                .apply(lambda x: x if x == x else {})
               )
    col_df = pd.DataFrame(col_list.tolist(), index=col_list.index)
    return df[['movie_id']].join(col_df, how='right')
 
def month_replace(m):
    if m in ['06', '07']:
        return 'summer'
    elif m in ['11', '12']:
        return 'winter'
    else:
        return 'other'
    
def year_replace(y):
    if y <= 1960:
        return '(-inf, 1960]'
    elif y <= 1980:
        return '(1960, 1980]'
    elif y <= 1995:
        return '(1980, 1995]'
    elif y <= 2010:
        return '(1995, 2010]'
    else:
        return '(2010, inf]'

def multiple_values(df, sub_df, col_name, prefix):
    n = (sub_df
         .fillna('Other')
         .groupby('movie_id')[col_name].apply(', '.join)
         .str.get_dummies(', '))
    n.columns = prefix + n.columns
    n = n.reset_index()
    df = df.merge(n, on='movie_id')
    return df
    
def size_value(df, sub_df, prefix):
    n = sub_df.groupby('movie_id').size().reset_index()
    n = n.rename(columns={0: prefix+'_size'})
    return df.merge(n, on='movie_id')

In [4]:
def pre_procces(file_name, train=False):

    # reading data #
    df = pd.read_csv(file_name, sep='\t')
    remove_cols = ['status', 'poster_path', 'backdrop_path', 'video']
    df = df.drop(remove_cols, axis='columns')
    df = df.rename(columns={'id':'movie_id'})

    df = df.fillna({'belongs_to_collection': ''})
    df['belongs_to_collection'] = '[' + df['belongs_to_collection'] + ']'

    subs_cols = ['belongs_to_collection', 'production_companies', 'production_countries', 
                 'genres', 'spoken_languages', 'Keywords', 'cast', 'crew']
    subs_dict = {}
    for n in subs_cols:
        sub_df = open_col(df, n)
        subs_dict[n] = sub_df

    languages = ['en', 'fr', 'es', 'de', 'it', 'ru', 'ja', 'hi', 'zh', 'ar', 'pt', 'ko', 'cn', 'la', 'pl']
    subs_dict['spoken_languages'].loc[~subs_dict['spoken_languages']['iso_639_1'].isin(languages), 'iso_639_1'] = 'Other'

    prod_comp = ['Warner Bros. Pictures', 'Universal Pictures', 'Paramount', 'Columbia Pictures', '20th Century Fox', 
                 'Metro-Goldwyn-Mayer', 'New Line Cinema', 'Canal+', 'Touchstone Pictures', 'Walt Disney Pictures', 
                 'Miramax', 'Sony Pictures', 'United Artists', 'Relativity Media', 'DreamWorks Pictures', 
                 'TriStar Pictures', 'Lionsgate', 'StudioCanal', 'Village Roadshow Pictures', 'Working Title Films',
                 'Amblin Entertainment', 'Regency Enterprises', 'Fox Searchlight Pictures', 'Focus Features', 
                 'Imagine Entertainment', 'BBC Films', 'Dimension Films', 'Film4 Productions', 'Castle Rock Entertainment', 
                 'Screen Gems', 'Hollywood Pictures', 'Dune Entertainment', 'Malpaso Productions', 'New Regency Pictures', 
                 'PolyGram Filmed Entertainment', 'Participant Media', "Centre national du cinéma et de l'image animée", 
                 'Legendary Entertainment', 'Davis Entertainment', 'TF1 Films Production']
    subs_dict['production_companies'].loc[~subs_dict['production_companies']['name'].isin(prod_comp), 'name'] = 'Other'

    prod_cntr = ['United States of America', 'United Kingdom', 'France', 'Germany', 'Canada', 'India', 'Japan', 
                 'Italy', 'Spain', 'Australia', 'China', 'Russia', 'Hong Kong', 'South Korea', 'Belgium', 'Ireland', 
                 'Sweden', 'Denmark', 'Mexico', 'Netherlands']
    subs_dict['production_countries'].loc[~subs_dict['production_countries']['name'].isin(prod_cntr), 'name'] = 'Other'


    # features_engineering #
    df['budget0'] = df['budget']==0

    if train:
        df['ratio'] = df['budget'] / df['revenue']
        df = df[(df['budget0']) | (df['ratio']>0.001)]
        df = df[df['ratio']<100]
        

    df['is_collection'] = df['belongs_to_collection'] != '[]'

    df['month'] = df['release_date'].str[5:7]
    df['month_cat'] = df['month'].apply(month_replace)
    
    df['year'] = df['release_date'].str[:4].astype('int')
    df['year_cat'] = df['year'].apply(year_replace)
    
    
    df = multiple_values(df, subs_dict['genres'], 'name', 'genre_')

    df = multiple_values(df, subs_dict['spoken_languages'], 'iso_639_1', 'lang_')
    
    df = multiple_values(df, subs_dict['production_companies'], 'name', 'prod_comp_')

    df = multiple_values(df, subs_dict['production_countries'], 'name', 'prod_cntr_')
    
    df = size_value(df, subs_dict['cast'], 'cast')
    
    df = size_value(df, subs_dict['crew'], 'crew')
    
    return df


train_df = pre_procces('train.tsv', train=True)

test_df = pre_procces('test.tsv')

### predict positive numbers!!

In [7]:
con_features = ['budget', 'popularity', 'revenue', 'runtime', 'vote_average', 'vote_count', 'cast_size', 'crew_size']
mlt_prefixes = ['genre', 'lang', 'prod_comp', 'prod_cntr']
mlt_features = [c for c in train_df.columns for p in mlt_prefixes if c.startswith(p+'_')]
cat_features = ['is_collection', 'month_cat', 'year_cat', 'budget0']

selected = con_features + mlt_features + cat_features
print(len(selected))

reg1 = setup(data=train_df[selected], test_data=test_df[selected], target='revenue',
             normalize=True, normalize_method='robust')

Unnamed: 0,Description,Value
0,session_id,4703
1,Target,revenue
2,Original Data,"(5142, 110)"
3,Missing Values,True
4,Numeric Features,7
5,Categorical Features,102
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(5142, 115)"


In [20]:
compare_models(sort='rmsle')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,29853493.1735,4362532505302021.0,65166922.8186,0.7869,2.2845,13322.047,2.144
knn,K Neighbors Regressor,32855760.8,5280867214059110.0,71952518.0,0.7383,2.3275,15427.1946,0.179
rf,Random Forest Regressor,30526458.7646,4609771910544521.0,67134777.9166,0.7697,2.3398,13652.8545,2.15
lightgbm,Light Gradient Boosting Machine,29659182.0701,4337189479094829.0,65081388.4009,0.783,2.3782,13721.7957,0.135
huber,Huber Regressor,38393599.8067,9354105635008020.0,95485871.5503,0.5513,2.4345,13907.132,0.076
gbr,Gradient Boosting Regressor,31475974.3709,4426579075019575.0,65938533.2139,0.7781,2.4401,19251.4277,0.489
en,Elastic Net,36402724.2,5775282293807514.0,75289058.0,0.7141,2.6268,20352.4844,0.152
par,Passive Aggressive Regressor,54359511.0239,1.919306836633087e+16,137101264.1485,0.0747,2.7395,24674.5026,0.843
omp,Orthogonal Matching Pursuit,38659847.1013,5419047318084114.0,72996409.2919,0.7296,2.7627,36112.3343,0.018
dt,Decision Tree Regressor,42069092.5109,9290862879886420.0,96061690.3926,0.524,2.79,12071.8935,0.06


ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=100, n_jobs=-1, oob_score=False,
                    random_state=7970, verbose=0, warm_start=False)

In [7]:
knn_model = create_model('knn')
predict_model(knn_model)
knn_model

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,35658132.0,6716497774772224.0,81954240.0,0.7269,2.1026,10094.7188
1,31804398.0,6193621476835328.0,78699568.0,0.6268,2.2116,9921.5674
2,30413930.0,4365365299642368.0,66070912.0,0.7386,2.2982,10205.5117
3,35158296.0,7046967590912000.0,83946216.0,0.7884,2.4681,17005.3809
4,35757536.0,6127517098311680.0,78278456.0,0.7425,2.1465,4033.6975
5,35309392.0,6948911876931584.0,83360136.0,0.7328,2.346,19253.5508
6,33092538.0,4927927900700672.0,70199200.0,0.7275,2.4258,13719.3545
7,32796182.0,3955633204232192.0,62893824.0,0.6581,2.4293,8506.1895
8,30254732.0,3567661829062656.0,59729908.0,0.8058,2.4998,39335.1914
9,28312472.0,2958568089190400.0,54392720.0,0.8355,2.3472,22196.7832


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,K Neighbors Regressor,34738464.0,7475764240842752.0,86462504.0,0.713,2.2633,16128.1406


KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                    weights='uniform')

In [9]:
knn_model = tune_model(knn_model, optimize='rmsle', search_library='optuna', search_algorithm='tpe')
predict_model(knn_model)
knn_model

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,34398202.6191,6393042635645111.0,79956504.6487,0.7401,2.1335,7111.8139
1,29115956.2972,5210718791265230.0,72185308.6941,0.686,2.323,7535.1206
2,29171198.5247,4465592089104439.0,66825085.777,0.7326,2.4243,15683.1332
3,33988114.2178,8878456640038570.0,94225562.5615,0.7335,2.4715,14810.0918
4,35301233.7114,6493382428753125.0,80581526.5973,0.7271,2.1732,5156.5768
5,35253905.9272,7866340925334910.0,88692394.969,0.6976,2.3879,14667.9098
6,30803118.2092,4397151315104151.0,66311019.5601,0.7569,2.4604,10760.053
7,32453191.7508,4845359662621762.0,69608617.7324,0.5812,2.5432,10552.7893
8,27706748.874,3302173431350224.0,57464540.6433,0.8203,2.5751,34537.3583
9,28043151.05,2986120782630053.0,54645409.5293,0.834,2.4104,37090.3737


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,K Neighbors Regressor,34607443.5218,7844712093087451.0,88570379.3211,0.6989,2.332,25165.3


KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=-1, n_neighbors=23, p=2,
                    weights='distance')

In [None]:
# before
# KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
#                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
#                     weights='uniform')
# validation 2.3275
# prediction 2.2633


# random
# baysian
# KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='manhattan',
#                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
#                     weights='distance')
# validation 2.2892
# prediction 2.2388

# tpe
# KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
#                     metric_params=None, n_jobs=-1, n_neighbors=23, p=2,
#                     weights='distance')
# validation 2.3902
# prediction 2.3320

# Extra Tree

In [10]:
et_model = create_model('et', bootstrap=False, ccp_alpha=0.0, criterion='mae',
                    max_depth=10, max_features=0.7565906156347971,
                    max_leaf_nodes=None, max_samples=None,
                    min_impurity_decrease=0.007191975016434702,
                    min_impurity_split=None, min_samples_leaf=2,
                    min_samples_split=10, min_weight_fraction_leaf=0.0,
                    n_estimators=230, n_jobs=-1, oob_score=False,
                    random_state=5331, warm_start=False)
predict_model(et_model)
et_model

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,30891099.0405,5155656491901310.0,71802900.3029,0.7904,1.893,4744.8691
1,26596708.1569,4312554351000089.5,65670041.503,0.7401,2.1159,4903.8184
2,25623940.3283,3243513170001050.5,56951849.5749,0.8058,2.2414,11744.1448
3,32056733.4398,8275197916274229.0,90968114.833,0.7516,2.271,8151.1407
4,30529198.1687,4397365992447097.0,66312638.2558,0.8152,1.946,1573.3507
5,32234666.3815,6808025513898382.0,82510759.9886,0.7382,2.1693,6964.1386
6,28421398.5329,4053241923724946.5,63665076.1699,0.7759,2.2753,14404.4362
7,28946739.1375,3267755894413431.5,57164288.6286,0.7176,2.3724,6055.2321
8,26375222.6975,2972399455134722.0,54519716.2056,0.8382,2.3516,15054.8596
9,25616849.8978,2498995807945272.5,49989957.0708,0.8611,2.163,10034.7012


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,31794559.9569,6765338407915637.0,82251677.2347,0.7403,2.1362,21409.8398


ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                    max_depth=10, max_features=0.7565906156347971,
                    max_leaf_nodes=None, max_samples=None,
                    min_impurity_decrease=0.007191975016434702,
                    min_impurity_split=None, min_samples_leaf=2,
                    min_samples_split=10, min_weight_fraction_leaf=0.0,
                    n_estimators=230, n_jobs=-1, oob_score=False,
                    random_state=5331, verbose=0, warm_start=False)

In [12]:
# et_model = tune_model(et_model, optimize='rmsle', search_library='optuna', search_algorithm='tpe')
# et_model = tune_model(et_model, optimize='rmsle', tuner_verbose=100)
et_model = tune_model(et_model, optimize='rmsle', search_library='tune-sklearn', search_algorithm='bayesian')
predict_model(et_model)
et_model

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,30891099.0405,5155656491901310.0,71802900.3029,0.7904,1.893,4744.8691
1,26596708.1569,4312554351000089.5,65670041.503,0.7401,2.1159,4903.8184
2,25623940.3283,3243513170001050.5,56951849.5749,0.8058,2.2414,11744.1448
3,32056733.4398,8275197916274229.0,90968114.833,0.7516,2.271,8151.1407
4,30529198.1687,4397365992447097.0,66312638.2558,0.8152,1.946,1573.3507
5,32234666.3815,6808025513898382.0,82510759.9886,0.7382,2.1693,6964.1386
6,28421398.5329,4053241923724946.5,63665076.1699,0.7759,2.2753,14404.4362
7,28946739.1375,3267755894413431.5,57164288.6286,0.7176,2.3724,6055.2321
8,26375222.6975,2972399455134722.0,54519716.2056,0.8382,2.3516,15054.8596
9,25616849.8978,2498995807945272.5,49989957.0708,0.8611,2.163,10034.7012


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,31794559.9569,6765338407915637.0,82251677.2347,0.7403,2.1362,21409.8398


ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                    max_depth=10, max_features=0.7565906156347971,
                    max_leaf_nodes=None, max_samples=None,
                    min_impurity_decrease=0.007191975016434702,
                    min_impurity_split=None, min_samples_leaf=2,
                    min_samples_split=10, min_weight_fraction_leaf=0.0,
                    n_estimators=230, n_jobs=-1, oob_score=False,
                    random_state=5331, verbose=0, warm_start=False)

In [None]:
# before
# ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
#                     max_depth=None, max_features='auto', max_leaf_nodes=None,
#                     max_samples=None, min_impurity_decrease=0.0,
#                     min_impurity_split=None, min_samples_leaf=1,
#                     min_samples_split=2, min_weight_fraction_leaf=0.0,
#                     n_estimators=100, n_jobs=-1, oob_score=False,
#                     random_state=8697, verbose=0, warm_start=False)
# validation 2.2826
# prediction 2.2355



# random
# ExtraTreesRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae', 
#                     max_depth=6, max_features=1.0, max_leaf_nodes=None, 
#                     max_samples=None, min_impurity_decrease=0.0005, 
#                     min_impurity_split=None, min_samples_leaf=3, 
#                     min_samples_split=10, min_weight_fraction_leaf=0.0, 
#                     n_estimators=280, n_jobs=-1,
#                     oob_score=False, 
#                     random_state=5331, verbose=0,
#                     warm_start=False)
# validation 2.2946
# prediction 2.2400


# baysian
# ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
#                     max_depth=10, max_features=0.7565906156347971,
#                     max_leaf_nodes=None, max_samples=None,
#                     min_impurity_decrease=0.007191975016434702,
#                     min_impurity_split=None, min_samples_leaf=2,
#                     min_samples_split=10, min_weight_fraction_leaf=0.0,
#                     n_estimators=230, n_jobs=-1, oob_score=False,
#                     random_state=5331, verbose=0, warm_start=False)
# validation 2.1799
# prediction 2.1362


# tpe
# ExtraTreesRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=9,
#                     max_features=0.9666163614098705, max_leaf_nodes=None,
#                     max_samples=None,
#                     min_impurity_decrease=1.5993663974787875e-08,
#                     min_impurity_split=None, min_samples_leaf=1,
#                     min_samples_split=10, min_weight_fraction_leaf=0.0,
#                     n_estimators=97, n_jobs=-1, oob_score=False,
#                     random_state=5331, verbose=0, warm_start=False)
# validation 2.6104
# prediction 2.5576

# Elastic Net

In [13]:
en_model = create_model('en')
predict_model(en_model)
en_model

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,37409844.0,6096615882358784.0,78080832.0,0.7521,2.348,11461.8613
1,33072166.0,5085674533289984.0,71313912.0,0.6935,2.6646,8203.6191
2,32840812.0,4427192427610112.0,66537152.0,0.7349,2.6365,27575.4355
3,39195940.0,9023467517116416.0,94991936.0,0.7291,2.6522,29854.4043
4,38510712.0,6459274867769344.0,80369616.0,0.7285,2.4212,3789.0674
5,39959060.0,8308341503688704.0,91150104.0,0.6806,2.6529,8262.4336
6,34825048.0,4793012777385984.0,69231592.0,0.735,2.6911,30177.5996
7,38753560.0,5313778635767808.0,72895672.0,0.5408,2.7829,12648.9639
8,34988724.0,4443322982596608.0,66658256.0,0.7581,2.7884,50026.5625
9,34471376.0,3802141810491392.0,61661508.0,0.7886,2.6306,21524.8965


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Elastic Net,39172448.0,8686982599278592.0,93203984.0,0.6665,2.5227,17952.5254


ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=5331, selection='cyclic', tol=0.0001, warm_start=False)

In [16]:
# en_model = tune_model(en_model, optimize='rmsle', search_library='optuna', search_algorithm='tpe')
# en_model = tune_model(en_model, optimize='rmsle', search_library='tune-sklearn', search_algorithm='bayesian')
en_model = tune_model(en_model, optimize='rmsle', tuner_verbose=100)
predict_model(en_model)
en_model

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,39384084.0,6959355626782720.0,83422752.0,0.717,2.3406,9239.6592
1,33741500.0,5593993576448000.0,74793008.0,0.6629,2.557,6712.4194
2,33739524.0,4907315278905344.0,70052232.0,0.7062,2.6801,30210.5312
3,40150064.0,1.0262492567568384e+16,101303960.0,0.6919,2.6868,10032.9375
4,38440744.0,6836596334657536.0,82683712.0,0.7127,2.3899,3082.7603
5,41305540.0,9339664083189760.0,96641936.0,0.6409,2.548,5254.6284
6,35872704.0,5241426690441216.0,72397696.0,0.7102,2.6806,27015.0879
7,38718076.0,5244837431345152.0,72421248.0,0.5467,2.7874,13088.4453
8,35569748.0,4868770631778304.0,69776576.0,0.735,2.7463,46210.2617
9,35475304.0,4288723218857984.0,65488344.0,0.7616,2.667,49451.5703


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Elastic Net,40375600.0,9581982916804608.0,97887600.0,0.6322,2.5503,16668.0156


ElasticNet(alpha=2.44, copy_X=True, fit_intercept=False, l1_ratio=0.512,
           max_iter=1000, normalize=True, positive=False, precompute=False,
           random_state=5331, selection='cyclic', tol=0.0001, warm_start=False)

In [13]:
# before
# ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
#            max_iter=1000, normalize=False, positive=False, precompute=False,
#            random_state=8697, selection='cyclic', tol=0.0001, warm_start=False)
# validation 2.6268
# prediction 2.5227


# random
# ElasticNet(alpha=2.44, copy_X=True, fit_intercept=False, l1_ratio=0.512,
#            max_iter=1000, normalize=True, positive=False, precompute=False,
#            random_state=5331, selection='cyclic', tol=0.0001, warm_start=False)
# validation 2.6084
# prediction 2.5503

# baysian
# ElasticNet(alpha=0.7727239844876345, copy_X=True, fit_intercept=False,
#            l1_ratio=0.19723604623903962, max_iter=1000, normalize=True,
#            positive=False, precompute=False, random_state=5331,
#            selection='cyclic', tol=0.0001, warm_start=False)
# validation 2.6137
# prediction 2.5300

# tpe
# ElasticNet(alpha=0.994136515608409, copy_X=True, fit_intercept=False,
#            l1_ratio=0.15777391156322604, max_iter=1000, normalize=False,
#            positive=False, precompute=False, random_state=5331,
#            selection='cyclic', tol=0.0001, warm_start=False)
# validation 2.6144
# prediction 2.5574

# Saving

In [11]:
save_model(et_model, 'model')

Transformation Pipeline and Model Succesfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[], target='revenue',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strategy...
                  ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0,
                                      criterion='mae', max_depth=10,
                                      max_features=0.7565906156347971,
                                      max_leaf_nodes=None, max_samples=None,
                          

In [50]:
prediction_df = pre_procces('test.tsv')

model = load_model('model')

prediction_df = predict_model(model, prediction_df)


Transformation Pipeline and Model Successfully Loaded


In [51]:
(prediction_df['Label']<0).sum()

317

In [52]:
prediction_df.loc[prediction_df['Label']<0, 'Label'] = 0

In [53]:
(prediction_df[['movie_id', 'Label']]['Label']<0).sum()

0