In [1]:
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
import lightgbm as lgb
import gc
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
import warnings
warnings.filterwarnings('ignore')
pd.set_option('max_columns', None)
pd.set_option('max_rows', None)

In [2]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

data = pd.concat([train, test], axis=0, ignore_index=True)

In [3]:
drop_cols = ['belongs_to_collection', 'homepage', 'imdb_id', 'original_title', 'overview',
             'poster_path', 'tagline', 'title']

data.drop(drop_cols, axis=1, inplace=True)

In [4]:
list_cols = ['genres', 'production_companies', 'production_countries', 'spoken_languages', 'Keywords', 'cast', 'crew']
cat_cols = ['original_language']
num_cols = ['budget', 'popularity', 'runtime']

In [5]:
for i in list_cols:
    data[i + '_length'] = data[i].apply(lambda x: 0 if pd.isnull(x) else len(eval(x)))

In [6]:
data['genres_0'] = data['genres'].apply(lambda x: np.nan if pd.isnull(x) else eval(x)[0]['name'])
cat_cols.append('genres_0')

In [7]:
data.drop(list_cols, axis=1, inplace=True)
data.columns

Index(['id', 'budget', 'original_language', 'popularity', 'release_date',
       'runtime', 'status', 'revenue', 'genres_length',
       'production_companies_length', 'production_countries_length',
       'spoken_languages_length', 'Keywords_length', 'cast_length',
       'crew_length', 'genres_0'],
      dtype='object')

In [8]:
data['release_year'] = data['release_date'].apply(lambda x: '19' + x.split('/')[2] if int(x.split('/')[2]) > 20 else '20' + x.split('/')[2])
data['release_month'] = data['release_date'].apply(lambda x: x.split('/')[0])
data['release_day'] = data['release_date'].apply(lambda x: x.split('/')[1])

data['release_date'] = pd.to_datetime(data['release_year'] + '-' + data['release_month'] + '-' + data['release_day'])
data.drop(['release_year', 'release_month', 'release_day'], axis=1, inplace=True)

In [9]:
data['release_month'] = data['release_date'].dt.month

In [10]:
def count_encode(df, cat_cols):
    for col in cat_cols:
        print(col)
        vc = df[col].value_counts(dropna=True, normalize=True)
        df[col + '_count'] = df[col].map(vc).astype('float32')
    return df

data = count_encode(data, cat_cols)

original_language
genres_0


In [11]:
def cat_num_stats(df, cat_cols, num_cols):
    for f1 in tqdm(cat_cols):
        g = df.groupby(f1, as_index=False)
        for f2 in tqdm(num_cols):
            tmp = g[f2].agg({
                '{}_{}_max'.format(f1, f2): 'max',
                '{}_{}_min'.format(f1, f2): 'min',
                '{}_{}_median'.format(f1, f2): 'median',
                '{}_{}_mean'.format(f1, f2): 'mean',
                '{}_{}_sum'.format(f1, f2): 'sum',
                '{}_{}_skew'.format(f1, f2): 'skew',
                '{}_{}_std'.format(f1, f2): 'std'
            })
            df = df.merge(tmp, on=f1, how='left')
            del tmp
            gc.collect()
    return df

data = cat_num_stats(data, cat_cols, num_cols)

  0%|                                                                                            | 0/2 [00:00<?, ?it/s]
  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 33%|████████████████████████████                                                        | 1/3 [00:00<00:00,  9.91it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 10.88it/s][A
 50%|██████████████████████████████████████████                                          | 1/2 [00:00<00:00,  3.51it/s]
  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 13.18it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  3.81it/s]


In [12]:
def label_encode(df, cat_cols, verbose=True):
    for col in cat_cols:
        df[col], _ = df[col].factorize(sort=True)
        if df[col].max() > 32000:
            df[col] = df[col].astype('int32')
        else:
            df[col] = df[col].astype('int16')
        if verbose:
            print(col)
    return df

data = label_encode(data, cat_cols + ['status'])

original_language
genres_0
status


In [13]:
data.columns

Index(['id', 'budget', 'original_language', 'popularity', 'release_date',
       'runtime', 'status', 'revenue', 'genres_length',
       'production_companies_length', 'production_countries_length',
       'spoken_languages_length', 'Keywords_length', 'cast_length',
       'crew_length', 'genres_0', 'release_month', 'original_language_count',
       'genres_0_count', 'original_language_budget_max',
       'original_language_budget_min', 'original_language_budget_median',
       'original_language_budget_mean', 'original_language_budget_sum',
       'original_language_budget_skew', 'original_language_budget_std',
       'original_language_popularity_max', 'original_language_popularity_min',
       'original_language_popularity_median',
       'original_language_popularity_mean', 'original_language_popularity_sum',
       'original_language_popularity_skew', 'original_language_popularity_std',
       'original_language_runtime_max', 'original_language_runtime_min',
       'original_langu

In [14]:
mean_revenue = int(data['revenue'].mean())
mean_revenue

65868516

In [15]:
train = data[data['revenue'].notnull()]
test = data[data['revenue'].isnull()]
train.shape, test.shape

((2400, 61), (600, 61))

In [16]:
mean_revenue = train['revenue'].mean()

used_cols = [i for i in train.columns if i not in ['id', 'release_date', 'revenue']]
y = train['revenue'] - mean_revenue
train = train[used_cols]
test = test[used_cols]

X_train, X_valid, y_train, y_valid = train_test_split(train, y, random_state=2020)

dtrain = lgb.Dataset(X_train, y_train)
dvalid = lgb.Dataset(X_valid, y_valid)

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'None',
    'learning_rate': 0.05,
    'seed': 2020
}

def rmsle(y_hat, data):
    y_true = data.get_label()
    y_hat = np.where(y_hat < 0, 1, y_hat)
    y_true = np.where(y_true < 0, 1, y_hat)
    res = np.sqrt(mean_squared_log_error(y_true, y_hat))
    return 'rmsle', res, True

model = lgb.train(
    params,
    dtrain,
    valid_sets=[dtrain, dvalid],
    num_boost_round=1000000,
    early_stopping_rounds=100,
    verbose_eval=50,
    feval=rmsle
)

pred = model.predict(test).astype(int)

Training until validation scores don't improve for 100 rounds
[50]	training's rmsle: 3.75542	valid_1's rmsle: 4.63204
[100]	training's rmsle: 3.38572	valid_1's rmsle: 4.52044
[150]	training's rmsle: 2.92482	valid_1's rmsle: 4.54527
Early stopping, best iteration is:
[67]	training's rmsle: 3.62901	valid_1's rmsle: 4.69979


In [17]:
sub = pd.DataFrame()
sub['ID'] = np.arange(0, 600)
sub['revenue'] = pred + mean_revenue
sub['revenue'] = sub['revenue'].astype(int)
sub.to_csv('../sub/baseline.csv', index=False, header=False)

In [18]:
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   ID       600 non-null    int32
 1   revenue  600 non-null    int32
dtypes: int32(2)
memory usage: 4.8 KB


In [19]:
sub.head()

Unnamed: 0,ID,revenue
0,0,27639467
1,1,28253202
2,2,59196072
3,3,46207228
4,4,9925060


In [20]:
sub.describe()

Unnamed: 0,ID,revenue
count,600.0,600.0
mean,299.5,63996640.0
std,173.349358,97939440.0
min,0.0,2648269.0
25%,149.75,13752660.0
50%,299.5,27218960.0
75%,449.25,69160810.0
max,599.0,760519700.0
