In [1]:
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import lightgbm as lgb
import gc
from datetime import datetime
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
pd.set_option('max_columns', None)
pd.set_option('max_rows', None)

In [2]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
data = pd.concat([train, test], axis=0, ignore_index=True)

## drop特征

In [3]:
drop_cols = ['poster_path', 'imdb_id']
data.drop(drop_cols, axis=1, inplace=True)

## 时间特征

In [4]:
data['release_year'] = data['release_date'].apply(lambda x: '19' + x.split('/')[2] if int(x.split('/')[2]) > 20 else '20' + x.split('/')[2]).astype(int)
data['release_month'] = data['release_date'].apply(lambda x: x.split('/')[0]).astype(int)
data['release_day'] = data['release_date'].apply(lambda x: x.split('/')[1]).astype(int)

data['release_date'] = pd.to_datetime(data['release_year'].astype(str) + '-' + data['release_month'].astype(str) + '-' + data['release_day'].astype(str))

data['release_date_weekday'] = data['release_date'].apply(lambda x: x.weekday())
data['release_date_TONOW'] = (datetime.now() - data['release_date']).dt.days

data.drop(['release_day', 'release_date'], axis=1, inplace=True)

## 判断是否为空

In [5]:
isnull_cols = ['homepage', 'tagline', 'belongs_to_collection']
for i in isnull_cols:
    data[i + 'isnull'] = np.where(pd.isnull(data[i]), 1, 0)
data.drop(isnull_cols, axis=1, inplace=True)

## 数值特征

In [6]:
num_cols = ['runtime', 'popularity', 'budget']

## 类别特征

In [7]:
data['title'][:5]

0                            Wrath of the Titans
1                                      From Hell
2                   Guess Who's Coming to Dinner
3    Talladega Nights: The Ballad of Ricky Bobby
4                                         Xanadu
Name: title, dtype: object

In [8]:
data['title'].isnull().sum(), data['original_title'].isnull().sum()

(0, 0)

In [9]:
cat_cols = ['original_language', 'status', 'title', 'original_title']

data['title=original_title'] = (data['title'] == data['original_title']).astype(int)

for i in tqdm(['original_language', 'status']):
    le = LabelEncoder()
    data[i] = le.fit_transform(data[i])

data['original_language_count'] = data['original_language'].map(data['original_language'].value_counts())
data['title_count'] = data['title'].map(data['title'].value_counts())

data.drop(['title', 'original_title'], axis=1, inplace=True)

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 676.94it/s]


## 嵌套特征

In [10]:
nested_cols = ['genres', 'production_companies', 'production_countries',
               'Keywords', 'spoken_languages', 'cast', 'crew']
for i in nested_cols:
    data[i + '_length'] = data[i].apply(lambda x: 0 if pd.isnull(x) else len(eval(x)))
#     print(i + '_length')
#     print(data[i + '_length'].unique())

data['genres_0'] = data['genres'].apply(lambda x: np.nan if pd.isnull(x) else eval(x)[0]['name'])

le = LabelEncoder()
data['genres_0'] = le.fit_transform(data['genres_0'].astype(str))
data['genres_0_count'] = data['genres_0'].map(data['genres_0'].value_counts())

data.drop(nested_cols, axis=1, inplace=True)

## 其他

In [11]:
data.drop('overview', axis=1, inplace=True)

In [12]:
train = data[data['revenue'].notnull()]
test = data[data['revenue'].isnull()]

y_mean = train['revenue'].mean()

used_cols = [i for i in train.columns if i not in ['id', 'release_date', 'revenue']]
y = train['revenue'] - y_mean
train = train[used_cols]
test = test[used_cols]

def rmsle(y_hat, data):
    y_true = data.get_label()
    y_hat = np.where(y_hat < 0, 1, y_hat)
    y_true = np.where(y_true < 0, 1, y_hat)
    res = np.sqrt(mean_squared_log_error(y_true, y_hat))
    return 'rmsle', res, True


prediction = pd.DataFrame()
skf = KFold(n_splits=5, random_state=2020, shuffle=True)
for fold_id, (trn_idx, val_idx) in enumerate(skf.split(train, y)):
    print('\nFold_{} Training ==============\n'.format(fold_id + 1))
    X_train = train.iloc[trn_idx]
    Y_train = y.iloc[trn_idx]
    X_val = train.iloc[val_idx]
    Y_val = y.iloc[val_idx]

    dtrain = lgb.Dataset(X_train, Y_train)
    dvalid = lgb.Dataset(X_val, Y_val, reference=dtrain)

    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'None',
        'learning_rate': 0.05,
        'seed': fold_id
    }

    model = lgb.train(
        params,
        dtrain,
        valid_sets=[dtrain, dvalid],
        num_boost_round=1000000,
        early_stopping_rounds=100,
        verbose_eval=50,
        feval=rmsle
    )
    
    pred = model.predict(test)
    prediction['label_{}'.format(fold_id)] = pred

prediction['mean'] = prediction.mean(1)

sub = pd.DataFrame()
sub['ID'] = np.arange(0, 600)
sub['revenue'] = prediction['mean'] + y_mean
sub['revenue'] = sub['revenue']



Training until validation scores don't improve for 100 rounds
[50]	training's rmsle: 3.84335	valid_1's rmsle: 4.23996
[100]	training's rmsle: 3.36703	valid_1's rmsle: 4.19428
Early stopping, best iteration is:
[38]	training's rmsle: 3.95159	valid_1's rmsle: 4.32449


Training until validation scores don't improve for 100 rounds
[50]	training's rmsle: 3.52135	valid_1's rmsle: 5.00586
[100]	training's rmsle: 2.98647	valid_1's rmsle: 5.09854
[150]	training's rmsle: 2.60171	valid_1's rmsle: 5.08954
[200]	training's rmsle: 2.28489	valid_1's rmsle: 4.98734
Early stopping, best iteration is:
[113]	training's rmsle: 2.87366	valid_1's rmsle: 5.11338


Training until validation scores don't improve for 100 rounds
[50]	training's rmsle: 3.70114	valid_1's rmsle: 4.86687
[100]	training's rmsle: 3.07967	valid_1's rmsle: 4.9222
Early stopping, best iteration is:
[1]	training's rmsle: 4.94402	valid_1's rmsle: 5.35134


Training until validation scores don't improve for 100 rounds
[50]	training's rms

In [13]:
sub.describe()

Unnamed: 0,ID,revenue
count,600.0,600.0
mean,299.5,64704950.0
std,173.349358,82245190.0
min,0.0,15766160.0
25%,149.75,23729260.0
50%,299.5,34049470.0
75%,449.25,70805320.0
max,599.0,690460000.0


In [14]:
sub['revenue'] = np.where(sub['revenue'] <=0, y_mean, sub['revenue'])
sub.to_csv('../sub/sub_{}.csv'.format(time.strftime('%Y%m%d')), index=False, header=False)