In [1]:
import ast
import gc
import time
from datetime import datetime
from itertools import product

import lightgbm as lgb
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from tqdm import tqdm
# Make float32 the default floating-point dtype for newly created tensors.
torch.set_default_dtype(torch.float32)
import warnings
warnings.filterwarnings('ignore')
# Use fully qualified option names: the bare 'max_columns' / 'max_rows' patterns
# can match more than one registered option and are then rejected by pandas.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

print(torch.__version__)

1.5.1


In [2]:
class MeanEncoder:
    """Out-of-fold mean (target) encoder for categorical columns.

    For every categorical column (and, for classification, every target class)
    a new column is produced holding a blend of the global target mean (the
    prior) and the per-category target mean, weighted by ``prior_weight_func``
    of the category size.
    """

    def __init__(self, categorical_features, n_splits=10, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the name of the categorical columns to encode

        :param n_splits: the number of splits used in mean encoding

        :param target_type: str, 'regression' or 'classification'
            (any value other than 'classification' is treated as 'regression')

        :param prior_weight_func:
        a function that takes in the number of observations, and outputs prior weight
        when a dict is passed, the exponential decay function is built from its
        'k' and 'f' entries:
        k: the number of observations needed for the posterior to be weighted equally as the prior
        f: larger f --> smaller slope
        when None (or anything non-callable), k=2 and f=1 are used

        :raises KeyError: if a dict is passed without both 'k' and 'f'
        """
        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}

        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None

        if isinstance(prior_weight_func, dict):
            # Plain closure over k and f; no need to eval a source string.
            k = prior_weight_func['k']
            f = prior_weight_func['f']
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - k) / f))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))

    @staticmethod
    def _feature_name(variable, target):
        """Name of the encoded column for ``variable`` (and ``target`` class, if any)."""
        if target is None:
            return '{}_pred'.format(variable)
        return '{}_pred_{}'.format(variable, target)

    def _targets(self):
        """Target classes to encode against; ``[None]`` stands for regression."""
        if self.target_type == 'classification':
            return self.target_values
        return [None]

    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        """Fit the encoding of one column on X_train/y_train and apply it to both frames.

        :param X_train: pandas DataFrame containing ``variable``
        :param y_train: target values, positionally aligned with the rows of X_train
        :param X_test: pandas DataFrame containing ``variable``
        :param variable: str, name of the categorical column
        :param target: class label to encode against, or None for regression
        :param prior_weight_func: maps category sizes to the weight given to the prior
        :return: (encoded train values, encoded test values, prior, lookup table);
            categories of X_test that are absent from X_train receive the prior
        """
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()

        # Work on raw values so the target is matched row-by-row, not by index label.
        y_values = np.asarray(y_train)
        if target is not None:
            X_train['pred_temp'] = (y_values == target).astype(int)  # classification
        else:
            X_train['pred_temp'] = y_values  # regression
        nf_name = MeanEncoder._feature_name(variable, target)
        prior = X_train['pred_temp'].mean()

        col_avg_y = X_train.groupby(by=variable)['pred_temp'].agg(['mean', 'size'])
        # 'size' is overwritten with the weight of the prior for that category.
        col_avg_y['size'] = prior_weight_func(col_avg_y['size'])
        col_avg_y[nf_name] = col_avg_y['size'] * prior + (1 - col_avg_y['size']) * col_avg_y['mean']
        col_avg_y.drop(['size', 'mean'], axis=1, inplace=True)

        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        # Only the encoded column is filled, so the key column is never touched.
        nf_test = X_test.join(col_avg_y, on=variable)[nf_name].fillna(prior).values

        return nf_train, nf_test, prior, col_avg_y

    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series or numpy array, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        y_series = y if isinstance(y, pd.Series) else pd.Series(np.asarray(y))

        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
            self.target_values = sorted(set(y_series))
        else:
            skf = KFold(self.n_splits)

        combos = list(product(self.categorical_features, self._targets()))
        self.learned_stats = {self._feature_name(variable, target): [] for variable, target in combos}
        for variable, target in combos:
            nf_name = self._feature_name(variable, target)
            X_new[nf_name] = np.nan
            # Locate the column by name rather than assuming it is the last one.
            col_pos = X_new.columns.get_loc(nf_name)
            for large_ind, small_ind in skf.split(X, y_series):
                _, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                    X_new.iloc[large_ind], y_series.iloc[large_ind], X_new.iloc[small_ind],
                    variable, target, self.prior_weight_func)
                # Each row is encoded by the fold model that did not see it.
                X_new.iloc[small_ind, col_pos] = nf_small
                self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new

    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()

        for variable, target in product(self.categorical_features, self._targets()):
            nf_name = self._feature_name(variable, target)
            X_new[nf_name] = 0.0
            # Sum the encodings of the per-fold lookup tables, then divide by n_splits.
            for prior, col_avg_y in self.learned_stats[nf_name]:
                X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable)[nf_name].fillna(prior)
            X_new[nf_name] /= self.n_splits

        return X_new

In [3]:
# Read the train and test splits (both CSVs are decoded as latin-1).
train, test = (
    pd.read_csv(csv_path, encoding='latin-1')
    for csv_path in ('../input/train.csv', '../input/test.csv')
)
# Integer mean of the raw training revenue.
y_mean = int(train['revenue'].mean())
print('y_mean: ', y_mean)

# Revenue is left on its raw scale; the log1p transform below is disabled.
# train['revenue'] = np.log1p(train['revenue'])

# Stack both splits so the feature engineering below runs on them together.
data = pd.concat([train, test], axis=0, ignore_index=True)

y_mean:  65868516


In [4]:
# Show the first two rows of the combined train+test frame.
data.head(2)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,0,"[{'id': 86780, 'name': 'Clash of the Titans Co...",150000000,"[{'id': 12, 'name': 'Adventure'}]",http://www.wrathofthetitansmovie.org,tt1646987,en,Wrath of the Titans,A decade after his heroic defeat of the monstr...,7.739904,/Albfq3ziSCQVyh5PzMSsFmmgHmy.jpg,"[{'name': 'Legendary Pictures', 'id': 923}, {'...","[{'iso_3166_1': 'ES', 'name': 'Spain'}, {'iso_...",3/27/12,99.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Feel the Wrath,Wrath of the Titans,"[{'id': 1449, 'name': 'underworld'}, {'id': 20...","[{'cast_id': 4, 'character': 'Perseus', 'credi...","[{'credit_id': '52fe4926c3a36847f818b96d', 'de...",301000000.0
1,1,,35000000,"[{'id': 27, 'name': 'Horror'}, {'id': 9648, 'n...",,tt0120681,en,From Hell,Frederick Abberline is an opium-huffing inspec...,7.79014,/f3J77Cy3pRSeeN52Pk8oIvgi6IN.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'CZ', 'name': 'Czech Republic'...",10/19/01,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Only the legend will survive.,From Hell,"[{'id': 1465, 'name': 'loss of family'}, {'id'...","[{'cast_id': 19, 'character': 'Inspector Frede...","[{'credit_id': '52fe4273c3a36847f801fbfb', 'de...",74558115.0


## drop特征

In [5]:
# Discard the poster path and IMDb id columns.
drop_cols = ['poster_path', 'imdb_id']
data = data.drop(columns=drop_cols)

## 时间特征

In [6]:
# Split the 'M/D/YY' strings once instead of re-splitting them for every derived column.
date_parts = data['release_date'].str.split('/', expand=True)
release_month = date_parts[0].astype(int)
release_day = date_parts[1].astype(int)
two_digit_year = date_parts[2].astype(int)

# Two-digit years above 20 are read as 19YY, everything else as 20YY.
# Doing this numerically also handles a year written without zero padding.
data['release_year'] = two_digit_year + np.where(two_digit_year > 20, 1900, 2000)
data['release_month'] = release_month

release_date = pd.to_datetime(pd.DataFrame({
    'year': data['release_year'],
    'month': release_month,
    'day': release_day,
}))

data['release_date_weekday'] = release_date.dt.weekday
# NOTE(review): this feature depends on the day the notebook is run, so its raw
# values are not reproducible across runs.
data['release_date_TONOW'] = (datetime.now() - release_date).dt.days

# The raw date string is no longer needed once the features above exist.
data = data.drop(columns=['release_date'])

## 判断是否为空

In [7]:
# Replace each of these columns with a 0/1 "value is missing" indicator.
isnull_cols = ['homepage', 'tagline', 'belongs_to_collection', 'overview']
missing_flags = data[isnull_cols].isnull().astype(int).add_suffix('_isnull')
data = pd.concat([data.drop(columns=isnull_cols), missing_flags], axis=1)

## 数值特征

In [8]:
num_cols = ['runtime', 'popularity', 'budget']

# Apply log(1 + x) to every numeric column in one vectorised call.
data[num_cols] = np.log1p(data[num_cols])

## 类别特征

In [9]:
# Preview the first five titles.
data['title'].head(5)

0                            Wrath of the Titans
1                                      From Hell
2                   Guess Who's Coming to Dinner
3    Talladega Nights: The Ballad of Ricky Bobby
4                                         Xanadu
Name: title, dtype: object

In [10]:
# Missing-value counts for the two title columns, as a (title, original_title) pair.
tuple(data[title_col].isnull().sum() for title_col in ('title', 'original_title'))

(0, 0)

In [11]:
cat_cols = ['original_language', 'status', 'title', 'original_title']

# 1 when the title is identical to the original title, else 0.
data['title=original_title'] = data['title'].eq(data['original_title']).astype(int)

# Integer-code the language and status columns.
for col in tqdm(['original_language', 'status']):
    data[col] = LabelEncoder().fit_transform(data[col])

# Frequency encoding: how often each value occurs across the combined frame.
for col in ['original_language', 'release_year', 'release_month', 'release_date_weekday', 'title']:
    data[col + '_count'] = data[col].map(data[col].value_counts())

# The raw title strings are dropped once their derived features exist.
data = data.drop(columns=['title', 'original_title'])

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 668.84it/s]


## 嵌套特征

In [12]:
nested_cols = ['genres', 'production_companies', 'production_countries',
               'Keywords', 'spoken_languages', 'cast', 'crew']
# The cells hold Python-literal text read from the CSV. ast.literal_eval parses
# literals only, so file content can never run as code (unlike eval).
for i in nested_cols:
    data[i + '_length'] = data[i].apply(lambda x: 0 if pd.isnull(x) else len(ast.literal_eval(x)))

# data['genres_0'] = data['genres'].apply(lambda x: np.nan if pd.isnull(x) else eval(x)[0]['name'])

# le = LabelEncoder()
# data['genres_0'] = le.fit_transform(data['genres_0'].astype(str))
# data['genres_0_count'] = data['genres_0'].map(data['genres_0'].value_counts())

# data.drop(nested_cols, axis=1, inplace=True)

In [13]:
def get_name(x):
    """Extract the 'name' field of every record in a stringified list of dicts.

    :param x: a missing value, or a string holding a Python literal list of dicts
    :return: list with one entry per record (NaN for a record lacking 'name');
        an empty list when x is missing or no record has a 'name' key
    :raises ValueError: if x is not a plain Python literal
    """
    if pd.isnull(x):
        return []
    # literal_eval only accepts literals, so CSV content is never executed.
    records = ast.literal_eval(x)
    if not any('name' in record for record in records):
        return []
    return [record.get('name', np.nan) for record in records]


# Add a '<column>_name' list-of-names column for every nested column.
for col in nested_cols:
    print(col)
    data[col + '_name'] = data[col].apply(get_name)

genres
production_companies
production_countries
Keywords
spoken_languages
cast
crew


In [14]:
def _svd_text_emb(df_, cat_col, vectorizer, tag, emb_size, seed):
    """Vectorise one list-valued column and reduce it with TruncatedSVD.

    Only the requested column is read; the input frame is neither copied nor
    modified. The result keeps the index of df_ so it lines up row-for-row
    when concatenated back onto it.
    """
    # Each cell is a list of tokens; join them into one space-separated document.
    texts = df_[cat_col].fillna('-1').apply(lambda x: ' '.join(x))
    sparse_vec = vectorizer.fit_transform(texts)
    svd_enc = TruncatedSVD(n_components=emb_size, n_iter=20, random_state=seed)
    svd_vec = svd_enc.fit_transform(sparse_vec)
    columns = ['{}_{}_{}'.format(cat_col, tag, i) for i in range(emb_size)]
    return pd.DataFrame(svd_vec, index=df_.index, columns=columns)


def tfidf_emb(df_, cat_col, emb_size=10, seed=1024):
    """TF-IDF + SVD embedding of a list-valued column.

    :param df_: pandas DataFrame containing cat_col
    :param cat_col: str, column whose cells are lists of strings
    :param emb_size: number of SVD components (output columns)
    :param seed: random_state passed to TruncatedSVD
    :return: DataFrame with columns '<cat_col>_tfidf_<i>', indexed like df_
    """
    print('Start tfidf ...')
    return _svd_text_emb(df_, cat_col, TfidfVectorizer(), 'tfidf', emb_size, seed)


def count2vec_emb(df_, cat_col, emb_size=10, seed=1024):
    """Token-count + SVD embedding of a list-valued column.

    :param df_: pandas DataFrame containing cat_col
    :param cat_col: str, column whose cells are lists of strings
    :param emb_size: number of SVD components (output columns)
    :param seed: random_state passed to TruncatedSVD
    :return: DataFrame with columns '<cat_col>_count2vec_<i>', indexed like df_
    """
    print('Start count2vec ...')
    return _svd_text_emb(df_, cat_col, CountVectorizer(), 'count2vec', emb_size, seed)


name_cols = [col + '_name' for col in nested_cols]

# First pass appends the TF-IDF embeddings, second pass the count-based ones.
for embed in (tfidf_emb, count2vec_emb):
    for i in name_cols:
        print(i)
        data = pd.concat([data, embed(data, i, emb_size=10, seed=1024)], axis=1)

# The raw nested columns and the intermediate name lists are no longer needed.
data = data.drop(columns=nested_cols + name_cols)

genres_name
Start tfidf ...
production_companies_name
Start tfidf ...
production_countries_name
Start tfidf ...
Keywords_name
Start tfidf ...
spoken_languages_name
Start tfidf ...
cast_name
Start tfidf ...
crew_name
Start tfidf ...
genres_name
Start count2vec ...
production_companies_name
Start count2vec ...
production_countries_name
Start count2vec ...
Keywords_name
Start count2vec ...
spoken_languages_name
Start count2vec ...
cast_name
Start count2vec ...
crew_name
Start count2vec ...


In [15]:
# Rows with a known revenue form the training set; the rest are to be predicted.
has_revenue = data['revenue'].notnull()
# fillna returns new frames here, so no filtered slice of `data` is mutated in place.
# The train index is reset so later positional fold indices agree with its labels.
train = data[has_revenue].reset_index(drop=True).fillna(-999)
test = data[~has_revenue].fillna(-999)

used_cols = [i for i in train.columns if i not in ['id', 'release_date', 'revenue']]
y = train['revenue']
train = train[used_cols]
test = test[used_cols]

In [16]:
class_list = ['release_year', 'release_month', 'release_date_weekday', 'original_language']

# 5-fold regression mean encoding with the encoder's default prior weighting;
# the encoder is fitted on train and then applied to test.
ME = MeanEncoder(class_list, n_splits=5, target_type='regression')
train, test = ME.fit_transform(train, y), ME.transform(test)

In [17]:
# Re-attach the target so per-category revenue statistics can be grouped from train.
train['revenue'] = y

In [18]:
# For now only these three statistics (max, min, mean) are used for encoding.
enc_cols = []
revenue = train['revenue']
# Whole-train fallback values for categories unseen in a fold.
stats_default_dict = {
    'max': revenue.max(),
    'min': revenue.min(),
    'median': revenue.median(),
    'mean': revenue.mean(),
    'sum': revenue.sum(),
    'std': revenue.std(),
    'skew': revenue.skew(),
    'kurt': revenue.kurt(),
    # Mean absolute deviation, computed by hand because Series.mad() no longer exists.
    'mad': (revenue - revenue.mean()).abs().mean()
}
enc_stats = ['max', 'min', 'mean']
skf = KFold(n_splits=5, shuffle=True, random_state=42)
for f in tqdm(['release_year', 'release_month', 'release_date_weekday', 'original_language']):
    # Output column name -> aggregation applied to revenue.
    enc_dict = {'{}_target_{}'.format(f, stat): stat for stat in enc_stats}
    for col in enc_dict:
        # Float initialisation: the columns receive float statistics below.
        train[col] = 0.0
        test[col] = 0.0
        enc_cols.append(col)
    for i, (trn_idx, val_idx) in enumerate(skf.split(train, y)):
        trn_x = train.iloc[trn_idx]
        # Named aggregation replaces the dict-renaming form that pandas rejects.
        enc_df = trn_x.groupby(f)['revenue'].agg(**enc_dict).reset_index()
        val_x = train.iloc[val_idx][[f]].merge(enc_df, on=f, how='left')
        test_x = test[[f]].merge(enc_df, on=f, how='left')
        for col, stat in enc_dict.items():
            default = stats_default_dict[stat]
            # Fold indices are positional, so write with iloc rather than loc.
            train.iloc[val_idx, train.columns.get_loc(col)] = val_x[col].fillna(default).values
            # Test rows get the average of the per-fold encodings.
            test[col] += test_x[col].fillna(default).values / skf.n_splits

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 10.51it/s]


In [19]:
n_train = train.shape[0]

# The target must not be part of the feature matrix.
train = train.drop(columns='revenue')

# Fit the scaler on train and test stacked together, then scale that same matrix.
min_max_scaler = MinMaxScaler()
stacked_values = pd.concat([train, test]).values
all_data = min_max_scaler.fit_transform(stacked_values)

In [20]:
# Fix random_state so a randomized SVD solver, if selected, gives the same
# components on every run.
pca = PCA(n_components=100, random_state=1024)
all_features = pca.fit_transform(all_data)

In [21]:
# The first n_train rows are training features, the remainder test features.
train_features, test_features = (
    torch.tensor(part, dtype=torch.float)
    for part in (all_features[:n_train], all_features[n_train:])
)
# Labels as a column vector to match the network's (n, 1) output.
train_labels = torch.tensor(y, dtype=torch.float).unsqueeze(1)

loss = nn.MSELoss()

def get_net(feature_num):
    """Build a single linear layer mapping feature_num inputs to one output.

    Weight and bias are both drawn from a normal distribution with mean 0 and
    standard deviation 0.01.
    """
    linear = nn.Linear(feature_num, 1)
    nn.init.normal_(linear.weight, mean=0, std=0.01)
    nn.init.normal_(linear.bias, mean=0, std=0.01)
    return linear


def log_rmse(net, features, labels):
    """Root of the module-level `loss` between log(pred + 1) and log(label + 1).

    Runs without gradient tracking and returns a Python float.
    """
    with torch.no_grad():
        # Predictions below 1 are raised to 1 so the logarithm stays numerically stable.
        floored_preds = torch.clamp(net(features), min=1.0)
        log_loss = loss(torch.log(floored_preds + 1), torch.log(labels + 1))
        return torch.sqrt(log_loss).item()


def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Train net with Adam on shuffled mini-batches, minimising the module-level `loss`.

    After every epoch the log-RMSE on the full training set is recorded, and
    on the test set too when test_labels is not None.

    :return: (train_ls, test_ls), one entry per epoch; test_ls stays empty
        when test_labels is None
    """
    batches = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(train_features, train_labels),
        batch_size, shuffle=True)
    # Adam is the optimisation algorithm used here.
    optimizer = torch.optim.Adam(params=net.parameters(), lr=learning_rate, weight_decay=weight_decay)
    net = net.float()
    track_test = test_labels is not None
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for batch_x, batch_y in batches:
            batch_loss = loss(net(batch_x.float()), batch_y.float())
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
        train_ls.append(log_rmse(net, train_features, train_labels))
        if track_test:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls


def get_k_fold_data(k, i, X, y):
    """Return the training and validation data for the i-th of k contiguous folds.

    Folds are consecutive row ranges of size ``X.shape[0] // k``; the last fold
    also takes the remaining ``X.shape[0] % k`` rows so that no sample is dropped.

    :param k: number of folds, must be greater than 1
    :param i: index of the validation fold, 0 <= i < k
    :param X: 2-D feature tensor, one row per sample
    :param y: label tensor with the same first dimension as X
    :return: (X_train, y_train, X_valid, y_valid)
    :raises ValueError: if i is not a valid fold index
    """
    assert k > 1
    if not 0 <= i < k:
        raise ValueError('fold index i must satisfy 0 <= i < k, got i=%r, k=%r' % (i, k))
    n_samples = X.shape[0]
    fold_size = n_samples // k
    X_train_parts, y_train_parts = [], []
    for j in range(k):
        start = j * fold_size
        # The final fold runs to the end so the remainder rows are kept.
        stop = n_samples if j == k - 1 else start + fold_size
        X_part, y_part = X[start:stop, :], y[start:stop]
        if j == i:
            X_valid, y_valid = X_part, y_part
        else:
            X_train_parts.append(X_part)
            y_train_parts.append(y_part)
    X_train = torch.cat(X_train_parts, dim=0)
    y_train = torch.cat(y_train_parts, dim=0)
    return X_train, y_train, X_valid, y_valid


def k_fold(k, X_train, y_train, num_epochs,
           learning_rate, weight_decay, batch_size):
    """Run k-fold cross-validation with a freshly initialised network per fold.

    Prints the last-epoch train/validation log-RMSE of every fold.

    :return: (mean last-epoch train log-RMSE, mean last-epoch validation log-RMSE)
    """
    last_train, last_valid = [], []
    for fold in range(k):
        fold_data = get_k_fold_data(k, fold, X_train, y_train)
        net = get_net(X_train.shape[1])
        train_ls, valid_ls = train(net, *fold_data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        last_train.append(train_ls[-1])
        last_valid.append(valid_ls[-1])
        print('fold %d, train rmse %f, valid rmse %f' % (fold, train_ls[-1], valid_ls[-1]))
    return sum(last_train) / k, sum(last_valid) / k


# Hyper-parameters shared by cross-validation and the final fit.
k = 5
num_epochs = 100
lr = 5
weight_decay = 0
batch_size = 64

train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
print('%d-fold validation: avg train rmse %f, avg valid rmse %f' % (k, train_l, valid_l))

def train_and_pred(train_features, test_features, train_labels,
                   num_epochs, lr, weight_decay, batch_size):
    """Train on the full training set, predict the test set and write a submission CSV.

    Non-positive predictions are replaced by the module-level `y_mean`, and
    revenues are cast to int. The file is written to
    '../sub/sub_<YYYYMMDD>.csv' without index or header.
    """
    net = get_net(train_features.shape[1])
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    print('train rmse %f' % train_ls[-1])
    preds = net(test_features).detach().numpy().ravel()
    # One ID per prediction instead of a hard-coded row count.
    sub = pd.DataFrame({'ID': np.arange(len(preds))})
    sub['revenue'] = pd.Series(preds)
    sub['revenue'] = sub['revenue'].apply(lambda x: y_mean if x <= 0 else x)
    sub['revenue'] = sub['revenue'].astype(int)
    print(sub.describe())
    sub.to_csv('../sub/sub_{}.csv'.format(time.strftime('%Y%m%d')), index=False, header=False)


# Fit on the full training set with the hyper-parameters above and write the submission file.
train_and_pred(train_features, test_features, train_labels, num_epochs, lr, weight_decay, batch_size)

fold 0, train rmse 9.016912, valid rmse 9.098090
fold 1, train rmse 9.033643, valid rmse 9.083103
fold 2, train rmse 9.028946, valid rmse 9.146161
fold 3, train rmse 9.041874, valid rmse 9.004050
fold 4, train rmse 9.088479, valid rmse 8.974241
5-fold validation: avg train rmse 9.041971, avg valid rmse 9.061129
train rmse 8.878341
               ID       revenue
count  600.000000  6.000000e+02
mean   299.500000  1.429278e+07
std    173.349358  2.714754e+07
min      0.000000  1.970000e+02
25%    149.750000  1.569000e+04
50%    299.500000  2.995750e+04
75%    449.250000  6.686350e+04
max    599.000000  6.586852e+07


In [22]:
# train.drop('revenue', axis=1, inplace=True)
# X_train, X_valid, y_train, y_valid = train_test_split(train, y, random_state=2020)

# dtrain = lgb.Dataset(X_train, y_train)
# dvalid = lgb.Dataset(X_valid, y_valid, reference=dtrain)

# params = {
#     'boosting_type': 'gbdt',
#     'objective': 'regression',
#     'metric': 'None',
#     'learning_rate': 0.05,
#     'seed': 2020
# }

# def rmsle(y_hat, data):
#     y_true = data.get_label()
#     y_hat = np.where(y_hat < 0, 1, y_hat)
#     y_true = np.where(y_true < 0, 1, y_true)
#     res = -np.sqrt(mean_squared_log_error(y_true, y_hat))
#     return 'rmsle', res, True

# model = lgb.train(
#     params,
#     dtrain,
#     valid_sets=[dtrain, dvalid],
#     num_boost_round=1000000,
#     early_stopping_rounds=100,
#     verbose_eval=50,
#     feval=rmsle
# )

# pred = model.predict(test)

# sub = pd.DataFrame()
# sub['ID'] = np.arange(0, 600)
# sub['revenue'] = pred
# sub['revenue'] = np.expm1(sub['revenue'])