In [1]:
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import lightgbm as lgb
import gc
from datetime import datetime
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import warnings
warnings.filterwarnings('ignore')
pd.set_option('max_columns', None)
pd.set_option('max_rows', None)

In [2]:
train = pd.read_csv('../input/train.csv', encoding='latin-1')
test = pd.read_csv('../input/test.csv', encoding='latin-1')

y_mean = train['revenue'].mean()
train['revenue'] = np.log1p(train['revenue'])

data = pd.concat([train, test], axis=0, ignore_index=True)

In [3]:
data.head()

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,0,"[{'id': 86780, 'name': 'Clash of the Titans Co...",150000000,"[{'id': 12, 'name': 'Adventure'}]",http://www.wrathofthetitansmovie.org,tt1646987,en,Wrath of the Titans,A decade after his heroic defeat of the monstr...,7.739904,/Albfq3ziSCQVyh5PzMSsFmmgHmy.jpg,"[{'name': 'Legendary Pictures', 'id': 923}, {'...","[{'iso_3166_1': 'ES', 'name': 'Spain'}, {'iso_...",3/27/12,99.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Feel the Wrath,Wrath of the Titans,"[{'id': 1449, 'name': 'underworld'}, {'id': 20...","[{'cast_id': 4, 'character': 'Perseus', 'credi...","[{'credit_id': '52fe4926c3a36847f818b96d', 'de...",19.522621
1,1,,35000000,"[{'id': 27, 'name': 'Horror'}, {'id': 9648, 'n...",,tt0120681,en,From Hell,Frederick Abberline is an opium-huffing inspec...,7.79014,/f3J77Cy3pRSeeN52Pk8oIvgi6IN.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'CZ', 'name': 'Czech Republic'...",10/19/01,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Only the legend will survive.,From Hell,"[{'id': 1465, 'name': 'loss of family'}, {'id'...","[{'cast_id': 19, 'character': 'Inspector Frede...","[{'credit_id': '52fe4273c3a36847f801fbfb', 'de...",18.127089
2,2,,4000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0061735,en,Guess Who's Coming to Dinner,Matt and Christina Drayton are a couple whose ...,5.032469,/bhdXWhembE6a6q11NqpUcEdtAbw.jpg,"[{'name': 'Stanley Kramer Productions', 'id': ...","[{'iso_3166_1': 'US', 'name': 'United States o...",12/11/67,108.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A love story of today,Guess Who's Coming to Dinner,"[{'id': 582, 'name': 'san francisco'}, {'id': ...","[{'cast_id': 1, 'character': 'Matt Drayton', '...","[{'credit_id': '52fe431dc3a36847f803b563', 'de...",17.852697
3,3,,72500000,"[{'id': 35, 'name': 'Comedy'}]",http://www.sonypictures.com/homevideo/talladeg...,tt0415306,en,Talladega Nights: The Ballad of Ricky Bobby,Lifelong friends and national idols Ricky Bobb...,6.936688,/hi8whfL7t6cL2LITLJjzJ7UWuZA.jpg,"[{'name': 'Columbia Pictures', 'id': 5}, {'nam...","[{'iso_3166_1': 'US', 'name': 'United States o...",8/4/06,116.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The story of a man who could only count to #1,Talladega Nights: The Ballad of Ricky Bobby,"[{'id': 5922, 'name': 'north carolina'}, {'id'...","[{'cast_id': 8, 'character': 'Ricky Bobby', 'c...","[{'credit_id': '52fe4521c3a36847f80be20f', 'de...",18.909053
4,4,,20000000,"[{'id': 14, 'name': 'Fantasy'}, {'id': 10402, ...",,tt0081777,en,Xanadu,A beautiful muse inspires an artist and his ol...,3.782547,/wc3dC8DrI6qgD5sQlIkbjU7wcaS.jpg,"[{'name': 'Universal Pictures', 'id': 33}]","[{'iso_3166_1': 'US', 'name': 'United States o...",8/8/80,96.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"A Fantasy, A Musical, A Place Where Dreams Com...",Xanadu,"[{'id': 4344, 'name': 'musical'}, {'id': 6234,...","[{'cast_id': 1, 'character': 'Kira', 'credit_i...","[{'credit_id': '53700504c3a368121800ea1b', 'de...",16.927763


## drop特征

In [4]:
drop_cols = ['poster_path', 'imdb_id']
data.drop(drop_cols, axis=1, inplace=True)

## 时间特征

In [5]:
data['release_year'] = data['release_date'].apply(lambda x: '19' + x.split('/')[2] if int(x.split('/')[2]) > 20 else '20' + x.split('/')[2]).astype(int)
data['release_month'] = data['release_date'].apply(lambda x: x.split('/')[0]).astype(int)
data['release_day'] = data['release_date'].apply(lambda x: x.split('/')[1]).astype(int)

data['release_date'] = pd.to_datetime(data['release_year'].astype(str) + '-' + data['release_month'].astype(str) + '-' + data['release_day'].astype(str))

data['release_date_weekday'] = data['release_date'].apply(lambda x: x.weekday())
data['release_date_TONOW'] = (datetime.now() - data['release_date']).dt.days

data.drop(['release_day', 'release_date'], axis=1, inplace=True)

## 判断是否为空

In [6]:
isnull_cols = ['homepage', 'tagline', 'belongs_to_collection', 'overview']
for i in isnull_cols:
    data[i + '_isnull'] = data[i].isnull().astype(int)
data.drop(isnull_cols, axis=1, inplace=True)

## 数值特征

In [7]:
num_cols = ['runtime', 'popularity', 'budget']

for i in ['budget']:
    data[i] = np.log1p(data[i])

## 类别特征

In [8]:
data['title'][:5]

0                            Wrath of the Titans
1                                      From Hell
2                   Guess Who's Coming to Dinner
3    Talladega Nights: The Ballad of Ricky Bobby
4                                         Xanadu
Name: title, dtype: object

In [9]:
data['title'].isnull().sum(), data['original_title'].isnull().sum()

(0, 0)

In [10]:
cat_cols = ['original_language', 'status', 'title', 'original_title']

data['title=original_title'] = (data['title'] == data['original_title']).astype(int)

for i in tqdm(['original_language', 'status']):
    le = LabelEncoder()
    data[i] = le.fit_transform(data[i])

data['original_language_count'] = data['original_language'].map(data['original_language'].value_counts())
data['title_count'] = data['title'].map(data['title'].value_counts())

data.drop(['title', 'original_title'], axis=1, inplace=True)

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 501.29it/s]


## 嵌套特征

In [11]:
nested_cols = ['genres', 'production_companies', 'production_countries',
               'Keywords', 'spoken_languages', 'cast', 'crew']
for i in nested_cols:
    data[i + '_length'] = data[i].apply(lambda x: 0 if pd.isnull(x) else len(eval(x)))

# data['genres_0'] = data['genres'].apply(lambda x: np.nan if pd.isnull(x) else eval(x)[0]['name'])

# le = LabelEncoder()
# data['genres_0'] = le.fit_transform(data['genres_0'].astype(str))
# data['genres_0_count'] = data['genres_0'].map(data['genres_0'].value_counts())

# data.drop(nested_cols, axis=1, inplace=True)

In [12]:
def get_name(x):
    if pd.isnull(x):
        return []
    else:
        df = pd.DataFrame(eval(x))
        if 'name' in df.columns:
#             df['name'] = df['name'].apply(lambda s: ''.join(s.split()))
            return df['name'].tolist()
        else:
            return []


for i in nested_cols:
    print(i)
    data[i + '_name'] = data[i].apply(lambda x: get_name(x))

genres
production_companies
production_countries
Keywords
spoken_languages
cast
crew


In [13]:
def tfidf_emb(df_, cat_col, emb_size=10, seed=1024):
    print('Start tfidf ...')
    df = df_.copy()
    df[cat_col] = df[cat_col].fillna('-1')
    df[cat_col] = df[cat_col].apply(lambda x: ' '.join(x))
    tfidf_enc = TfidfVectorizer()
    tfidf_vec = tfidf_enc.fit_transform(df[cat_col])
    svd_enc = TruncatedSVD(n_components=emb_size, n_iter=20, random_state=seed)
    svd_vec = svd_enc.fit_transform(tfidf_vec)
    tfidf_df = pd.DataFrame(svd_vec)
    tfidf_df.columns = ['{}_tfidf_{}'.format(cat_col, i) for i in range(emb_size)]
    res = tfidf_df
    return res


def count2vec_emb(df_, cat_col, emb_size=10, seed=1024):
    print('Start count2vec ...')
    df = df_.copy()
    df[cat_col] = df[cat_col].fillna('-1')
    df[cat_col] = df[cat_col].apply(lambda x: ' '.join(x))
    count_enc = CountVectorizer()
    count_vec = count_enc.fit_transform(df[cat_col])
    svd_enc = TruncatedSVD(n_components=emb_size, n_iter=20, random_state=seed)
    svd_vec = svd_enc.fit_transform(count_vec)
    c2v_df = pd.DataFrame(svd_vec)
    c2v_df.columns = ['{}_count2vec_{}'.format(cat_col, i) for i in range(emb_size)]
    res = c2v_df
    return res


for i in [i + '_name' for i in nested_cols]:
    print(i)
    tfidf_df = tfidf_emb(data, i, emb_size=10, seed=1024)
    data = pd.concat([data, tfidf_df], axis=1)

for i in [i + '_name' for i in nested_cols]:
    print(i)
    c2v_df = count2vec_emb(data, i, emb_size=10, seed=1024)
    data = pd.concat([data, c2v_df], axis=1)

data.drop(nested_cols, axis=1, inplace=True)
data.drop([i + '_name' for i in nested_cols], axis=1, inplace=True)

genres_name
Start tfidf ...
production_companies_name
Start tfidf ...
production_countries_name
Start tfidf ...
Keywords_name
Start tfidf ...
spoken_languages_name
Start tfidf ...
cast_name
Start tfidf ...
crew_name
Start tfidf ...
genres_name
Start count2vec ...
production_companies_name
Start count2vec ...
production_countries_name
Start count2vec ...
Keywords_name
Start count2vec ...
spoken_languages_name
Start count2vec ...
cast_name
Start count2vec ...
crew_name
Start count2vec ...


In [14]:
train = data[data['revenue'].notnull()]
test = data[data['revenue'].isnull()]

used_cols = [i for i in train.columns if i not in ['id', 'release_date', 'revenue']]
y = train['revenue']
train = train[used_cols]
test = test[used_cols]

X_train, X_valid, y_train, y_valid = train_test_split(train, y, random_state=2020)

dtrain = lgb.Dataset(X_train, y_train)
dvalid = lgb.Dataset(X_valid, y_valid, reference=dtrain)

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'None',
    'learning_rate': 0.05,
    'seed': 2020
}

def rmsle(y_hat, data):
    y_true = data.get_label()
    y_hat = np.where(y_hat < 0, 1, y_hat)
    y_true = np.where(y_true < 0, 1, y_true)
    res = -np.sqrt(mean_squared_log_error(y_true, y_hat))
    return 'rmsle', res, True

model = lgb.train(
    params,
    dtrain,
    valid_sets=[dtrain, dvalid],
    num_boost_round=1000000,
    early_stopping_rounds=100,
    verbose_eval=50,
    feval=rmsle
)

pred = model.predict(test)

sub = pd.DataFrame()
sub['ID'] = np.arange(0, 600)
sub['revenue'] = pred
sub['revenue'] = np.expm1(sub['revenue'])

Training until validation scores don't improve for 100 rounds
[50]	training's rmsle: 0.158781	valid_1's rmsle: 0.202718
[100]	training's rmsle: 0.116965	valid_1's rmsle: 0.203308
Early stopping, best iteration is:
[1]	training's rmsle: 0.261183	valid_1's rmsle: 0.241942


In [15]:
sub.describe()

Unnamed: 0,ID,revenue
count,600.0,600.0
mean,299.5,8243012.0
std,173.349358,945045.7
min,0.0,5570158.0
25%,149.75,7876238.0
50%,299.5,8329069.0
75%,449.25,8829137.0
max,599.0,9835144.0


In [16]:
sub['revenue'] = np.where(sub['revenue'] <= 0, y_mean, sub['revenue'])
sub.to_csv('../sub/sub_{}.csv'.format(time.strftime('%Y%m%d')), index=False, header=False)