In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

trade = pd.read_csv('trade_train.csv', index_col=0)
stock = pd.read_csv('stocks.csv', index_col=0)
answer = pd.read_csv('answer_sheet.csv')

In [23]:
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [141]:
seq2seq_df = pd.read_csv('seq2seq_df.csv')

# make dataset

In [49]:
def makeDataset(before_df, use_cut=False, cut_quantile=0.99, rolling_range=[2,3], diff_range=[1,2]):
    df = before_df.copy()
    df = df.sort_values(by=['기준년월','종목번호','그룹번호']).reset_index(drop=True)
    
    # cut
    if use_cut:
        cut1 = df['매수고객수'].quantile(cutQuantile)
        cut2 = df['매도고객수'].quantile(cutQuantile)

        df.loc[df['매수고객수'] > cut1] = cut1
        df.loc[df['매도고객수'] > cut2] = cut2

    # rolling
    rolling_cols = ['매도고객수', '매수고객수']
    for rolling_col in rolling_cols:
        for i in rolling_range:
            df[str(rolling_col)+'rolling_mean'+str(i)] = \
                df.groupby(['그룹번호','종목번호'])[rolling_col].transform(lambda x : x.rolling(i).mean())
        for i in rolling_range:
            df[str(rolling_col)+'rolling_std'+str(i)] = \
                df.groupby(['그룹번호','종목번호'])[rolling_col].transform(lambda x : x.rolling(i).std())
        for i in rolling_range:
            df[str(rolling_col)+'rolling_max'+str(i)] = \
                df.groupby(['그룹번호','종목번호'])[rolling_col].transform(lambda x : x.rolling(i).max())
        for i in rolling_range:
            df[str(rolling_col)+'rolling_min'+str(i)] = \
                df.groupby(['그룹번호','종목번호'])[rolling_col].transform(lambda x : x.rolling(i).min())

    # diff
    diff_cols = ['매도고객수', '매수고객수']
    for diff_col in diff_cols:
        for i in rolling_range:
            df[str(diff_col)+'diff1'] = df.groupby(['그룹번호','종목번호'])[diff_col].diff(i)
            df[str(diff_col)+'diff2'] = df.groupby(['그룹번호','종목번호'])[diff_col].diff(i)

    df = df.dropna(axis=0).reset_index(drop=True)

    # target
    df['target'] = df.groupby(['종목번호','그룹번호'])['매수고객수'].shift(-1)

    df_cols = df.columns
    for cols in df_cols:
        if df[cols].dtypes == np.float64:
            df[cols] = round(df[cols], 2)
    
    print(f'cut use : {use_cut}, quantile : {cut_quantile}, rolling range: {rolling_range}, diff range : {diff_range}')         
    return df

# CV

In [84]:
def makeCV(df, train_ = [201910, 201911, 201912, 202001, 202002, 202003, 202004], val_ = 202005, test_ = 202006, use_catboost=False):
    
    y_train = df.loc[df['기준년월'].isin(train_), 'target'].reset_index(drop=True)
    y_val = df.loc[df['기준년월']==val_, 'target'].reset_index(drop=True)
    
    X_train = df.loc[df['기준년월'].isin(train_)].reset_index(drop=True)
    X_val = df.loc[df['기준년월']==val_].reset_index(drop=True)
    X_test = df.loc[df['기준년월']==test_].reset_index(drop=True)
    
    X_train = X_train.iloc[:, :-2]
    X_val = X_val.iloc[:, :-2]
    X_test = X_test.iloc[:, :-2]
    
    X_train['year'] = X_train['기준년월'].map(str).map(lambda x : x[:4])
    X_val['year'] = X_val['기준년월'].map(str).map(lambda x : x[:4])
    X_test['year'] = X_test['기준년월'].map(str).map(lambda x : x[:4])
    X_train['year'] = X_train['year'].map({'2019' : 0, '2020' : 1})
    X_val['year'] = X_val['year'].map({'2019' : 0, '2020' : 1})
    X_test['year'] = X_test['year'].map({'2019' : 0, '2020' : 1})

    weight_idx = []
    for i in train_:
        idx = X_train.loc[X_train['기준년월']==i].index[0]
        weight_idx.append(idx)
    print(f'weight_idx : {weight_idx}')
        
    X_train.drop('기준년월', axis=1 , inplace=True)
    X_val.drop('기준년월', axis=1 , inplace=True)
    X_test.drop('기준년월', axis=1 ,inplace=True)
    
    if use_catboost:
        X_train_cat, y_train_cat = X_train.copy(), y_train.copy()
        X_val_cat, y_val_cat = X_val.copy(), y_val.copy()
        X_test_cat = X_test.copy()
        return X_train_cat, y_train_cat, X_val_cat, y_val_cat, X_test_cat
    
    return X_train, y_train, X_val, y_val, X_test

# Encoding
- 종목번호, 그룹번호
- 나중에 cat2vec 하면 여기에 이용해보기

In [71]:
def encoding(X_train, X_val, X_test, category_cols = ['종목번호', '시장구분', '표준산업구분코드_대분류','그룹번호']):
    idx = X_train.loc[X_train['year']==1, category_cols].shape[0]
    import category_encoders as ce
    encoder = ce.TargetEncoder()
    encoder.fit(X_train.loc[X_train['year']==1, category_cols], y_train[-idx:])

    X_train[category_cols] = encoder.transform(X_train[category_cols])
    X_val[category_cols] = encoder.transform(X_val[category_cols])
    X_test[category_cols] = encoder.transform(X_test[category_cols])
    
    return X_train, X_val, X_test

# submission

In [52]:
def make_sub(X_test_cat, prediction):
    answer = pd.read_csv('answer_sheet.csv')
    result_cols = ['종목번호','그룹번호']
    sub = X_test_cat[result_cols]
    sub['pred'] = prediction
    sub = sub.sort_values(by=['그룹번호','pred'], ascending=[True, False])
    group_num = sub['그룹번호'].unique()
    for num in group_num:
        val = sub.loc[sub['그룹번호']==num][:3]['종목번호'].sort_values().values
        answer.loc[answer['그룹명']==num,'종목번호1':] = val
    return answer

# catboost

In [116]:
def modelCatboost(X_train_cat, y_train_cat ,X_val_cat, y_val_cat, X_test_cat, category_cols, weight=None, params=None, selected_feature=None):
    
    if selected_feature:
        X_train_cat, X_val_cat, X_test_cat = \
        X_train_cat[selected_feature], X_val_cat[selected_feature], X_test_cat[selected_feature]

    sample_weight = np.zeros(X_train.shape[0])    
    for idx in weight:    
        sample_weight[idx:] = weight[idx]
    
    if not params:
        params = {
            'iterations': 5000,
            'learning_rate': 0.05,
            'random_seed': 42,
            'use_best_model': True,
            'task_type' : 'GPU',
            'early_stopping_rounds' : 500,
            'eval_metric' : 'RMSE'
        }

    train_pool = Pool(X_train_cat, 
                      y_train_cat, 
                      cat_features=category_cols, 
                      weight=sample_weight)
    validate_pool = Pool(X_val_cat, 
                         y_val_cat, 
                         cat_features=category_cols)

    model = CatBoostRegressor(**params)
    model.fit(train_pool, eval_set=validate_pool, verbose=100)
    
    pred_train = model.predict(X_train_cat)
    pred_val = model.predict(X_val_cat)
    pred_test = model.predict(X_test_cat)
    
    # model score
    print('catboost best score')
    print(model.best_score_)
    
    # model feature importance
    print('catboost feature importance')
    feature_importances = model.get_feature_importance(train_pool)
    feature_names = X_train.columns
    for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
        print('{}: {}'.format(name, score))
        
    return model, pred_train, pred_val, pred_test

# lgb

In [107]:
def modelLightgbm(X_train, y_train ,X_val, y_val, X_test, category_cols, weight=None, params=None, selected_feature=None):
    
    if selected_feature:
        X_train, X_val, X_test = \
        X_train[selected_feature], X_val[selected_feature], X_test[selected_feature]
        
    sample_weight = np.zeros(X_train.shape[0])    
    for idx in weight:    
        sample_weight[idx:] = weight[idx]
    
    if not params:
        params = {'objective': 'regression',
                     'metric': 'rmse',
                     'boosting_type': 'gbdt',
                     'learning_rate': 0.005,
                     'seed': 42,
                     'num_iterations' : 5000,
                     'early_stopping_rounds' : 1000
                    }

    trn_data = lgb.Dataset(X_train,
                           label=y_train,
                           categorical_feature=category_cols, 
                           weight=sample_weight)
    val_data = lgb.Dataset(X_val,
                           label=y_val,
                           categorical_feature=category_cols)

    model = lgb.train(params,
                    trn_data,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=100)
    
    pred_train = model.predict(X_train)
    pred_val = model.predict(X_val)
    pred_test = model.predict(X_test)
    
    # model feature importance
    print('lightgbm feature importance')
    feature_importances = model.feature_importance()
    feature_names = model.feature_name()
    for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
        print('{}: {}'.format(name, score))
        
    return model, pred_train, pred_val, pred_test

# Ridge, Lasso

In [19]:
def linear(X_train, y_train, X_val, y_val, X_test, use_ridge=False, use_lasso=False, max_iter=1000):
    
    if use_ridge:
        ridge = Ridge(max_iter=max_iter)
        ridge.fit(X_train, y_train)
        
        pred_train = ridge.predict(X_train)
        pred_val = ridge.predict(X_val)
        pred_test = ridge.predict(X_test)
        
        print(f'ridge train rmse : {np.sqrt(mean_squared_error(y_train, pred_train))}')
        print(f'ridge validation rmse : {np.sqrt(mean_squared_error(y_val, pred_val))}')
        
        return model, pred_train, pred_val, pred_test

    if use_lasso:
        lasso = Lasso(max_iter=max_iter)
        lasso.fit(X_train, y_train)
        
        pred_train = lasso.predict(X_train)
        pred_val = lasso.predict(X_val)
        pred_test = lasso.predict(X_test)
        
        print(f'lasso train rmse : {np.sqrt(mean_squared_error(y_train, pred_train))}')
        print(f'lasso validation rmse : {np.sqrt(mean_squared_error(y_val, pred_val))}')
        
        return model, pred_train, pred_val, pred_test

# Rf

In [102]:
rf = RandomForestRegressor(n_estimators=2000, n_jobs=-1, random_state=42)
rf.fit(X_train, y_train)
rf_val_pred = rf.predict(X_val)

In [103]:
print(f'random forest validation rmse : {np.sqrt(mean_squared_error(y_val, rf_val_pred))}')

random forest validation rmse : 7.195825827483215


# Ensemble

In [109]:
cat_pred = model.predict(X_test_cat)
lgb_pred = clf.predict(X_test)
ridge_pred = ridge.predict(X_test)
lasso_pred = lasso.predict(X_test)
rf_pred = rf.predict(X_test)

In [113]:
import warnings
warnings.filterwarnings(action='ignore')

ave_cols = ['종목번호', '그룹번호']
ave_df = X_test_cat[ave_cols]
ave_df['cat'] = cat_pred
ave_df['lgb'] = lgb_pred
ave_df['ridge'] = ridge_pred
ave_df['lasso'] = lasso_pred
ave_df['rf'] = rf_pred

# cat
- smaller

In [58]:
df = makeDataset(seq2seq_df, use_cut=False, cut_quantile=0.99, rolling_range=[2,3], diff_range=[1])

In [147]:
X_train_cat, y_train_cat, X_val_cat, y_val_cat, X_test_cat = \
    makeCV(df, train_ = [201910, 201911, 201912, 202001, 202002, 202003, 202004], val_ = 202005, test_ = 202006, use_catboost=True)

weight_idx : [0, 6192, 12384, 18720, 25104, 31536, 37968]


In [150]:
category_cols = ['종목번호', '시장구분', '표준산업구분코드_대분류','그룹번호']
weight = {0:1, 6144:1, 12336:1, 18528:2, 24864:2, 31248:3, 37680:3}
selected_feature = ['종목번호',  '시장구분', '표준산업구분코드_대분류', '그룹번호', '그룹내고객수',
       '매수고객수', '매도고객수', '매도고객수rolling_mean2', '매도고객수rolling_mean3',
       '매도고객수rolling_std2', '매도고객수rolling_std3', 
       '매수고객수rolling_mean2', '매수고객수rolling_mean3', '매수고객수rolling_std2',
       '매수고객수rolling_std3',  '매도고객수diff1', '매수고객수diff1']
model, pred_train, pred_val, pred_test=\
    modelCatboost(X_train_cat, y_train_cat, X_val_cat, y_val_cat, X_test_cat , category_cols, weight=weight, params=None, selected_feature=None)

0:	learn: 13.0370648	test: 20.4117509	best: 20.4117509 (0)	total: 34.1ms	remaining: 2m 50s
100:	learn: 9.7544461	test: 15.7890480	best: 15.7890480 (100)	total: 2.2s	remaining: 1m 46s
200:	learn: 8.3769739	test: 14.1968691	best: 14.1941071 (199)	total: 4.39s	remaining: 1m 44s
300:	learn: 7.3204293	test: 12.3297154	best: 12.3267530 (290)	total: 6.9s	remaining: 1m 47s
400:	learn: 6.5200231	test: 11.3890727	best: 11.3890727 (400)	total: 9.44s	remaining: 1m 48s
500:	learn: 6.1165350	test: 10.8075134	best: 10.8016425 (496)	total: 11.8s	remaining: 1m 46s
600:	learn: 5.6761064	test: 9.9206066	best: 9.9205899 (598)	total: 14.4s	remaining: 1m 45s
700:	learn: 5.3059512	test: 9.3528984	best: 9.3467027 (698)	total: 17s	remaining: 1m 44s
800:	learn: 5.2144634	test: 9.2900421	best: 9.2763927 (784)	total: 19.4s	remaining: 1m 41s
900:	learn: 5.0907373	test: 9.1639777	best: 9.1623334 (897)	total: 22.1s	remaining: 1m 40s
1000:	learn: 4.9793504	test: 8.9985207	best: 8.9848443 (983)	total: 24.5s	remaining:

In [151]:
cat_answer = make_sub(X_test_cat, pred_test)
cat_answer.to_csv('0913_cat_5_Fs_rolling23_diff1_lowerWeight.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub['pred'] = prediction


In [142]:
df = makeDataset(seq2seq_df, use_cut=False, cut_quantile=0.99, rolling_range=[2], diff_range=[1])

cut use : False, quantile : 0.99, rolling range: [2], diff range : [1]


In [143]:
X_train, y_train, X_val, y_val, X_test = \
    makeCV(df, train_ = [201910,201911, 201912, 202001, 202002, 202003, 202004], val_ = 202005, test_ = 202006, use_catboost=False)

weight_idx : [0, 6192, 12384, 18720, 25104, 31536, 37968]


In [144]:
X_train, X_val, X_test=\
    encoding(X_train, X_val, X_test, category_cols = ['종목번호', '시장구분', '표준산업구분코드_대분류','그룹번호'])

In [145]:
category_cols = ['종목번호', '시장구분', '표준산업구분코드_대분류','그룹번호']
weight = {0:1, 6144:1, 12336:1, 18528:2, 24864:2, 31248:6, 37680:6}
model, pred_train, pred_val, pred_test=\
    modelLightgbm(X_train, y_train, X_val, y_val, X_test , category_cols, weight=weight, params=None, selected_feature=None)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3493
[LightGBM] [Info] Number of data points in the train set: 44400, number of used features: 28
[LightGBM] [Info] Start training from score 2.137842
Training until validation scores don't improve for 1000 rounds
[100]	training's rmse: 9.54359	valid_1's rmse: 14.9162
[200]	training's rmse: 6.94481	valid_1's rmse: 11.1425
[300]	training's rmse: 5.54281	valid_1's rmse: 9.19433
[400]	training's rmse: 4.79602	valid_1's rmse: 8.34084
[500]	training's rmse: 4.35919	valid_1's rmse: 7.84182
[600]	training's rmse: 4.09998	valid_1's rmse: 7.59755
[700]	training's rmse: 3.9311	valid_1's rmse: 7.47428
[800]	training's rmse: 3.79707	valid_1's rmse: 7.42138
[900]	training's rmse: 3.70309	valid_1's rmse: 7.3844
[1000]	training's rmse: 3.62237	valid_1's rmse: 7.35422
[1100]	training's rmse: 3.54466	valid_1's rmse: 7.30107
[1200]	training's rmse: 3.4782

In [146]:
lgb_answer = make_sub(X_test_cat, pred_test)
lgb_answer.to_csv('0913_lgb_2_rolling2_diff1.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub['pred'] = prediction


In [152]:
model, pred_train, pred_val, pred_test = \
    linear(X_train, y_train, X_val, y_val, X_test, use_ridge=False, use_lasso=True, max_iter=1000)

lasso train rmse : 4.700378025828403
lasso validation rmse : 6.630539596472221


  model = cd_fast.enet_coordinate_descent(


In [153]:
lasso_answer = make_sub(X_test_cat, pred_test)
lasso_answer.to_csv('0913_lasso_2.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub['pred'] = prediction


# 기초 통계
- catboost랑 ensemble
- 6월달에 구매한 애들만 평균내서!

In [162]:
X_test_cat['pred'] = pred_test

In [163]:
X_test_cat.shape

(6432, 29)

In [174]:
X_test_cat['target'] = df.loc[df['기준년월']==202005, 'target'].reset_index(drop=True)

In [175]:
tmp = X_test_cat[['종목번호','그룹번호','pred','target']]
tmp.head()

Unnamed: 0,종목번호,그룹번호,pred,target
0,A000100,MAD01,-0.298653,0.0
1,A000100,MAD02,-0.42931,0.0
2,A000100,MAD03,3.558176,7.0
3,A000100,MAD04,26.316327,26.0
4,A000100,MAD05,19.814012,20.0


In [176]:
pred_test = tmp['pred'] + tmp['target']

In [177]:
lasso_answer = make_sub(X_test_cat, pred_test)
lasso_answer.to_csv('0913_statistic_lasso_ensemble.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub['pred'] = prediction
