In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, roc_auc_score

In [2]:
# Datasets shipped with the task. The trailing letter marks the task type
# ('r' = regression, 'c' = classification):
#   'check_1_r', 'check_2_r', 'check_3_r', 'check_4_c',
#   'check_5_c', 'check_6_c', 'check_7_c', 'check_8_c'
data_dir = './data'     # inputs are read from <data_dir>/<dataset>/
result_dir = './res'    # predictions are read from <result_dir>/<dataset>/
dataset = 'check_1_r'   # dataset used throughout this notebook

In [3]:
# Ground-truth targets (line_id, target) for the test part of the selected dataset.
target_csv_path = '{}/{}/test-target.csv'.format(data_dir, dataset)
df = pd.read_csv(target_csv_path)
df.head()

Unnamed: 0,line_id,target
0,0,38.682374
1,2,32.982233
2,6,43.440548
3,10,32.219988
4,13,0.0


In [4]:
# Predictions (line_id, prediction) stored for this dataset under result_dir.
pred_csv_path = '{}/{}/pred.csv'.format(result_dir, dataset)
df_pred = pd.read_csv(pred_csv_path)
df_pred.head()

Unnamed: 0,line_id,prediction
0,0,39.346196
1,2,24.313224
2,6,45.143301
3,10,38.668763
4,13,3.012409


In [5]:
# Attach each prediction to its ground-truth row by matching on line_id.
# `on` must not be combined with `left_index`/`right_index`: current pandas
# raises MergeError for that combination. A plain key join already yields the
# default 0..n-1 index shown in the output below.
df = pd.merge(df, df_pred, on='line_id')
df.head()

Unnamed: 0,line_id,target,prediction
0,0,38.682374,39.346196
1,2,32.982233,24.313224
2,6,43.440548,45.143301
3,10,32.219988,38.668763
4,13,0.0,3.012409


In [6]:
# Metric depends on the task encoded in the dataset suffix:
# ROC AUC for classification ('c'), RMSE otherwise.
y_true = df.target.values
y_pred = df.prediction.values
if dataset[-1] == 'c':
    score = roc_auc_score(y_true, y_pred)
else:
    score = np.sqrt(mean_squared_error(y_true, y_pred))
print('Score {:0.4f}'.format(score))

Score 10.5086


In [7]:
#df.memory_usage().sum()

# Пройдем весь путь с одним датасетом

In [8]:
import argparse
import os
import pandas as pd
import pickle
import time

from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb

from sklearn import model_selection

from utils import transform_datetime_features
from sdsj_feat import load_data

In [9]:
#os.system('python train.py --mode {} --train-csv {} --model-dir {}'.format(
#        'regression' if dataset[-1] == 'r' else 'classification',
#        '{}/{}/train.csv'.format(data_dir, dataset),
#        '{}/{}/'.format(result_dir, dataset)
#    ))

In [10]:
#parser = argparse.ArgumentParser()
#parser.add_argument('--train-csv', required=True)
#parser.add_argument('--model-dir', required=True)
#parser.add_argument('--mode', choices=['classification', 'regression'], required=True)
#args = parser.parse_args()

# Train a LightGBM model on the selected dataset and persist it, together with
# the preprocessing config returned by load_data, to <model_dir>/model_config.pkl.
model_dir = 'exp'
# open(..., 'wb') below does not create missing directories.
os.makedirs(model_dir, exist_ok=True)

start_time = time.time()

df_X, df_y, model_config, _ = load_data('{}/{}/train.csv'.format(data_dir, dataset))

# The task type is encoded in the dataset-name suffix ('r' = regression,
# anything else = classification), mirroring the train.py invocation.
mode = 'regression' if dataset[-1] == 'r' else 'classification'
model_config['mode'] = mode

num_boost_round = 600

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression' if mode == 'regression' else 'binary',
    'metric': 'rmse' if mode == 'regression' else 'auc',
    'learning_rate': 0.01,
    'num_leaves': 200,
    'feature_fraction': 0.70,
    'bagging_fraction': 0.70,
    'bagging_freq': 4,
    'max_depth': -1,
    'verbosity': -1,
    'reg_alpha': 0.3,
    'reg_lambda': 0.1,
    #'min_split_gain': 0.2,
    'min_child_weight': 10,
    'zero_as_missing': True,
    'num_threads': 4,
    'seed': 1,  # fixed seed so repeated runs are comparable
}

model = lgb.train(params, lgb.Dataset(df_X, label=df_y), num_boost_round=num_boost_round)

# Keep the fitted booster and its parameters next to the preprocessing config
# so the prediction step can restore everything from a single file.
model_config['model'] = model
model_config['params'] = params

model_config_filename = os.path.join(model_dir, 'model_config.pkl')
with open(model_config_filename, 'wb') as fout:
    pickle.dump(model_config, fout, protocol=pickle.HIGHEST_PROTOCOL)

print('Train time: {}'.format(time.time() - start_time))

Dataset read, shape (365, 41)
Transform datetime done, shape (365, 46)
Transform categorical done, shape (365, 46)
Used 43 columns
Train time: 0.3782505989074707


## Делаем предсказания

In [11]:
#call from validate
#    os.system('python predict.py --prediction-csv {} --test-csv {} --model-dir {}'.format(
#        '{}/{}/pred.csv'.format(result_dir, dataset),
#        '{}/{}/test.csv'.format(data_dir, dataset),
#        '{}/{}/'.format(result_dir, dataset)
#    ))

#    parser = argparse.ArgumentParser()
#    parser.add_argument('--test-csv', required=True)
#    parser.add_argument('--prediction-csv', type=argparse.FileType('w'), required=True)
#    parser.add_argument('--model-dir', required=True)
#    args = parser.parse_args()

# Restore the trained model, score the test CSV and write
# <model_dir>/pred.csv with the columns line_id and prediction.
start_time = time.time()

# Load the model bundle written by the training cell.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook, never a pickle from an untrusted source.
model_config_filename = os.path.join(model_dir, 'model_config.pkl')
with open(model_config_filename, 'rb') as fin:
    model_config = pickle.load(fin)

# Fail early with a clear message instead of a KeyError on the missing
# 'prediction' column when the stored mode is not one we know how to handle.
mode = model_config['mode']
if mode not in ('regression', 'classification'):
    raise ValueError('Unsupported mode in model config: {!r}'.format(mode))

test_csv = '{}/{}/test.csv'.format(data_dir, dataset)

X_scaled, _, _, df = load_data(test_csv, datatype='test', cfg=model_config)

model = model_config['model']
# Both modes use the same call on the stored LightGBM model, so no branching is needed.
df['prediction'] = model.predict(X_scaled)

# Write next to the model file instead of a hard-coded directory name.
prediction_csv = os.path.join(model_dir, 'pred.csv')
df[['line_id', 'prediction']].to_csv(prediction_csv, index=False)

print('Prediction time: {}'.format(time.time() - start_time))

Dataset read, shape (172, 41)
Transform datetime done, shape (172, 46)
Transform categorical done, shape (172, 46)
Used 43 columns
Prediction time: 0.15910553932189941


## Отдельные функции

In [12]:
# Reload the raw training table to try the helper functions one at a time.
train_csv_path = '{}/{}/train.csv'.format(data_dir, dataset)
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,datetime_0,target,number_0,number_1,number_2,number_3,number_4,number_5,number_6,number_7,...,number_30,number_31,number_32,number_33,number_34,number_35,number_36,number_37,number_38,line_id
0,2017-03-11,0.0,0.48584,0.169376,0.866834,0.968541,0.00106,1.0,0.573018,0.981073,...,0,0,0,0,0,0,0,0,0,1
1,2017-01-10,30.231827,0.303703,0.200773,0.869347,0.973308,0.009912,1.0,0.141864,0.993691,...,0,0,0,0,0,0,0,0,0,3
2,2017-02-26,0.0,0.568567,0.190993,0.929648,0.987607,0.002207,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,4
3,2017-02-02,22.756754,0.336114,0.051069,0.919598,0.983794,0.006191,1.0,0.166898,0.984227,...,0,0,0,0,0,0,0,0,0,5
4,2017-06-08,22.754865,0.330694,0.132386,0.655779,0.889418,0.199192,0.727273,0.126565,0.70347,...,0,0,0,0,0,0,0,0,0,7


In [13]:
# Derive features from the datetime columns using the project helper
# utils.transform_datetime_features. In the output below, datetime_0 gains the
# columns number_year_datetime_0, number_weekday_datetime_0,
# number_month_datetime_0, number_day_datetime_0 and number_hour_datetime_0.
# NOTE(review): `df` is overwritten with the helper's result, so re-running this
# cell feeds already-transformed data back into the helper; confirm the helper
# tolerates that, or reload the CSV first.
df = transform_datetime_features(df)
df.head()

Unnamed: 0,datetime_0,target,number_0,number_1,number_2,number_3,number_4,number_5,number_6,number_7,...,number_35,number_36,number_37,number_38,line_id,number_year_datetime_0,number_weekday_datetime_0,number_month_datetime_0,number_day_datetime_0,number_hour_datetime_0
0,2017-03-11,0.0,0.48584,0.169376,0.866834,0.968541,0.00106,1.0,0.573018,0.981073,...,0,0,0,0,1,2017,5,3,11,0
1,2017-01-10,30.231827,0.303703,0.200773,0.869347,0.973308,0.009912,1.0,0.141864,0.993691,...,0,0,0,0,3,2017,1,1,10,0
2,2017-02-26,0.0,0.568567,0.190993,0.929648,0.987607,0.002207,1.0,1.0,1.0,...,0,0,0,0,4,2017,6,2,26,0
3,2017-02-02,22.756754,0.336114,0.051069,0.919598,0.983794,0.006191,1.0,0.166898,0.984227,...,0,0,0,0,5,2017,3,2,2,0
4,2017-06-08,22.754865,0.330694,0.132386,0.655779,0.889418,0.199192,0.727273,0.126565,0.70347,...,0,0,0,0,7,2017,3,6,8,0
