# Feature engineering

## 데이터 읽어오기
`pd.read_csv`로 데이터를 읽어올 때 `parse_dates` 인자를 이용하여 __click_time__ 컬럼을 시간(datetime) 값으로 파싱한다.

In [1]:
import pandas as pd
# Load the sampled click log; `click_time` is converted to datetime values
# while the CSV is being read.
clicks = pd.read_csv(
    'train_sample.csv',
    parse_dates=['click_time'],
)

clicks 데이터의 앞부분(처음 5개 행) 확인

In [2]:
# Preview the first rows of the loaded frame to sanity-check the columns.
clicks.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,89489,3,1,13,379,2017-11-06 15:13:23,,0
1,204158,35,1,13,21,2017-11-06 15:41:07,2017-11-07 08:17:19,1
2,3437,6,1,13,459,2017-11-06 15:42:32,,0
3,167543,3,1,13,379,2017-11-06 15:56:17,,0
4,147509,3,1,13,379,2017-11-06 15:57:01,,0


__click_time__을 연,월,일,시,분 단위로 쪼개서 추가

In [3]:
click_time = clicks['click_time']

# Expand the timestamp into separate calendar / time-of-day columns.
# Each entry is (datetime component, target dtype); `year` keeps a wide
# integer type because it does not fit in uint8.
clicks = clicks.assign(
    **{
        part: getattr(click_time.dt, part).astype(dtype)
        for part, dtype in (
            ('year', 'int64'),
            ('month', 'uint8'),
            ('day', 'uint8'),
            ('hour', 'uint8'),
            ('minute', 'uint8'),
        )
    }
)
clicks.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,year,month,day,hour,minute
0,89489,3,1,13,379,2017-11-06 15:13:23,,0,2017,11,6,15,13
1,204158,35,1,13,21,2017-11-06 15:41:07,2017-11-07 08:17:19,1,2017,11,6,15,41
2,3437,6,1,13,459,2017-11-06 15:42:32,,0,2017,11,6,15,42
3,167543,3,1,13,379,2017-11-06 15:56:17,,0,2017,11,6,15,56
4,147509,3,1,13,379,2017-11-06 15:57:01,,0,2017,11,6,15,57


## 카테고리 feature 골라내기
연속형 데이터가 아닌 범주형(이산) 변수를 골라내어 라벨 인코딩한다.
 - ip, app, device, os, channel은 전부 범주형(이산) 데이터이므로 모델의 성능을 높이기 위하여 라벨 인코딩 실시 (각 고유값을 0부터 시작하는 정수 라벨로 변환)

In [4]:
from sklearn.preprocessing import LabelEncoder

# Columns that hold categorical identifiers rather than quantities.
cat_col = ['ip', 'app', 'device', 'os', 'channel']

# A single encoder instance is reused: `fit_transform` re-fits from scratch
# on every call, so each column still gets its own independent label mapping.
label_encoder = LabelEncoder()
for col in cat_col:
    encoded = label_encoder.fit_transform(clicks[col])
    clicks[col] = encoded

clicks.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,year,month,day,hour,minute
0,27226,3,1,13,120,2017-11-06 15:13:23,,0,2017,11,6,15,13
1,110007,35,1,13,10,2017-11-06 15:41:07,2017-11-07 08:17:19,1,2017,11,6,15,41
2,1047,6,1,13,157,2017-11-06 15:42:32,,0,2017,11,6,15,42
3,76270,3,1,13,120,2017-11-06 15:56:17,,0,2017,11,6,15,56
4,57862,3,1,13,120,2017-11-06 15:57:01,,0,2017,11,6,15,57


### Lightgbm 사용
lightgbm을 사용하기 위해 데이터 셋을 나누어준다.

**_데이터 셋을 나누는 `train_valid_split`함수를 이용하여 나눔_**

In [5]:
def train_valid_split(data_set, validation_size=0.1):
    """Split ``data_set`` by position into train, validation and test parts.

    The split preserves order (nothing is shuffled): the last
    ``int(len(data_set) * validation_size)`` rows form the test part, the
    same number of rows directly before them form the validation part, and
    everything earlier is the training part. The default of 0.1 therefore
    gives roughly an 8:1:1 split.

    Args:
        data_set: A sized, sliceable container such as a pandas DataFrame
            or a list.
        validation_size: Fraction of the rows used for the validation part
            and, separately, for the test part.

    Returns:
        A ``(train, valid, test)`` tuple of slices of ``data_set``.

    Raises:
        ValueError: If ``validation_size`` is negative.
    """
    if validation_size < 0:
        raise ValueError(
            f'validation_size must be non-negative, got {validation_size!r}'
        )

    total = len(data_set)
    size = int(total * validation_size)

    # Slice with explicit non-negative offsets. With negative offsets a
    # hold-out size of 0 breaks the split: ``data_set[:-0]`` is empty and
    # ``data_set[-0:]`` is the whole set, so every row would land in test.
    valid_start = max(total - 2 * size, 0)
    test_start = max(total - size, 0)

    train = data_set[:valid_start]
    valid = data_set[valid_start:test_start]
    test = data_set[test_start:]
    return train, valid, test

Lightgbm으로 모델을 학습하고, 검증(및 테스트) 데이터에 대한 ROC AUC 점수를 측정하여 출력하는 함수 생성

In [6]:
import lightgbm as lgb
from sklearn import metrics
import time
def model_train(train, valid, test=None, feature_col=None):
    """Train a LightGBM binary classifier and report its ROC AUC.

    The booster is trained on ``train`` with ``valid`` as the evaluation
    set, for at most 1000 rounds with early stopping after 20 rounds.
    Progress, wall-clock duration and scores are printed.

    Args:
        train: Training frame; must contain the ``is_attributed`` label.
        valid: Validation frame with the same columns as ``train``.
        test: Optional held-out frame; when given it is scored as well.
        feature_col: Columns used as model inputs. Defaults to every column
            of ``train`` except ``click_time``, ``attributed_time`` and
            ``is_attributed``.

    Returns:
        ``(booster, valid_auc)`` when ``test`` is None, otherwise
        ``(booster, test_predictions, test_auc)``.
    """
    target = 'is_attributed'
    if feature_col is None:
        feature_col = train.columns.drop(['click_time', 'attributed_time', 'is_attributed'])

    train_set = lgb.Dataset(train[feature_col], label=train[target])
    valid_set = lgb.Dataset(valid[feature_col], label=valid[target])

    # Booster hyper-parameters; the fixed seed keeps runs comparable.
    params = {
        'num_leaves': 64,
        'objective': 'binary',
        'metric': 'auc',
        'seed': 7,
    }
    max_rounds = 1000

    print('start of Training')
    started_at = time.time()
    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are no longer
    # accepted by `lgb.train` in LightGBM >= 4.0 (callbacks replace them);
    # confirm the pinned LightGBM version before upgrading.
    booster = lgb.train(
        params,
        train_set,
        num_boost_round=max_rounds,
        valid_sets=valid_set,
        early_stopping_rounds=20,
        verbose_eval=False,
    )
    print('end of Training')
    print(f'total duration = {time.time() - started_at}')

    valid_pred = booster.predict(valid[feature_col])
    score = metrics.roc_auc_score(valid[target], valid_pred)
    print(f'the score of model is {score*100}%')

    # Without a test frame only the validation score is returned.
    if test is None:
        return booster, score

    test_pred = booster.predict(test[feature_col])
    test_score = metrics.roc_auc_score(test[target], test_pred)
    print(f'test score is {test_score}')
    return booster, test_pred, test_score

train, valid, test로 데이터셋 분리

In [35]:
train, valid, test = train_valid_split(clicks)

# Report the size of each part, then the train-to-valid ratio.
print(len(train), len(valid), len(test))
# train holds roughly 8x as many rows as valid, i.e. an 8:1:1 split
train_to_valid = len(train) / len(valid)
print(int(train_to_valid))

1840449 230056 230056
8


모델 학습

In [50]:
# Baseline run on the label-encoded features; the returned tuple is bound
# to `_` because only the printed scores are of interest here.
_ = model_train(train,valid,test)

start of Training
end of Training
total duration = 57.59195137023926
the score of model is 96.2557153080171%
test score is 0.9729837383416626


성능(AUC)을 더 올리기 위하여 다음과 같은 feature engineering을 수행한다
 - interaction feature 생성 (각 카테고리 컬럼을 조합시켜 생성)
 - 6시간 단위(window)로 쪼개서 발생한 사건의 수 생성(**_과거부터 현재 바로 직전까지 정보 활용_**)
 - 미래의 정보(데이터셋 기준으로) 활용

In [7]:
import itertools #for문 작성시 조합을 위해
# Build one label-encoded column per unordered pair of categorical columns.
interaction = pd.DataFrame(index=clicks.index)
for col1, col2 in itertools.combinations(cat_col, 2):
    new_col_name = f'{col1}_{col2}'
    # Combine both values into one "<a>_<b>" string per row so that each
    # distinct pair becomes its own category.
    left_text = clicks[col1].map(str)
    right_text = clicks[col2].map(str)
    new_value = left_text.str.cat(right_text, sep='_')

    # `fit_transform` re-fits the shared encoder for every new column.
    interaction[new_col_name] = label_encoder.fit_transform(new_value)

In [8]:
# Attach the interaction columns to the main frame (joined on the index).
clicks = clicks.join(interaction)

# Re-split the widened frame and retrain to compare against the baseline.
train, valid, test = train_valid_split(clicks)
_ = model_train(train, valid, test=test)

start of Training
end of Training
total duration = 54.67475485801697
the score of model is 96.25675035856062%
test score is 0.9728261145172402
