### 2020 KMU D&A Machine Learning Session Week 4

In [1]:
import pandas as pd 
import numpy as np
import warnings

warnings.filterwarnings(action='ignore')

train = pd.read_csv('data/w4/tr_train.csv', encoding='utf-8')
test = pd.read_csv('data/w4/tr_test.csv', encoding='utf-8')

In [2]:
cust_train = pd.read_csv('data/w4/cust_train.csv', encoding='utf-8')
cust_test = pd.read_csv('data/w4/cust_test.csv', encoding = 'utf-8')

In [3]:
train['PD_BUY_AM'] = train['PD_BUY_AM'].map(lambda x : int(str(x).replace(',','')))
train['PD_BUY_CT']  = train['PD_BUY_CT'].map(lambda x: int(str(x).replace(',','')))
test['PD_BUY_AM']  = test['PD_BUY_AM'].map(lambda x: int(str(x).replace(',','')))
test['PD_BUY_CT']  = test['PD_BUY_CT'].map(lambda x: int(str(x).replace(',','')))

train['TOT_PAG_VIEW_CT'].fillna(0, inplace=True)
train['TOT_PAG_VIEW_CT'] = train['TOT_PAG_VIEW_CT'].apply(lambda x: int(x))

test['TOT_PAG_VIEW_CT'].fillna(0, inplace=True)
test['TOT_PAG_VIEW_CT'] = test['TOT_PAG_VIEW_CT'].apply(lambda x: int(x))

train['TOT_SESS_HR_V'].fillna(0, inplace=True)
train['TOT_SESS_HR_V'] = train['TOT_SESS_HR_V'].apply(lambda x: int(x.replace(',','')))

test['TOT_SESS_HR_V'].fillna(0, inplace=True)
test['TOT_SESS_HR_V'] = test['TOT_SESS_HR_V'].apply(lambda x: int(x.replace(',','')))

In [4]:
train.drop(columns=['CITY_NM', 'ZON_NM', 'PD_NM'], inplace=True)
test.drop(columns=['CITY_NM', 'ZON_NM', 'PD_NM'], inplace=True)

In [5]:
features_train = []
features_test = []

In [6]:
# 고객별 총 구매 금액, 평균 구매액
train['AMOUNT'] = train['PD_BUY_AM'] * train['PD_BUY_CT']
test['AMOUNT'] = test['PD_BUY_AM'] * test['PD_BUY_CT']

f = train.groupby('CLNT_ID')['AMOUNT'].agg([('총 구매금액', np.sum),
                                            ('평균 구매액', lambda x : np.round(np.mean(x)))]).reset_index()
features_train.append(f)

f = test.groupby('CLNT_ID')['AMOUNT'].agg([('총 구매금액', np.sum),
                                            ('평균 구매액', lambda x : np.round(np.mean(x)))]).reset_index()
features_test.append(f)

In [7]:
# 주말 방문 비율
train['date'] = pd.to_datetime(train['SESS_DT'], format='%Y%m%d')
test['date'] = pd.to_datetime(test['SESS_DT'], format='%Y%m%d')

f = train.groupby('CLNT_ID')['date'].agg([('주말방문비율', lambda x : np.mean(x.dt.dayofweek > 4))]).reset_index()
features_train.append(f)

f = test.groupby('CLNT_ID')['date'].agg([('주말방문비율', lambda x : np.mean(x.dt.dayofweek > 4))]).reset_index()
features_test.append(f)

In [8]:
# 고객별 총 세션 시간 평균
f = train.groupby('CLNT_ID')['TOT_SESS_HR_V'].mean().reset_index()
features_train.append(f)

f = test.groupby('CLNT_ID')['TOT_SESS_HR_V'].mean().reset_index()
features_test.append(f)

In [10]:
# 고객별 접속 기기
f = pd.get_dummies(train[['CLNT_ID', 'DVC_CTG_NM']].groupby('CLNT_ID').max())
features_train.append(f)

f = pd.get_dummies(test[['CLNT_ID', 'DVC_CTG_NM']].groupby('CLNT_ID').max())
features_test.append(f)

In [11]:
# 구매횟수 / 검색횟수
d1 = train.groupby('CLNT_ID')['PD_BUY_CT'].sum().reset_index()
d2 = train.groupby('CLNT_ID')['SEARCH_CNT'].sum().reset_index()
d3 = pd.merge(d1, d2, on=['CLNT_ID'])
d3['BUY_RATE'] = d3['PD_BUY_CT'] / d3['SEARCH_CNT']
features_train.append(d3.drop(columns=['PD_BUY_CT', 'SEARCH_CNT']))

d1 = test.groupby('CLNT_ID')['PD_BUY_CT'].sum().reset_index()
d2 = test.groupby('CLNT_ID')['SEARCH_CNT'].sum().reset_index()
d3 = pd.merge(d1, d2, on=['CLNT_ID'])
d3['BUY_RATE'] = d3['PD_BUY_CT'] / d3['SEARCH_CNT']
features_test.append(d3.drop(columns=['PD_BUY_CT', 'SEARCH_CNT']))

In [12]:
# 고객별 페이지 조회 횟수 평균
f = train.groupby('CLNT_ID')['TOT_PAG_VIEW_CT'].mean().reset_index()
features_train.append(f)

f = test.groupby('CLNT_ID')['TOT_PAG_VIEW_CT'].mean().reset_index()
features_test.append(f)

In [13]:
# 고객별 구매한 브랜드 종류 수
f = train.groupby('CLNT_ID')['PD_BRA_NM'].agg([('브랜드 수', lambda x : x.nunique())]).reset_index()
features_train.append(f)

f = test.groupby('CLNT_ID')['PD_BRA_NM'].agg([('브랜드 수', lambda x : x.nunique())]).reset_index()
features_test.append(f)

In [14]:
# 방문 요일
f = train.groupby('CLNT_ID')['date'].agg([('요일',lambda x : np.max(x.dt.dayofweek))]).reset_index()
features_train.append(f)

f = test.groupby('CLNT_ID')['date'].agg([('요일',lambda x : np.max(x.dt.dayofweek))]).reset_index()
features_test.append(f)

In [15]:
# 검색 키워드수
features_train.append(train.groupby('CLNT_ID')['KWD_NM'].agg([('키워드 수', lambda x : x.nunique())]).reset_index())
features_test.append(test.groupby('CLNT_ID')['KWD_NM'].agg([('키워드 수', lambda x : x.nunique())]).reset_index())

In [16]:
# 방문 횟수
features_train.append(train.groupby('CLNT_ID')['date'].agg([('방문횟수', lambda x: x.nunique())]).reset_index())
features_test.append(test.groupby('CLNT_ID')['date'].agg([('방문횟수', lambda x: x.nunique())]).reset_index())

In [17]:
# 구매주기
f = train.groupby('CLNT_ID')['date'].agg([('구매주기', lambda x: int(((x.max() - x.min()).days) / x.nunique()))]).reset_index()
features_train.append(f)
f = test.groupby('CLNT_ID')['date'].agg([('구매주기', lambda x: int(((x.max() - x.min()).days) / x.nunique()))]).reset_index()
features_test.append(f)

In [18]:
# A분류 라벨인코딩
from sklearn.preprocessing import LabelEncoder
CLASS = train['CLAC1_NM'].unique()
for cl in test['CLAC1_NM'].unique():
    if cl not in CLASS:
        CLASS.append(cl)
encoder = LabelEncoder()
encoder.fit(CLASS)

In [19]:
train['C1_LAB'] = encoder.transform(train['CLAC1_NM'])
test['C1_LAB'] = encoder.transform(test['CLAC1_NM'])

f = train[['CLNT_ID','C1_LAB']].groupby(['CLNT_ID','C1_LAB']).size().groupby('CLNT_ID').max()
f = pd.DataFrame({'CLNT_ID':f.index,'class_max_count':f.values})
features_train.append(f)

f = test[['CLNT_ID','C1_LAB']].groupby(['CLNT_ID','C1_LAB']).size().groupby('CLNT_ID').max()
f = pd.DataFrame({'CLNT_ID':f.index,'class_max_count':f.values})
features_test.append(f)

In [20]:
# C분류 상품종류
f = train.groupby('CLNT_ID')['CLAC3_NM'].agg([ ('상품종류', lambda x : x.nunique()) ]).reset_index()
features_train.append(f)
f = test.groupby('CLNT_ID')['CLAC3_NM'].agg([ ('상품종류', lambda x : x.nunique()) ]).reset_index()
features_test.append(f)

In [21]:
# 고가상품구매율
train['PD_PRICE'] = train['PD_BUY_AM'] / train['PD_BUY_CT']
price_4q = train.groupby('PD_C')['PD_PRICE'].mean().quantile([.25,.5,.75])
f = (train.groupby('CLNT_ID')['PD_PRICE']
     .agg([('고가상품구매율', lambda x: (x > price_4q.iloc[2]).mean().round(2))])
     .reset_index())
features_train.append(f)

test['PD_PRICE'] = test['PD_BUY_AM'] / test['PD_BUY_CT']
price_4q = test.groupby('PD_C')['PD_PRICE'].mean().quantile([.25,.5,.75])
f = (test.groupby('CLNT_ID')['PD_PRICE']
     .agg([('고가상품구매율', lambda x: (x > price_4q.iloc[2]).mean().round(2))])
     .reset_index())
features_test.append(f)

In [22]:
# A분류 상품별 카운팅
temp = train.copy()
scdata = pd.DataFrame({'CLNT_ID': np.sort(train.CLNT_ID.unique())})
for cate in CLASS:
    temp[cate] = 0
    temp.loc[(temp['CLAC1_NM'] == cate), cate] = 1
    ttt2 = temp.groupby('CLNT_ID')[cate].agg([(cate, np.sum)]).reset_index()
    scdata = pd.merge(scdata, ttt2, how='left')
scdata = scdata.fillna(0)
features_train.append(scdata)

temp = test.copy()
scdata = pd.DataFrame({'CLNT_ID': np.sort(test.CLNT_ID.unique())})
for cate in CLASS:
    temp[cate] = 0
    temp.loc[(temp['CLAC1_NM'] == cate), cate] = 1
    ttt2 = temp.groupby('CLNT_ID')[cate].agg([(cate, np.sum)]).reset_index()
    scdata = pd.merge(scdata, ttt2, how='left')
scdata = scdata.fillna(0)
features_test.append(scdata)

#### Feature 생성

In [23]:
data_train = pd.DataFrame({'CLNT_ID':np.sort(train.CLNT_ID.unique())})
for f in features_train:
    data_train = pd.merge(data_train, f, on=['CLNT_ID'], how='left')
    
data_test = pd.DataFrame({'CLNT_ID':np.sort(test.CLNT_ID.unique())})
for f in features_test:
    data_test = pd.merge(data_test, f, on=['CLNT_ID'], how='left')

#### ABC분석

In [24]:
f = train.groupby('CLNT_ID')['AMOUNT'].agg([('총구매액', np.sum),
                                            ('구매건수', np.size),
                                            ('평균구매액', lambda x : np.round(np.mean(x))),
                                            ('최대구매액', np.max)]).reset_index()
t_msort = f.sort_values(by='총구매액', ascending=False)
t_msort['cum_sum'] = t_msort['총구매액'].cumsum()
t_msort['cum_perc'] = 100*t_msort['cum_sum']/t_msort['총구매액'].sum()
t_msort

abc = t_msort.iloc[:,6]
reset = abc.reset_index()
del reset['index']
reset

t_msort["abc"] = 0

t_msort.loc[(t_msort['cum_perc'] <= 92.884075), 'abc'] = 2
t_msort.loc[(t_msort['cum_perc'] <= 78.737587), 'abc'] = 1
t_msort.loc[(t_msort['cum_perc'] > 92.884075), 'abc'] = 3

ttt = t_msort[['CLNT_ID','abc']]

data_train = pd.merge(data_train, ttt, on=['CLNT_ID'], how='left')

In [25]:
f = test.groupby('CLNT_ID')['AMOUNT'].agg([('총구매액', np.sum),
                                            ('구매건수', np.size),
                                            ('평균구매액', lambda x : np.round(np.mean(x))),
                                            ('최대구매액', np.max)]).reset_index()
t_msort = f.sort_values(by='총구매액', ascending=False)
t_msort['cum_sum'] = t_msort['총구매액'].cumsum()
t_msort['cum_perc'] = 100*t_msort['cum_sum']/t_msort['총구매액'].sum()
t_msort

abc = t_msort.iloc[:,6]
reset = abc.reset_index()
del reset['index']
reset

t_msort["abc"] = 0

t_msort.loc[(t_msort['cum_perc'] <= 92.884075), 'abc'] = 2
t_msort.loc[(t_msort['cum_perc'] <= 78.737587), 'abc'] = 1
t_msort.loc[(t_msort['cum_perc'] > 92.884075), 'abc'] = 3

ttt = t_msort[['CLNT_ID','abc']]

data_test = pd.merge(data_test, ttt, on=['CLNT_ID'], how='left')

In [28]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

#### cross_val_score

In [29]:
# from sklearn.model_selection import cross_val_score

# lr = LogisticRegression()
# scores = cross_val_score(lr, data_train, cust_train['LABEL'], scoring='neg_log_loss', cv=5)

# print(scores)

#### K-Fold 테스트

In [30]:
skf = StratifiedKFold(n_splits=5)

cv_acc = []
lr_score = []
for train_idx, test_idx in skf.split(data_train, cust_train['LABEL']):
    x_train, x_test = data_train.loc[train_idx,], data_train.loc[test_idx,]
    y_train, y_test = cust_train.loc[train_idx,'LABEL'], cust_train.loc[test_idx,'LABEL']
    
    mm = MinMaxScaler()
    mm.fit(x_train)
    x_train = mm.transform(x_train)
    mm.fit(x_test)
    x_test = mm.transform(x_test)

    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    result = lr.predict_proba(x_test)

#     knn = KNeighborsClassifier(n_neighbors=5)    
#     knn.fit(x_train, y_train)
#     result = knn.predict_proba(x_test)

#     tree = DecisionTreeClassifier(random_state=10)
#     tree.fit(x_train,y_train)
#     result = tree.predict_proba(x_test)

    acc = np.round(log_loss(y_test, result), 3)
    cv_acc.append(acc)
    lr_score.append(lr.score(x_test, y_test))
    
print(cv_acc)
print(lr_score)

[1.391, 1.391, 1.391, 1.391, 1.391]
[0.39847209289067104, 0.3987761540069554, 0.3991942380418464, 0.399061211303472, 0.398555682250095]


#### Submission

In [32]:
from sklearn.linear_model import LogisticRegression

# 정규화
mm = MinMaxScaler()
mm.fit(x_train)
x_train = mm.transform(x_train)
mm.fit(x_test)
x_test = mm.transform(x_test)

# 모델 생성 및 학습
lr = LogisticRegression()
lr.fit(data_train, cust_train.LABEL)

# 테스트 데이터 예측
pred = pd.DataFrame(lr.predict_proba(data_test))

# 결과값 정제 및 내보내기
result = pd.concat([cust_test.CLNT_ID,pred],axis=1)
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']
result.to_csv('data/w4/submit.csv',index=False)

#### 파라미터 튜닝

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
# 모델 생성, 하이퍼파라미터 튜닝
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
#평가지표
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

In [34]:
X_train, X_test, y_train, y_test = train_test_split(data_train,cust_train['LABEL'],stratify=cust_train['LABEL'],random_state=66)

#### Tree 파라미터 수정

In [35]:
tree = DecisionTreeClassifier(random_state = 0)
tree.fit(X_train,y_train)

print(tree.score(X_test,y_test))

0.3006871807346144


In [36]:
dt = DecisionTreeClassifier(random_state = 0)

param_grid={'criterion' :['gini','entropy'],
                 'max_depth':[4,5,6,8,10],
                 'min_samples_leaf':[3,5,10,50,100]}

gcv=GridSearchCV(dt, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
gcv.fit(X_train,y_train)
print('final params', gcv.best_params_)   # 최적의 파라미터 값 출력
print('best score', gcv.best_score_)      # 최고의 점수
print(gcv.score(X_test,y_test))

final params {'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 3}
best score 0.3990006486661802
0.39908173193870106


In [37]:
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train,y_train)

print(tree.score(X_test,y_test))

0.3006871807346144


In [38]:
dt = DecisionTreeClassifier(random_state = 0)

param_grid={'criterion' :['gini','entropy'],
                 'max_depth':[4,5,6,8,10],
                 'min_samples_leaf':[3,5,10,50,100]}

rcv = RandomizedSearchCV(dt, param_distributions=param_grid,scoring='accuracy', n_iter=8) #8번 반복
rcv.fit(X_train,y_train)
print('final params', rcv.best_params_)   # 최적의 파라미터 값 출력
print('best score', rcv.best_score_)      # 최고의 점수
print(rcv.score(X_test,y_test))

final params {'min_samples_leaf': 50, 'max_depth': 5, 'criterion': 'entropy'}
best score 0.3990057151481078
0.39920335684748237


In [39]:
print('기존모델 성능 : ', tree.score(X_test,y_test))
print('grid_search 이용해 튜닝한 모델 성능 : ', gcv.score(X_test, y_test))
print('random_search 이용해 튜닝한 모델 성능 : ', rcv.score(X_test, y_test))

기존모델 성능 :  0.3006871807346144
grid_search 이용해 튜닝한 모델 성능 :  0.39908173193870106
random_search 이용해 튜닝한 모델 성능 :  0.39920335684748237


#### LR

In [40]:
#기존모델
skf = StratifiedKFold(n_splits=5)

cv_acc = []
lr_score = []
for train_idx, test_idx in skf.split(data_train, cust_train['LABEL']):
    x_train, x_test = data_train.loc[train_idx,], data_train.loc[test_idx,]
    y_train, y_test = cust_train.loc[train_idx,'LABEL'], cust_train.loc[test_idx,'LABEL']
    
    mm = MinMaxScaler()
    mm.fit(x_train)
    x_train = mm.transform(x_train)
    mm.fit(x_test)
    x_test = mm.transform(x_test)

    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    result = lr.predict_proba(x_test)

#     knn = KNeighborsClassifier(n_neighbors=5)    
#     knn.fit(x_train, y_train)
#     result = knn.predict_proba(x_test)

#     tree = DecisionTreeClassifier(random_state=10)
#     tree.fit(x_train,y_train)
#     result = tree.predict_proba(x_test)

    acc = np.round(log_loss(y_test, result), 3)
    cv_acc.append(acc)
    lr_score.append(lr.score(x_test, y_test))
    
print(cv_acc)
print(lr_score)

[1.391, 1.391, 1.391, 1.391, 1.391]
[0.39847209289067104, 0.3987761540069554, 0.3991942380418464, 0.399061211303472, 0.398555682250095]


In [41]:
#약한규제
logreg1 = LogisticRegression(C=0.01).fit(x_train, y_train)

print("훈련: {:.3f}".format(logreg1.score(x_train, y_train)))
print("테스트: {:.3f}".format(logreg1.score(x_test, y_test)))

훈련: 0.399
테스트: 0.399


In [42]:
#강한규제
logreg2 = LogisticRegression(C=100).fit(x_train, y_train)

print("훈련: {:.3f}".format(logreg2.score(x_train, y_train)))
print("테스트: {:.3f}".format(logreg2.score(x_test, y_test)))

훈련: 0.399
테스트: 0.398


In [43]:
from sklearn.model_selection import GridSearchCV

# 파라메터 후보
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'penalty': ['l1', 'l2']}

# 그리드 서치 진행
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)        


grid_search.fit(x_train, y_train)
grid_search.score(x_test, y_test)

0.39885974914481187