In [1]:
import numpy as np
import pandas as pd

# 1. 데이터 로드

In [2]:
# 실습 환경은 인코딩 없이 진행 됨
train = pd.read_csv('data/X_train.csv', encoding='euc-kr') 
target = pd.read_csv('data/y_train.csv')
test = pd.read_csv('data/X_test.csv', encoding='euc-kr')

print(train.shape, target.shape, test.shape)

df = train.merge(target, on='cust_id', how='left')
print(df.shape)

(3500, 10) (3500, 2) (2482, 10)
(3500, 11)


# 2. 전처리 & Feature Engineering

In [3]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3500 entries, 0 to 3499
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   cust_id  3500 non-null   int64  
 1   총구매액     3500 non-null   int64  
 2   최대구매액    3500 non-null   int64  
 3   환불금액     1205 non-null   float64
 4   주구매상품    3500 non-null   object 
 5   주구매지점    3500 non-null   object 
 6   내점일수     3500 non-null   int64  
 7   내점당구매건수  3500 non-null   float64
 8   주말방문비율   3500 non-null   float64
 9   구매주기     3500 non-null   int64  
 10  gender   3500 non-null   int64  
dtypes: float64(3), int64(6), object(2)
memory usage: 328.1+ KB
None


In [4]:
# 실습 환경에서 desc 짤리기 때문에 출력 추가 코딩 필요
desc = df.describe(include="all") 

for col in df.columns:
    print(col,'\n', desc[col])
    pass

cust_id 
 count     3500.000000
unique            NaN
top               NaN
freq              NaN
mean      1749.500000
std       1010.507298
min          0.000000
25%        874.750000
50%       1749.500000
75%       2624.250000
max       3499.000000
Name: cust_id, dtype: float64
총구매액 
 count     3.500000e+03
unique             NaN
top                NaN
freq               NaN
mean      9.191925e+07
std       1.635065e+08
min      -5.242152e+07
25%       4.747050e+06
50%       2.822270e+07
75%       1.065079e+08
max       2.323180e+09
Name: 총구매액, dtype: float64
최대구매액 
 count     3.500000e+03
unique             NaN
top                NaN
freq               NaN
mean      1.966424e+07
std       3.199235e+07
min      -2.992000e+06
25%       2.875000e+06
50%       9.837000e+06
75%       2.296250e+07
max       7.066290e+08
Name: 최대구매액, dtype: float64
환불금액 
 count     1.205000e+03
unique             NaN
top                NaN
freq               NaN
mean      2.407822e+07
std       4.746453e+

In [5]:
# null - 환불금액
print(sum(df['환불금액'].isna())/df.shape[0])
# 65%가 null min=
print(df['환불금액'].min())
# min 5600.0 이므로, 0으로 대체
df['환불금액'] = df['환불금액'].fillna(0)
print(df.isna().sum().sum())

0.6557142857142857
5600.0
0


In [None]:
import scipy.stats
#dir(scipy.stats)

In [None]:
from scipy.stats import skew, kurtosis

df.dtypes
"""
총구매액         int64
최대구매액        int64
환불금액       float64
내점일수         int64
내점당구매건수    float64
주말방문비율     float64
구매주기         int64
gender       int64
"""
num_type = ['총구매액','최대구매액','환불금액','내점일수','내점당구매건수','주말방문비율','구매주기','gender']

df[num_type].apply(lambda x : skew(x)).sort_values()

df['총구매액'] = np.log1p(df['총구매액'])
df['최대구매액'] = np.log1p(df['최대구매액'])
df['환불금액'] = np.log1p(df['환불금액'])

In [6]:
# object - 주구매상품, 주구매지점 
#print(df['주구매상품'].value_counts())
#print(df['주구매지점'].value_counts())

# label encoding

from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
df['주구매상품'] = enc.fit_transform(df['주구매상품'])
df['주구매지점'] = enc.fit_transform(df['주구매지점'])

In [7]:
# corr
df.corr()[(abs(df.corr()) > 0.7) & (abs(df.corr()) != 1)]

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기,gender
cust_id,,,,,,,,,,,
총구매액,,,0.70008,,,,,,,,
최대구매액,,0.70008,,,,,,,,,
환불금액,,,,,,,,,,,
주구매상품,,,,,,,,,,,
주구매지점,,,,,,,,,,,
내점일수,,,,,,,,,,,
내점당구매건수,,,,,,,,,,,
주말방문비율,,,,,,,,,,,
구매주기,,,,,,,,,,,


In [8]:
df.drop('최대구매액', axis=1, inplace=True)

In [9]:
# one-hot 
df = pd.get_dummies(df, columns=['주구매상품','주구매지점'], drop_first=True)

# 3. 분류 알고리즘 사용

In [10]:
# 데이터 분리
from sklearn.model_selection import train_test_split
target = df.gender
df.drop(['cust_id','gender'], axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.3, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2450, 70), (1050, 70), (2450,), (1050,))

In [11]:
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier

from sklearn.tree import DecisionTreeClassifier

In [18]:

#dir(sklearn.ensemble)
#help(sklearn.ensemble.RandomForestClassifier)
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

#rfc.fit(X_train, y_train)
#gbc.fit(X_train, y_train)

GradientBoostingClassifier()

# 4. 초매개변수 최적화

# 5. 모형 앙상블

In [21]:
vot = VotingClassifier(estimators=[
    ('rfc',rfc), ('gbc',gbc)
], voting='soft')


vot.fit(X_train, y_train)

VotingClassifier(estimators=[('rfc', RandomForestClassifier()),
                             ('gbc', GradientBoostingClassifier())],
                 voting='soft')

In [38]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=3, random_state=42)
kf.split(X_train, y_train)



<generator object _BaseKFold.split at 0x000001DFBFC2A348>

In [35]:
from sklearn.model_selection import cross_val_score
#help(cross_val_score)
score = cross_val_score(vot, df, target, scoring="roc_auc", cv=3)
score

array([0.66100841, 0.66527322, 0.67567051])

# 6. 예측

In [16]:
from sklearn.metrics import roc_auc_score

In [22]:
pred = vot.predict(X_test)
roc_auc_score(pred, y_test)

0.6293410940266126

# 7. 제출 

In [25]:
test

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기
0,3500,70900400,22000000,4050000.0,골프,부산본점,13,1.461538,0.789474,26
1,3501,310533100,38558000,48034700.0,농산물,잠실점,90,2.433333,0.369863,3
2,3502,305264140,14825000,30521000.0,가공식품,본 점,101,14.623762,0.083277,3
3,3503,7594080,5225000,,주방용품,부산본점,5,2.000000,0.000000,47
4,3504,1795790,1411200,,수산품,청량리점,3,2.666667,0.125000,8
...,...,...,...,...,...,...,...,...,...,...
2477,5977,82581500,23976000,,골프,부산본점,8,1.750000,0.642857,40
2478,5978,480000,480000,,섬유잡화,광주점,1,1.000000,0.000000,0
2479,5979,260003790,25750000,,남성 캐주얼,본 점,19,3.736842,0.915493,18
2480,5980,88991520,18120000,,육류,본 점,5,3.600000,0.444444,60


In [23]:
pred

array([1, 1, 0, ..., 0, 0, 0], dtype=int64)

In [None]:
# 답안 제출 예시
# 수험번호.csv 생성
# DataFrame.to_csv("0000.csv", index=False)
