In [31]:
# 분석 절차(PDCNLDNSAER)
# 1. Package import
# 2. Data loading
# 3. Column select: 구조 파악 및 필요한 컬럼 선별
# 4. NaN: 결측치 처리
# 5. Label encoding: 범주형 변수의 변환
# 6. Derivative variable: 파생 변수 만들기
# 7. Normal: 정규화
# 8. Split: 데이터 분할
# 9. Analysis: 분석
# 10. Evaluation: 평가
# 11. Result save: 결과 저장

## 쇼핑몰 성별 예측값 산출

<img src='./images/3_b_1.jpg' style='width: 70%'>

In [32]:
# NPTLSMRRXX
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
from xgboost import XGBClassifier
from xgboost import XGBRegressor

x_train = pd.read_csv('./data/x_train.csv', encoding='utf-8')
x_test = pd.read_csv('./data/x_test.csv', encoding='utf-8')
y_train = pd.read_csv('./data/y_train.csv', encoding='utf-8')

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)

(3500, 10)
(2482, 10)
(3500, 2)


In [33]:
x_train.head()

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기
0,0,68282840,11264000,6860000.0,기타,강남점,19,3.894737,0.527027,17
1,1,2136000,2136000,300000.0,스포츠,잠실점,2,1.5,0.0,1
2,2,3197000,1639000,,남성 캐주얼,관악점,2,2.0,0.0,1
3,3,16077620,4935000,,기타,광주점,18,2.444444,0.318182,16
4,4,29050000,24000000,,보석,본 점,2,1.5,0.0,85


In [34]:
y_train.head()

Unnamed: 0,cust_id,gender
0,0,0
1,1,0
2,2,1
3,3,1
4,4,0


In [35]:
train = pd.concat([x_train, x_test], axis=0)
train.shape

(5982, 10)

In [36]:
encoder = LabelEncoder()
train['주구매상품'] = encoder.fit_transform(train['주구매상품'])
train['주구매지점'] = encoder.fit_transform(train['주구매지점'])
train.head()

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기
0,0,68282840,11264000,6860000.0,5,0,19,3.894737,0.527027,17
1,1,2136000,2136000,300000.0,21,19,2,1.5,0.0,1
2,2,3197000,1639000,,6,1,2,2.0,0.0,1
3,3,16077620,4935000,,5,2,18,2.444444,0.318182,16
4,4,29050000,24000000,,15,8,2,1.5,0.0,85


In [37]:
x_train = train[:3500]
x_test = train[3500:]
print(x_train.shape)
print(x_train.shape)

(3500, 10)
(3500, 10)


In [38]:
y_train.drop(columns=['cust_id'], axis=1, inplace=True)
y_train.head()

Unnamed: 0,gender
0,0
1,0
2,1
3,1
4,0


In [39]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.3, shuffle=True, random_state=0)
print(x_train.shape)
print(y_train.shape)

(2450, 10)
(2450, 1)


In [40]:
model = XGBClassifier(n_estimators=80, max_depth=6)
model.fit(x_train, y_train)
y_val_p = model.predict(x_val)
print(y_val_p)
print(roc_auc_score(y_val, y_val_p))

[1 0 1 ... 0 0 0]
0.5871268021341809


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [41]:
x_test.head()

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기
0,3500,70900400,22000000,4050000.0,3,9,13,1.461538,0.789474,26
1,3501,310533100,38558000,48034700.0,9,19,90,2.433333,0.369863,3
2,3502,305264140,14825000,30521000.0,0,8,101,14.623762,0.083277,3
3,3503,7594080,5225000,,32,9,5,2.0,0.0,47
4,3504,1795790,1411200,,20,22,3,2.666667,0.125,8


In [42]:
x_test_cust_id = x_test['cust_id']
print(x_test_cust_id.head())

0    3500
1    3501
2    3502
3    3503
4    3504
Name: cust_id, dtype: int64


In [43]:
x_train_p = model.predict_proba(x_train)
x_train_p[0:5]

array([[0.41993266, 0.58006734],
       [0.6965463 , 0.30345368],
       [0.9205462 , 0.07945382],
       [0.66499037, 0.33500963],
       [0.28220862, 0.7177914 ]], dtype=float32)

In [50]:
df1 = pd.DataFrame(np.round(x_train_p[:,1], 3)).rename(columns={0:'gender'})
df1.head()

Unnamed: 0,gender
0,0.58
1,0.303
2,0.079
3,0.335
4,0.718


In [51]:
result = pd.concat([x_test_cust_id, df1], axis=1)
result.head()

Unnamed: 0,cust_id,gender
0,3500,0.58
1,3501,0.303
2,3502,0.079
3,3503,0.335
4,3504,0.718


In [53]:
result.to_csv('./result/a_1_2.csv', index=False)