In [180]:
# 분석 절차(PDCNLDNSAER)
# 1. Package import
# 2. Data loading
# 3. Column select: 구조 파악 및 필요한 컬럼 선별
# 4. NaN: 결측치 처리
# 5. Label encoding: 범주형 변수의 변환
# 6. Derived variable: 파생 변수 만들기
# 7. Normalization: 정규화
# 8. Split: 데이터 분할
# 9. Analysis: 분석
# 10. Evaluation: 평가
# 11. Result save: 결과 저장

## 쇼핑몰 성별 예측값 산출

<img src='./images/3_b_1.jpg' style='width: 70%'>

In [181]:
import pandas as pd

# Load the training features, the submission (test) features and the training
# labels, in that order.
X, X_sub, Y = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('./data/x_train.csv', './data/x_test.csv', './data/y_train.csv')
)

# Report (rows, columns) for each frame, in the same order they were loaded.
for frame in (X, X_sub, Y):
    print(frame.shape)

(3500, 10)
(2482, 10)
(3500, 2)


In [182]:
# Stack the training rows and then the submission rows into one frame so the
# preprocessing below is applied to both; row-wise stacking (axis=0) is the
# pandas default and the index is renumbered from 0.
dfX = pd.concat((X, X_sub), ignore_index=True)

In [183]:
# Peek at the last two rows of the combined frame (X_sub was concatenated
# last, so these rows come from the submission data).
dfX.tail(2)

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기
5980,5980,88991520,18120000,,육류,본 점,5,3.6,0.444444,60
5981,5981,623700,209000,,가공식품,영등포점,2,5.0,0.0,31


In [184]:
# Data preprocessing
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
dfX.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5982 entries, 0 to 5981
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   cust_id  5982 non-null   int64  
 1   총구매액     5982 non-null   int64  
 2   최대구매액    5982 non-null   int64  
 3   환불금액     2076 non-null   float64
 4   주구매상품    5982 non-null   object 
 5   주구매지점    5982 non-null   object 
 6   내점일수     5982 non-null   int64  
 7   내점당구매건수  5982 non-null   float64
 8   주말방문비율   5982 non-null   float64
 9   구매주기     5982 non-null   int64  
dtypes: float64(3), int64(5), object(2)
memory usage: 467.5+ KB
None


In [185]:
# Number of missing values in each column of the combined frame.
missing_counts = dfX.isna().sum()
print(missing_counts)

cust_id       0
총구매액          0
최대구매액         0
환불금액       3906
주구매상품         0
주구매지점         0
내점일수          0
내점당구매건수       0
주말방문비율        0
구매주기          0
dtype: int64


In [186]:
# Mean refund amount for each main-product category; categories whose refund
# values are all missing show up as NaN.
refund_by_product = dfX.groupby('주구매상품')['환불금액']
print(refund_by_product.mean())

주구매상품
가공식품      2.080863e+07
가구        1.452875e+07
건강식품      1.416927e+07
골프        3.457147e+07
구두        2.521238e+06
기타        1.843526e+07
남성 캐주얼    5.945762e+06
남성 트랜디    5.652510e+07
남성정장      1.924602e+07
농산물       2.300891e+07
대형가전      8.760040e+07
디자이너      6.341434e+07
란제리/내의    1.131132e+07
명품        3.761344e+07
모피/피혁     2.301420e+07
보석        3.200000e+06
생활잡화      6.528000e+06
섬유잡화      5.305093e+06
셔츠        3.202311e+06
소형가전               NaN
수산품       7.737106e+06
스포츠       1.309125e+07
시티웨어      2.018025e+07
식기        2.352650e+07
아동        2.899493e+07
악기                 NaN
액세서리      6.888000e+06
육류        1.621335e+07
일용잡화      7.951950e+06
젓갈/반찬     1.810238e+07
주류        5.600000e+03
주방가전      6.439601e+07
주방용품      6.944425e+06
차/커피      1.060484e+07
축산가공      8.948509e+06
침구/수예     2.381630e+07
캐주얼       2.737687e+07
커리어       1.695097e+07
통신/컴퓨터             NaN
트래디셔널     1.868744e+07
피혁잡화      3.321309e+06
화장품       2.085498e+07
Name: 환불금액, dtype: float64


In [187]:
# Per-row fill candidate: the mean refund amount of the row's main-product
# category, aligned to dfX's index.
temp = dfX['환불금액'].groupby(dfX['주구매상품']).transform('mean')

In [188]:
# Fill missing refund amounts with the mean of the same product category (temp).
# The result is assigned back to the column instead of calling
# mask(..., inplace=True) on dfX['환불금액']: an in-place method on a column
# selection is chained assignment, which raises a warning in recent pandas and
# leaves dfX unchanged once Copy-on-Write is enabled.
dfX['환불금액'] = dfX['환불금액'].fillna(temp)

In [189]:
# Re-count missing values per column after the per-category fill.
dfX.isna().sum()

cust_id     0
총구매액        0
최대구매액       0
환불금액       11
주구매상품       0
주구매지점       0
내점일수        0
내점당구매건수     0
주말방문비율      0
구매주기        0
dtype: int64

In [190]:
# Fill the refund amounts that are still missing with the overall column mean.
# Assigned back rather than using fillna(..., inplace=True) on dfX['환불금액']:
# the in-place call on a column selection is chained assignment and does not
# update dfX under pandas Copy-on-Write.
dfX['환불금액'] = dfX['환불금액'].fillna(dfX['환불금액'].mean())

In [191]:
# Per-column count of missing values after both fill steps.
print(dfX.isnull().sum())

cust_id    0
총구매액       0
최대구매액      0
환불금액       0
주구매상품      0
주구매지점      0
내점일수       0
내점당구매건수    0
주말방문비율     0
구매주기       0
dtype: int64


In [192]:
# Column dtypes and non-null counts of the combined frame at this point.
dfX.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5982 entries, 0 to 5981
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   cust_id  5982 non-null   int64  
 1   총구매액     5982 non-null   int64  
 2   최대구매액    5982 non-null   int64  
 3   환불금액     5982 non-null   float64
 4   주구매상품    5982 non-null   object 
 5   주구매지점    5982 non-null   object 
 6   내점일수     5982 non-null   int64  
 7   내점당구매건수  5982 non-null   float64
 8   주말방문비율   5982 non-null   float64
 9   구매주기     5982 non-null   int64  
dtypes: float64(3), int64(5), object(2)
memory usage: 467.5+ KB


In [193]:
# Label-encode the two text columns: every distinct value is replaced by its
# integer category code.
for column in ('주구매상품', '주구매지점'):
    dfX[column] = dfX[column].astype('category').cat.codes

In [194]:
# 테스트 함수

In [195]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [196]:
def get_score(model,xtrain,xtest,ytrain,ytest):
    """Summarise a fitted classifier as a single formatted string.

    Parameters
    ----------
    model : fitted estimator exposing ``score`` and ``predict_proba``.
    xtrain, ytrain : training features and labels.
    xtest, ytest : held-out features and labels.

    Returns
    -------
    str
        Three space-separated numbers with four decimals each:
        ``model.score`` on the training data, ``model.score`` on the held-out
        data, and the ROC AUC of the held-out data computed from column 1 of
        ``predict_proba``.
    """
    train_score = model.score(xtrain, ytrain)
    test_score = model.score(xtest, ytest)
    # Column 1 of predict_proba is the probability of the second class.
    test_auc = roc_auc_score(ytest, model.predict_proba(xtest)[:, 1])
    return '{:.4f} {:.4f} {:.4f}'.format(train_score, test_score, test_auc)

In [197]:
def make_models(xtrain,xtest,ytrain,ytest):
    """Fit a fixed line-up of classifiers and print one score line per model.

    Each line is a model tag (``model1`` .. ``model5``), an optional
    hyper-parameter value, and the string returned by ``get_score``.

    Line-up, in print order:
      * model1 - logistic regression (max_iter=5000)
      * model2 - k-nearest neighbours for k = 1..9
      * model3 - decision tree, unrestricted and then max_depth = 1..9
      * model4 - random forest, unrestricted and then max_depth = 3..7
      * model5 - XGBoost classifier with logloss as eval metric

    Nothing is returned; the function only prints.
    """
    def report(estimator, *tag):
        # Fit on the training split, then print the tag followed by the scores.
        fitted = estimator.fit(xtrain, ytrain)
        print(*tag, get_score(fitted, xtrain, xtest, ytrain, ytest))

    report(LogisticRegression(max_iter=5000), 'model1')

    for n_neighbors in range(1, 10):
        report(KNeighborsClassifier(n_neighbors), 'model2', n_neighbors)

    report(DecisionTreeClassifier(random_state=0), 'model3')
    for depth in range(1, 10):
        report(DecisionTreeClassifier(max_depth=depth, random_state=0), 'model3', depth)

    report(RandomForestClassifier(random_state=0), 'model4')
    for depth in range(3, 8):
        report(RandomForestClassifier(max_depth=depth, random_state=0), 'model4', depth)

    report(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), 'model5')

In [198]:
def get_data(dfX,Y):
    """Split the combined feature frame into train/submission parts and standardise them.

    The split point is taken from ``len(Y)`` instead of a hard-coded row count,
    so the function keeps working when the training set size changes.

    Parameters
    ----------
    dfX : DataFrame
        Training rows first, followed by the submission rows. Must contain a
        'cust_id' column, which is dropped before scaling.
    Y : DataFrame
        Labels with a 'gender' column, one row per training row.

    Returns
    -------
    tuple
        ``(X1_use, X1_sub, Y1)``: the scaled training features, the scaled
        submission features, and ``Y['gender']``.

    Raises
    ------
    ValueError
        If ``Y`` has more rows than ``dfX``, i.e. the label rows cannot all be
        matched to feature rows.
    """
    n_train = len(Y)
    if n_train > len(dfX):
        raise ValueError(
            'Y has {} rows but dfX only has {}'.format(n_train, len(dfX))
        )

    X = dfX.drop(columns=['cust_id'])
    X_use = X.iloc[:n_train, :]
    X_sub = X.iloc[n_train:, :]
    Y1 = Y['gender']

    # Fit the scaler on the labelled rows only and reuse it for the submission
    # rows, so both parts are scaled with the same statistics.
    scaler = StandardScaler()
    X1_use = scaler.fit_transform(X_use)
    X1_sub = scaler.transform(X_sub)

    print(X1_use.shape,X1_sub.shape,Y1.shape)
    return X1_use, X1_sub, Y1

In [199]:
# Build the scaled matrices, then hold out 30% of the labelled rows
# (stratified on the label, fixed seed) to compare the candidate models.
X1_use, X1_sub, Y1 = get_data(dfX, Y)
xtrain, xtest, ytrain, ytest = train_test_split(
    X1_use,
    Y1,
    test_size=0.3,
    stratify=Y1,
    random_state=0,
)

make_models(xtrain, xtest, ytrain, ytest)

(3500, 9) (2482, 9) (3500,)
model1 0.6322 0.6410 0.6582
model2 1 1.0000 0.5676 0.5369
model2 2 0.7955 0.6114 0.5560
model2 3 0.7951 0.6019 0.5770
model2 4 0.7404 0.6305 0.5967
model2 5 0.7449 0.6181 0.6009
model2 6 0.7188 0.6190 0.5953
model2 7 0.7159 0.6000 0.6042
model2 8 0.7143 0.6248 0.6155
model2 9 0.7045 0.6219 0.6138
model3 1.0000 0.5533 0.5314
model3 1 0.6241 0.6238 0.6228
model3 2 0.6441 0.6590 0.6542
model3 3 0.6486 0.6657 0.6781
model3 4 0.6645 0.6467 0.6717
model3 5 0.6812 0.6410 0.6542
model3 6 0.7024 0.6343 0.6522
model3 7 0.7078 0.6381 0.6494
model3 8 0.7371 0.6286 0.6253
model3 9 0.7522 0.6276 0.5956
model4 1.0000 0.6543 0.6509
model4 3 0.6580 0.6476 0.6885
model4 4 0.6776 0.6495 0.6957
model4 5 0.6980 0.6562 0.6968
model4 6 0.7298 0.6590 0.6979
model4 7 0.7665 0.6638 0.6992
model5 0.9902 0.6410 0.6307


In [200]:
# Model selection: a random forest limited to depth 6, with a fixed seed.
# NOTE(review): the model is fitted on the 70% training split only; consider
# refitting on all labelled rows (X1_use, Y1) before predicting the submission set.
model=RandomForestClassifier(max_depth=6,random_state=0).fit(xtrain,ytrain)

In [201]:
# 제출 데이터 저장

In [202]:
# Column names of the raw submission frame as loaded from x_test.csv
# (the preprocessing above was applied to dfX, not to X_sub).
X_sub.columns

Index(['cust_id', '총구매액', '최대구매액', '환불금액', '주구매상품', '주구매지점', '내점일수', '내점당구매건수',
       '주말방문비율', '구매주기'],
      dtype='object')

In [204]:
# Probability of the second class (column 1 of predict_proba) for every
# scaled submission row.
pred=model.predict_proba(X1_sub)[:,1]

In [205]:
# Show the predicted probabilities.
pred

array([0.51513189, 0.17089939, 0.25540972, ..., 0.50088735, 0.39587348,
       0.50487367])

In [208]:
# Submission table: the customer id of each submission row next to its
# predicted probability, stored in a 'gender' column.
submission = X_sub[['cust_id']].assign(gender=pred)

In [210]:
# Show the submission table.
submission

Unnamed: 0,cust_id,gender
0,3500,0.515132
1,3501,0.170899
2,3502,0.255410
3,3503,0.410702
4,3504,0.459112
...,...,...
2477,5977,0.532239
2478,5978,0.526345
2479,5979,0.500887
2480,5980,0.395873


In [211]:
# Write the submission table to '123.csv' in the working directory,
# without the DataFrame index column.
submission.to_csv('123.csv',index=False)