In [1]:
import numpy as np
import pandas as pd

In [2]:
pd.set_option('max_rows', None)
pd.set_option('max_columns', None)

# 1. 데이터 로드

In [3]:
# 실습 환경은 인코딩 없이 진행 됨
train = pd.read_csv('data/X_train.csv', encoding='euc-kr') 
target = pd.read_csv('data/y_train.csv')
test = pd.read_csv('data/X_test.csv', encoding='euc-kr')

print(train.shape, target.shape, test.shape)

df = train.merge(target, on='cust_id', how='left')
print(df.shape)

(3500, 10) (3500, 2) (2482, 10)
(3500, 11)


In [4]:
df = pd.concat([df, test])
print(df.shape)

(5982, 11)


# 2. 전처리 & Feature Engineering

In [5]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5982 entries, 0 to 2481
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   cust_id  5982 non-null   int64  
 1   총구매액     5982 non-null   int64  
 2   최대구매액    5982 non-null   int64  
 3   환불금액     2076 non-null   float64
 4   주구매상품    5982 non-null   object 
 5   주구매지점    5982 non-null   object 
 6   내점일수     5982 non-null   int64  
 7   내점당구매건수  5982 non-null   float64
 8   주말방문비율   5982 non-null   float64
 9   구매주기     5982 non-null   int64  
 10  gender   3500 non-null   float64
dtypes: float64(4), int64(5), object(2)
memory usage: 560.8+ KB
None


In [6]:
print(df.describe(include="all") )

            cust_id          총구매액         최대구매액          환불금액 주구매상품 주구매지점  \
count   5982.000000  5.982000e+03  5.982000e+03  2.076000e+03  5982  5982   
unique          NaN           NaN           NaN           NaN    42    24   
top             NaN           NaN           NaN           NaN    기타  본  점   
freq            NaN           NaN           NaN           NaN  1060  1803   
mean    2990.500000  9.569838e+07  2.053814e+07  2.469452e+07   NaN   NaN   
std     1726.998987  1.676480e+08  3.330805e+07  5.281222e+07   NaN   NaN   
min        0.000000 -5.242152e+07 -3.744000e+07  5.600000e+03   NaN   NaN   
25%     1495.250000  4.867800e+06  2.875000e+06  2.304000e+06   NaN   NaN   
50%     2990.500000  2.898500e+07  1.019760e+07  7.627000e+06   NaN   NaN   
75%     4485.750000  1.142893e+08  2.447250e+07  2.353250e+07   NaN   NaN   
max     5981.000000  2.861238e+09  7.066290e+08  8.715144e+08   NaN   NaN   

               내점일수      내점당구매건수       주말방문비율         구매주기       gender  
c

In [7]:
# null - 환불금액
print(sum(df['환불금액'].isna())/df.shape[0])
# 65%가 null min=
print(df['환불금액'].min())
# min 5600.0 이므로, 0으로 대체
df['환불금액'] = df['환불금액'].fillna(0)
print(df.isna().sum().sum())

0.6529588766298897
5600.0
2482


In [8]:
# object - 주구매상품, 주구매지점 
#print(df['주구매상품'].value_counts())
#print(df['주구매지점'].value_counts())

# label encoding

from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
df['주구매상품'] = enc.fit_transform(df['주구매상품'])
df['주구매지점'] = enc.fit_transform(df['주구매지점'])

In [9]:
# corr
df.corr()[(abs(df.corr()) > 0.7) & (abs(df.corr()) != 1)]

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기,gender
cust_id,,,,,,,,,,,
총구매액,,,,,,,,,,,
최대구매액,,,,,,,,,,,
환불금액,,,,,,,,,,,
주구매상품,,,,,,,,,,,
주구매지점,,,,,,,,,,,
내점일수,,,,,,,,,,,
내점당구매건수,,,,,,,,,,,
주말방문비율,,,,,,,,,,,
구매주기,,,,,,,,,,,


In [10]:
df.drop('최대구매액', axis=1, inplace=True)

In [11]:
# one-hot 
df = pd.get_dummies(df, columns=['주구매상품','주구매지점'], drop_first=True)

# 3. 분류 알고리즘 사용

In [12]:
# 데이터 분리
from sklearn.model_selection import train_test_split

In [13]:
train = df[~df.gender.isna()]
test = df[df.gender.isna()]

In [14]:
target = train.gender
train.drop(['cust_id'], axis=1, inplace=True)

train.drop(['gender'], axis=1, inplace=True)
test.drop(['gender'], axis=1, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(2450, 70) (1050, 70) (2450,) (1050,)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [15]:
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier

from sklearn.tree import DecisionTreeClassifier

In [16]:

#dir(sklearn.ensemble)
#help(sklearn.ensemble.RandomForestClassifier)
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

#rfc.fit(X_train, y_train)
#gbc.fit(X_train, y_train)

# 4. 초매개변수 최적화

In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
#help(GradientBoostingClassifier)

In [19]:
parmas = {'n_estimators':[100,200], 'criterion':["gini", "entropy"]}
rfc_gs = GridSearchCV(rfc, param_grid=parmas, scoring='roc_auc', cv=3, refit=True, return_train_score=True)
rfc_gs.fit(X_train, y_train)
print(rfc_gs.best_params_)
rfc = RandomForestClassifier(criterion= 'gini', n_estimators= 200)

{'criterion': 'gini', 'n_estimators': 200}


# 5. 모형 앙상블

In [20]:
vot = VotingClassifier(estimators=[
    ('rfc',rfc), ('gbc',gbc)
], voting='soft')
vot

VotingClassifier(estimators=[('rfc', RandomForestClassifier(n_estimators=200)),
                             ('gbc', GradientBoostingClassifier())],
                 voting='soft')

In [21]:
vot.fit(X_train, y_train)

VotingClassifier(estimators=[('rfc', RandomForestClassifier(n_estimators=200)),
                             ('gbc', GradientBoostingClassifier())],
                 voting='soft')

# 6. 예측

In [22]:
from sklearn.metrics import roc_auc_score

In [24]:
pred = vot.predict(X_test)
print(roc_auc_score(pred, y_test))

0.6336278652836471


# 7. 제출 

In [25]:
pred = vot.predict_proba(test.drop('cust_id', axis=1, inplace=False))
pred.shape

(2482, 2)

In [26]:
pred

array([[0.63305393, 0.36694607],
       [0.82475971, 0.17524029],
       [0.76317199, 0.23682801],
       ...,
       [0.31861937, 0.68138063],
       [0.60668592, 0.39331408],
       [0.50944108, 0.49055892]])

In [32]:
pred[:,1]

array([0.36694607, 0.17524029, 0.23682801, ..., 0.68138063, 0.39331408,
       0.49055892])

In [33]:
test['gender'] = pred[:,1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [34]:
test[['cust_id', 'gender']]

Unnamed: 0,cust_id,gender
0,3500,0.366946
1,3501,0.17524
2,3502,0.236828
3,3503,0.386661
4,3504,0.439737
5,3505,0.369258
6,3506,0.306309
7,3507,0.565442
8,3508,0.384894
9,3509,0.194355


In [35]:
# 답안 제출 예시
# 수험번호.csv 생성
test[['cust_id', 'gender']].to_csv("0000.csv", index=False)