# 필수 라이브러리 로딩

In [1]:
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Global matplotlib / seaborn configuration for the notebook.
mpl.rc('font', family='malgun gothic') # font setting
mpl.rc('axes', unicode_minus=False) # minus-sign setting for Unicode text

# Chart style settings (the font and unicode_minus options are passed again here)
sns.set(font="malgun gothic", rc={"axes.unicode_minus":False}, style='darkgrid')
plt.rc("figure", figsize=(10,8))

# Suppress all warnings from this point on.
warnings.filterwarnings("ignore")

# 데이터 불러오기

In [5]:
# Read the CSV from a hard-coded absolute path and preview the first rows.
data = pd.read_csv('C:/k_digital/source/data/dating.csv')
data.head()

Unnamed: 0,has_null,gender,age,age_o,race,race_o,importance_same_race,importance_same_religion,pref_o_attractive,pref_o_sincere,pref_o_intelligence,pref_o_funny,pref_o_ambitious,pref_o_shared_interests,attractive_o,sincere_o,intelligence_o,funny_o,ambitous_o,shared_interests_o,attractive_important,sincere_important,intellicence_important,funny_important,ambtition_important,shared_interests_important,attractive_partner,sincere_partner,intelligence_partner,funny_partner,ambition_partner,shared_interests_partner,interests_correlate,expected_happy_with_sd_people,expected_num_interested_in_me,like,guess_prob_liked,met,match
0,0,female,21.0,27.0,Asian/PacificIslander/Asian-American,European/Caucasian-American,2.0,4.0,35.0,20.0,20.0,20.0,0.0,5.0,6.0,8.0,8.0,8.0,8.0,6.0,15.0,20.0,20.0,15.0,15.0,15.0,6.0,9.0,7.0,7.0,6.0,5.0,0.14,3.0,2.0,7.0,6.0,0.0,0
1,0,female,21.0,22.0,Asian/PacificIslander/Asian-American,European/Caucasian-American,2.0,4.0,60.0,0.0,0.0,40.0,0.0,0.0,7.0,8.0,10.0,7.0,7.0,5.0,15.0,20.0,20.0,15.0,15.0,15.0,7.0,8.0,7.0,8.0,5.0,6.0,0.54,3.0,2.0,7.0,5.0,1.0,0
2,1,female,21.0,22.0,Asian/PacificIslander/Asian-American,Asian/PacificIslander/Asian-American,2.0,4.0,19.0,18.0,19.0,18.0,14.0,12.0,10.0,10.0,10.0,10.0,10.0,10.0,15.0,20.0,20.0,15.0,15.0,15.0,5.0,8.0,9.0,8.0,5.0,7.0,0.16,3.0,2.0,7.0,,1.0,1
3,0,female,21.0,23.0,Asian/PacificIslander/Asian-American,European/Caucasian-American,2.0,4.0,30.0,5.0,15.0,40.0,5.0,5.0,7.0,8.0,9.0,8.0,9.0,8.0,15.0,20.0,20.0,15.0,15.0,15.0,7.0,6.0,8.0,7.0,6.0,8.0,0.61,3.0,2.0,7.0,6.0,0.0,1
4,0,female,21.0,24.0,Asian/PacificIslander/Asian-American,Latino/HispanicAmerican,2.0,4.0,30.0,10.0,20.0,10.0,10.0,20.0,8.0,7.0,9.0,6.0,9.0,7.0,15.0,20.0,20.0,15.0,15.0,15.0,5.0,6.0,7.0,7.0,6.0,6.0,0.21,3.0,2.0,6.0,6.0,0.0,1


In [4]:
# Display up to 40 columns when a DataFrame is shown
pd.options.display.max_columns = 40

In [6]:
# Basic information about the dataset (row count, columns, non-null counts, dtypes)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Data columns (total 39 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   has_null                       8378 non-null   int64  
 1   gender                         8378 non-null   object 
 2   age                            8283 non-null   float64
 3   age_o                          8274 non-null   float64
 4   race                           8315 non-null   object 
 5   race_o                         8305 non-null   object 
 6   importance_same_race           8299 non-null   float64
 7   importance_same_religion       8299 non-null   float64
 8   pref_o_attractive              8289 non-null   float64
 9   pref_o_sincere                 8289 non-null   float64
 10  pref_o_intelligence            8289 non-null   float64
 11  pref_o_funny                   8280 non-null   float64
 12  pref_o_ambitious               8271 non-null   f

In [7]:
# Summary statistics of the numeric columns, rounded to two decimals
data.describe().round(2)

Unnamed: 0,has_null,age,age_o,importance_same_race,importance_same_religion,pref_o_attractive,pref_o_sincere,pref_o_intelligence,pref_o_funny,pref_o_ambitious,pref_o_shared_interests,attractive_o,sincere_o,intelligence_o,funny_o,ambitous_o,shared_interests_o,attractive_important,sincere_important,intellicence_important,funny_important,ambtition_important,shared_interests_important,attractive_partner,sincere_partner,intelligence_partner,funny_partner,ambition_partner,shared_interests_partner,interests_correlate,expected_happy_with_sd_people,expected_num_interested_in_me,like,guess_prob_liked,met,match
count,8378.0,8283.0,8274.0,8299.0,8299.0,8289.0,8289.0,8289.0,8280.0,8271.0,8249.0,8166.0,8091.0,8072.0,8018.0,7656.0,7302.0,8299.0,8299.0,8299.0,8289.0,8279.0,8257.0,8176.0,8101.0,8082.0,8028.0,7666.0,7311.0,8220.0,8277.0,1800.0,8138.0,8069.0,8003.0,8378.0
mean,0.87,26.36,26.36,3.78,3.65,22.5,17.4,20.27,17.46,10.69,11.85,6.19,7.18,7.37,6.4,6.78,5.47,22.51,17.4,20.27,17.46,10.68,11.85,6.19,7.18,7.37,6.4,6.78,5.47,0.2,5.53,5.57,6.13,5.21,0.05,0.16
std,0.33,3.57,3.56,2.85,2.81,12.57,7.04,6.78,6.09,6.13,6.36,1.95,1.74,1.55,1.95,1.79,2.16,12.59,7.05,6.78,6.09,6.12,6.36,1.95,1.74,1.55,1.95,1.79,2.16,0.3,1.73,4.76,1.84,2.13,0.28,0.37
min,0.0,18.0,18.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.83,1.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,24.0,24.0,1.0,1.0,15.0,15.0,17.39,15.0,5.0,9.52,5.0,6.0,6.0,5.0,6.0,4.0,15.0,15.0,17.39,15.0,5.0,9.52,5.0,6.0,6.0,5.0,6.0,4.0,-0.02,5.0,2.0,5.0,4.0,0.0,0.0
50%,1.0,26.0,26.0,3.0,3.0,20.0,18.37,20.0,18.0,10.0,10.64,6.0,7.0,7.0,7.0,7.0,6.0,20.0,18.18,20.0,18.0,10.0,10.64,6.0,7.0,7.0,7.0,7.0,6.0,0.21,6.0,4.0,6.0,5.0,0.0,0.0
75%,1.0,28.0,28.0,6.0,6.0,25.0,20.0,23.81,20.0,15.0,16.0,8.0,8.0,8.0,8.0,8.0,7.0,25.0,20.0,23.81,20.0,15.0,16.0,8.0,8.0,8.0,8.0,8.0,7.0,0.43,7.0,8.0,7.0,7.0,0.0,0.0
max,1.0,55.0,55.0,10.0,10.0,100.0,60.0,50.0,50.0,53.0,30.0,10.5,10.0,10.0,11.0,10.0,10.0,100.0,60.0,50.0,50.0,53.0,30.0,10.0,10.0,10.0,10.0,10.0,10.0,0.91,10.0,20.0,10.0,10.0,8.0,1.0


## 전처리 - 결측값 처리

In [8]:
# Number of missing values in each column
data.isna().sum()

has_null                            0
gender                              0
age                                95
age_o                             104
race                               63
race_o                             73
importance_same_race               79
importance_same_religion           79
pref_o_attractive                  89
pref_o_sincere                     89
pref_o_intelligence                89
pref_o_funny                       98
pref_o_ambitious                  107
pref_o_shared_interests           129
attractive_o                      212
sincere_o                         287
intelligence_o                    306
funny_o                           360
ambitous_o                        722
shared_interests_o               1076
attractive_important               79
sincere_important                  79
intellicence_important             79
funny_important                    89
ambtition_important                99
shared_interests_important        121
attractive_p

In [9]:
# Fraction of missing values in each column
data.isna().mean()

has_null                         0.000000
gender                           0.000000
age                              0.011339
age_o                            0.012413
race                             0.007520
race_o                           0.008713
importance_same_race             0.009429
importance_same_religion         0.009429
pref_o_attractive                0.010623
pref_o_sincere                   0.010623
pref_o_intelligence              0.010623
pref_o_funny                     0.011697
pref_o_ambitious                 0.012772
pref_o_shared_interests          0.015397
attractive_o                     0.025304
sincere_o                        0.034256
intelligence_o                   0.036524
funny_o                          0.042970
ambitous_o                       0.086178
shared_interests_o               0.128432
attractive_important             0.009429
sincere_important                0.009429
intellicence_important           0.009429
funny_important                  0

- has_null : 해당 행에 결측값(null)이 있는지 여부
- 상대방이 나를 평가한 점수 (xxx_o)
  - attractive_o : 매력
  - sincere_o : 성실
  - intelligence_o : 지능
  - funny_o : 재미
  - ambitous_o : 야망
  - shared_interests_o : 공유 관심사

- 내가 상대방의 각 항목을 얼마나 중요하게 생각하는가 (xxx_important)
  - attractive_important : 매력
  - sincere_important : 성실
  - intellicence_important : 지능
  - funny_important : 재미
  - ambtition_important : 야망
  - shared_interests_important : 공유 관심사

- 내가 상대방을 평가한 점수 (xxx_partner)
  - attractive_partner : 매력
  - sincere_partner : 성실
  - intelligence_partner : 지능
  - funny_partner : 재미
  - ambition_partner : 야망
  - shared_interests_partner : 공유 관심사

- interests_correlate : 참가자와 파트너의 관심사 간 상관관계
- expected_happy_with_sd_people : 데이트에서 만나는 사람들과 얼마나 행복할 것이라고 기대하는가
- expected_num_interested_in_me : 20명을 만난다면 그중 몇 명이 나와 사귀고 싶어 할 것이라고 생각하는가
- expected_num_matches : 몇 번의 매칭(만남)이 성사될 것이라고 예상하는가 (이 데이터의 컬럼 목록에는 포함되어 있지 않음)
- like : 파트너가 마음에 들었는가
- guess_prob_liked : 파트너가 나를 좋아할 가능성이 얼마라고 생각하는가
- met : 파트너를 이전에 만난 적이 있는가
- match : 매칭 결과


In [10]:
data.columns

Index(['has_null', 'gender', 'age', 'age_o', 'race', 'race_o',
       'importance_same_race', 'importance_same_religion', 'pref_o_attractive',
       'pref_o_sincere', 'pref_o_intelligence', 'pref_o_funny',
       'pref_o_ambitious', 'pref_o_shared_interests', 'attractive_o',
       'sincere_o', 'intelligence_o', 'funny_o', 'ambitous_o',
       'shared_interests_o', 'attractive_important', 'sincere_important',
       'intellicence_important', 'funny_important', 'ambtition_important',
       'shared_interests_important', 'attractive_partner', 'sincere_partner',
       'intelligence_partner', 'funny_partner', 'ambition_partner',
       'shared_interests_partner', 'interests_correlate',
       'expected_happy_with_sd_people', 'expected_num_interested_in_me',
       'like', 'guess_prob_liked', 'met', 'match'],
      dtype='object')

In [11]:
# Discard rows that lack any importance weight or any score given by the partner
data = data.dropna(
    subset=[
        # how important the partner considers each item
        'pref_o_attractive',
        'pref_o_sincere',
        'pref_o_intelligence',
        'pref_o_funny',
        'pref_o_ambitious',
        'pref_o_shared_interests',
        # the partner's rating of me on each item
        'attractive_o',
        'sincere_o',
        'intelligence_o',
        'funny_o',
        'ambitous_o',
        'shared_interests_o',
        # how important I consider each item
        'attractive_important',
        'sincere_important',
        'intellicence_important',
        'funny_important',
        'ambtition_important',
        'shared_interests_important',
    ]
)

In [12]:
# Replace every remaining missing value with the sentinel -99
data = data.fillna(-99)

## 전처리 - 피처 엔지니어링

### 나이 (내나이와 상대방의 나이)

In [13]:
def age_gap(x):
    """Return the signed age difference for one row.

    Args:
        x: A row-like mapping with 'age', 'age_o' and 'gender' entries.

    Returns:
        -99 when either age holds the missing-value sentinel -99.
        Otherwise ``age_o - age`` for rows whose gender is 'female',
        and ``age - age_o`` for every other row.
    """
    # Either age missing -> propagate the sentinel
    if x['age'] == -99 or x['age_o'] == -99:
        return -99
    if x['gender'] == 'female':
        minuend, subtrahend = x['age_o'], x['age']
    else:
        minuend, subtrahend = x['age'], x['age_o']
    return minuend - subtrahend

- 남녀중 누구 하나라도 결측일경우 결측값 반환
- 남자가 연상이면 +값, 여자가 연상이면 -값

In [14]:
# Compute the age gap row by row and store it in a new column
data['age_gap'] = data.apply(age_gap, axis=1)

### 인종
- 본인과 상대방이 같은 인종이면 1, 그렇지 않으면 -1, 결측값 -99도 존재

In [15]:
def same_race(x):
    """Flag whether the two race values of one row are equal.

    Args:
        x: A row-like mapping with 'race' and 'race_o' entries.

    Returns:
        -99 when either value is the missing-value sentinel -99,
        1 when the two values are equal, -1 otherwise.
    """
    if x['race'] == -99 or x['race_o'] == -99:
        return -99
    return 1 if x['race'] == x['race_o'] else -1

In [16]:
# Compute the same-race flag row by row and store it in a new column
data['same_race'] = data.apply(same_race, axis=1)

In [17]:
def same_race_point(x):
    """Return the same-race flag weighted by its stated importance.

    Args:
        x: A row-like mapping with 'same_race' (1, -1 or -99) and
            'importance_same_race' entries.

    Returns:
        -99 when either input holds the missing-value sentinel -99,
        otherwise ``same_race * importance_same_race``.
    """
    # Check both operands: multiplying a -99 importance by a same_race of -1
    # would otherwise produce +99, which looks like a real score.
    if x['same_race'] == -99 or x['importance_same_race'] == -99:
        return -99
    return x['same_race'] * x['importance_same_race']

In [18]:
# Compute the weighted same-race score row by row and store it in a new column
data['same_race_point'] = data.apply(same_race_point, axis=1)

### 평가항목별 점수 환산
- 평가점수 * 중요도

In [19]:
def rating(data, importance, score):
    """Multiply an importance weight by a score for one row.

    Args:
        data: A row-like mapping indexed by column name.
        importance: Key of the importance value.
        score: Key of the score value.

    Returns:
        -99 when either value is the missing-value sentinel -99,
        otherwise ``data[importance] * data[score]``.
    """
    weight = data[importance]
    if weight == -99:
        return -99
    points = data[score]
    if points == -99:
        return -99
    return weight * points

- pref_o_xxx : 상대방이 xxx 항목을 얼마나 중요하게 생각하는 지에 대한 응답
- xxx_o : 상대방이 본인에 대한 xxx 항목을 평가한 값
- xxx_important : xxx 항목에 대해 본인이 얼마나 중요하게 생각하는지에 대한 응답
- xxx_partner : 본인이 상대방에 대한 xxx 항목을 평가한 값

In [20]:
# Column groups are listed by name instead of by position so that re-running
# this cell after columns have been added or removed still selects the same
# columns. The (misspelled) names are the dataset's own column names.

# Importance weights stated by the partner
partner_imp = pd.Index([
    'pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence',
    'pref_o_funny', 'pref_o_ambitious', 'pref_o_shared_interests',
])
# The partner's rating of me
partner_rate_me = pd.Index([
    'attractive_o', 'sincere_o', 'intelligence_o',
    'funny_o', 'ambitous_o', 'shared_interests_o',
])
# My own importance weights
my_imp = pd.Index([
    'attractive_important', 'sincere_important', 'intellicence_important',
    'funny_important', 'ambtition_important', 'shared_interests_important',
])
# My rating of the partner
my_rate_partner = pd.Index([
    'attractive_partner', 'sincere_partner', 'intelligence_partner',
    'funny_partner', 'ambition_partner', 'shared_interests_partner',
])

In [22]:
# Partner-side item scores: the partner's importance weight times the
# partner's rating of me, one new column per item.
# The output column names are defined here so this cell does not fail with a
# NameError when no earlier cell has created `new_partner`.
new_partner = ['attractive_p', 'sincere_p', 'intelligence_p',
               'funny_p', 'ambtition_p', 'shared_interests_p']
for i, j, k in zip(new_partner, partner_imp, partner_rate_me):
    # `args` hands the two column names to rating(row, importance, score)
    data[i] = data.apply(rating, axis=1, args=(j, k))

In [23]:
# Own-side item scores: my importance weight times my rating of the partner,
# one new column per item.
# The output column names are defined here so this cell does not fail with a
# NameError when no earlier cell has created `new_me`.
new_me = ['attractive_m', 'sincere_m', 'intelligence_m',
          'funny_m', 'ambtition_m', 'shared_interests_m']
for i, j, k in zip(new_me, my_imp, my_rate_partner):
    # `args` hands the two column names to rating(row, importance, score)
    data[i] = data.apply(rating, axis=1, args=(j, k))

In [24]:
data.head()

Unnamed: 0,has_null,gender,age,age_o,race,race_o,importance_same_race,importance_same_religion,pref_o_attractive,pref_o_sincere,pref_o_intelligence,pref_o_funny,pref_o_ambitious,pref_o_shared_interests,attractive_o,sincere_o,intelligence_o,funny_o,ambitous_o,shared_interests_o,...,expected_num_interested_in_me,like,guess_prob_liked,met,match,age_gap,same_race,same_race_point,attractive_p,sincere_p,intelligence_p,funny_p,ambtition_p,shared_interests_p,attractive_m,sincere_m,intelligence_m,funny_m,ambtition_m,shared_interests_m
0,0,female,21.0,27.0,Asian/PacificIslander/Asian-American,European/Caucasian-American,2.0,4.0,35.0,20.0,20.0,20.0,0.0,5.0,6.0,8.0,8.0,8.0,8.0,6.0,...,2.0,7.0,6.0,0.0,0,6.0,-1,-2.0,210.0,160.0,160.0,160.0,0.0,30.0,90.0,180.0,140.0,105.0,90.0,75.0
1,0,female,21.0,22.0,Asian/PacificIslander/Asian-American,European/Caucasian-American,2.0,4.0,60.0,0.0,0.0,40.0,0.0,0.0,7.0,8.0,10.0,7.0,7.0,5.0,...,2.0,7.0,5.0,1.0,0,1.0,-1,-2.0,420.0,0.0,0.0,280.0,0.0,0.0,105.0,160.0,140.0,120.0,75.0,90.0
2,1,female,21.0,22.0,Asian/PacificIslander/Asian-American,Asian/PacificIslander/Asian-American,2.0,4.0,19.0,18.0,19.0,18.0,14.0,12.0,10.0,10.0,10.0,10.0,10.0,10.0,...,2.0,7.0,-99.0,1.0,1,1.0,1,2.0,190.0,180.0,190.0,180.0,140.0,120.0,75.0,160.0,180.0,120.0,75.0,105.0
3,0,female,21.0,23.0,Asian/PacificIslander/Asian-American,European/Caucasian-American,2.0,4.0,30.0,5.0,15.0,40.0,5.0,5.0,7.0,8.0,9.0,8.0,9.0,8.0,...,2.0,7.0,6.0,0.0,1,2.0,-1,-2.0,210.0,40.0,135.0,320.0,45.0,40.0,105.0,120.0,160.0,105.0,90.0,120.0
4,0,female,21.0,24.0,Asian/PacificIslander/Asian-American,Latino/HispanicAmerican,2.0,4.0,30.0,10.0,20.0,10.0,10.0,20.0,8.0,7.0,9.0,6.0,9.0,7.0,...,2.0,6.0,6.0,0.0,1,3.0,-1,-2.0,240.0,70.0,180.0,60.0,90.0,140.0,75.0,120.0,140.0,105.0,90.0,90.0


In [25]:
# Create dummy (indicator) variables for the categorical columns;
# drop_first=True omits the first level of each column.
data = pd.get_dummies(data, columns = ['gender', 'race', 'race_o'], drop_first=True)

In [26]:
data.head()

Unnamed: 0,has_null,age,age_o,importance_same_race,importance_same_religion,pref_o_attractive,pref_o_sincere,pref_o_intelligence,pref_o_funny,pref_o_ambitious,pref_o_shared_interests,attractive_o,sincere_o,intelligence_o,funny_o,ambitous_o,shared_interests_o,attractive_important,sincere_important,intellicence_important,...,sincere_p,intelligence_p,funny_p,ambtition_p,shared_interests_p,attractive_m,sincere_m,intelligence_m,funny_m,ambtition_m,shared_interests_m,gender_male,race_Black/AfricanAmerican,race_European/Caucasian-American,race_Latino/HispanicAmerican,race_Other,race_o_Black/AfricanAmerican,race_o_European/Caucasian-American,race_o_Latino/HispanicAmerican,race_o_Other
0,0,21.0,27.0,2.0,4.0,35.0,20.0,20.0,20.0,0.0,5.0,6.0,8.0,8.0,8.0,8.0,6.0,15.0,20.0,20.0,...,160.0,160.0,160.0,0.0,30.0,90.0,180.0,140.0,105.0,90.0,75.0,0,0,0,0,0,0,1,0,0
1,0,21.0,22.0,2.0,4.0,60.0,0.0,0.0,40.0,0.0,0.0,7.0,8.0,10.0,7.0,7.0,5.0,15.0,20.0,20.0,...,0.0,0.0,280.0,0.0,0.0,105.0,160.0,140.0,120.0,75.0,90.0,0,0,0,0,0,0,1,0,0
2,1,21.0,22.0,2.0,4.0,19.0,18.0,19.0,18.0,14.0,12.0,10.0,10.0,10.0,10.0,10.0,10.0,15.0,20.0,20.0,...,180.0,190.0,180.0,140.0,120.0,75.0,160.0,180.0,120.0,75.0,105.0,0,0,0,0,0,0,0,0,0
3,0,21.0,23.0,2.0,4.0,30.0,5.0,15.0,40.0,5.0,5.0,7.0,8.0,9.0,8.0,9.0,8.0,15.0,20.0,20.0,...,40.0,135.0,320.0,45.0,40.0,105.0,120.0,160.0,105.0,90.0,120.0,0,0,0,0,0,0,1,0,0
4,0,21.0,24.0,2.0,4.0,30.0,10.0,20.0,10.0,10.0,20.0,8.0,7.0,9.0,6.0,9.0,7.0,15.0,20.0,20.0,...,70.0,180.0,60.0,90.0,140.0,75.0,120.0,140.0,105.0,90.0,90.0,0,0,0,0,0,0,0,1,0


# 모델링과 평가

In [27]:
data.columns

Index(['has_null', 'age', 'age_o', 'importance_same_race',
       'importance_same_religion', 'pref_o_attractive', 'pref_o_sincere',
       'pref_o_intelligence', 'pref_o_funny', 'pref_o_ambitious',
       'pref_o_shared_interests', 'attractive_o', 'sincere_o',
       'intelligence_o', 'funny_o', 'ambitous_o', 'shared_interests_o',
       'attractive_important', 'sincere_important', 'intellicence_important',
       'funny_important', 'ambtition_important', 'shared_interests_important',
       'attractive_partner', 'sincere_partner', 'intelligence_partner',
       'funny_partner', 'ambition_partner', 'shared_interests_partner',
       'interests_correlate', 'expected_happy_with_sd_people',
       'expected_num_interested_in_me', 'like', 'guess_prob_liked', 'met',
       'match', 'age_gap', 'same_race', 'same_race_point', 'attractive_p',
       'sincere_p', 'intelligence_p', 'funny_p', 'ambtition_p',
       'shared_interests_p', 'attractive_m', 'sincere_m', 'intelligence_m',
       'funn

## 훈련세트와 테스트세트 분리

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; 'match' is the target and every
# other column is a feature. The fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='match'),
    data['match'],
    random_state=100,
    test_size=0.2,
)

## 모델링과 평가

In [29]:
import xgboost as xgb

# Baseline classifier: 500 trees of depth 5 with a fixed seed,
# trained on the training split and used to predict the test split.
model = xgb.XGBClassifier(
    max_depth=5,
    n_estimators=500,
    random_state=100,
).fit(X_train, y_train)
pred = model.predict(X_test)

In [30]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Accuracy of the test-set predictions
accuracy_score(y_test, pred)

0.849194729136164

In [31]:
# Confusion matrix of the test-set predictions
# (rows: true labels, columns: predicted labels)
print(confusion_matrix(y_test, pred))

[[1048   66]
 [ 140  112]]


In [32]:
# Per-class precision, recall, F1 and support for the test-set predictions
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.88      0.94      0.91      1114
           1       0.63      0.44      0.52       252

    accuracy                           0.85      1366
   macro avg       0.76      0.69      0.72      1366
weighted avg       0.84      0.85      0.84      1366



## 하이퍼파라미터 튜닝 : 그리드 서치

In [33]:
from sklearn.model_selection import GridSearchCV

In [34]:
# Search grid: 3 values for each of 4 hyperparameters (81 combinations)
parameters = dict(
    learning_rate=[0.01, 0.1, 0.3],
    max_depth=[5, 7, 10],
    subsample=[0.5, 0.7, 1],
    n_estimators=[300, 500, 1000],
)

In [38]:
# Grid search over `parameters`, scored by F1 with 5-fold cross-validation;
# n_jobs=-1 runs the fits in parallel.
model = xgb.XGBClassifier()
gs = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None,...
                                     max_cat_to_onehot=None,
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                   

In [39]:
gs.best_params_

{'learning_rate': 0.3, 'max_depth': 10, 'n_estimators': 1000, 'subsample': 0.7}