In [None]:
# mac m1 : brew install cmake libomp
!uv add xgboost

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error  # 루트 제곱 평균 오차
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from sklearn.preprocessing import StandardScaler  # 평균 0, 표준편차 1
from sklearn.preprocessing import MinMaxScaler  # 백분위
from sklearn.preprocessing import RobustScaler

In [None]:
file_url = "https://media.githubusercontent.com/media/musthave-ML10/data_source/main/dating.csv"
data = pd.read_csv(file_url)
data.shape

In [None]:
pd.options.display.max_columns = 40

In [None]:
data.head()

In [None]:
data.info()

In [None]:
round(data.describe(), 2)

# 전처리 : 결측치

In [None]:
data.isna().mean() * 100

In [None]:
data.columns

In [None]:
data = data.dropna(
    subset=[
        "pref_o_attractive",
        "pref_o_sincere",
        "pref_o_intelligence",
        "pref_o_funny",
        "pref_o_ambitious",
        "pref_o_shared_interests",
        "attractive_o",
        "sincere_o",
        "intelligence_o",
        "funny_o",
        "ambitous_o",
        "shared_interests_o",
        "attractive_important",
        "sincere_important",
        "intellicence_important",
        "funny_important",
        "ambtition_important",
        "shared_interests_important",
    ]
)

In [None]:
data = data.fillna(-99)

# 전처리 : 피처 엔지니어링

In [None]:
def age_gap(x):
  if x["age"] == -99:
    return -99
  elif x["age_o"] == -99:
    return -99
  elif x["gender"] == "female":
    return x["age_o"] - x["age"]
  else:
    return x["age"] - x["age_o"]

In [None]:
print(data.loc[9, ["gender", "age", "age_o"]])
print(age_gap(data.loc[9]))

In [None]:
data["age_gap"] = data.apply(age_gap, axis=1)

In [None]:
data["age_gap"].sort_values(ascending=False)

In [None]:
data["age_gap_abs"] = abs(data["age_gap"])

In [None]:
data.head(2)

In [None]:
data.columns

In [None]:
def same_race(x):
    if x["race"] == -99:
        return -99
    elif x["race_o"] == -99:
        return -99
    elif x["race"] == x["race_o"]:
        return 1
    else:
        return 0

In [None]:
data["same_race"] = data.apply(same_race, axis=1)

In [None]:
def same_race_point(x):
    if x["same_race"] == -99:
        return -99
    else:
        return x["same_race"] * x["importance_same_race"]

In [None]:
data["same_race_point"] = data.apply(same_race_point, axis=1)

In [None]:
def rating(data, importance, score):
  if data[importance] == -99:
    return -99
  elif data[score] == -99:
    return -99
  else:
    return data[importance] * data[score]

In [None]:
data.columns[26:32]

In [None]:
# 상대방의 중요도
partner_imp = data.columns[8:14]
# 본인에 대한 상대방의 평가
partner_rate_me = data.columns[14:20]
# 본인의 중요도
my_imp = data.columns[20:26]
# 상대방에 대한 본인의 평가
my_rate_partner = data.columns[26:32]

In [None]:
new_label_partner = [
    "attractive_p",
    "sincere_p",
    "intelligence_p",
    "funny_p",
    "ambition_p",
    "shared_interests_p",
]

new_label_me = [
    "attractive_m",
    "sincere_m",
    "intelligence_m",
    "funny_m",
    "ambition_m",
    "shared_interests_m",
]

In [None]:
for idx, val1, val2 in zip(new_label_partner,partner_imp,partner_rate_me):
    print(idx," & " ,val1 ," & ",val2)

In [None]:
for idx, val1, val2 in zip(new_label_me,my_imp,my_rate_partner):
    print(idx," & " ,val1 ," & ",val2)

In [None]:
for idx, val1, val2 in zip(new_label_partner, partner_imp, partner_rate_me):
    data[idx] = data.apply(lambda x : rating(x,val1,val2) ,axis=1)

In [None]:
for idx, val1, val2 in zip(new_label_me, my_imp, my_rate_partner):
    data[idx] = data.apply(lambda x: rating(x, val1, val2), axis=1)

In [None]:
data.head(2)

In [None]:
data = pd.get_dummies(data,columns=["gender","race","race_o"],drop_first=True)

# 모델 학습 및 평가 

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data.drop("match",axis=1),data["match"],test_size=.3,random_state=20)

In [None]:
help(xgb.XGBClassifier)

In [None]:
model =  xgb.XGBClassifier(n_estimators = 500, max_depth=5,random_state=20)
model.fit(X_train,y_train)
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

In [None]:
print("train score : " , accuracy_score(train_pred,y_train))
print("test score : ", accuracy_score(test_pred, y_test))

In [None]:
from sklearn.metrics import auc , roc_auc_score, f1_score,classification_report
print(classification_report(y_test,test_pred))

In [None]:
roc_auc_score(y_test,test_pred)

In [None]:
data.columns

In [None]:
data_subset = data.loc[
    :,
    [
        "interests_correlate",
        "expected_happy_with_sd_people",
        "expected_num_interested_in_me",
        "like",
        "guess_prob_liked",
        "met",
        "match",
        "age_gap_abs",
        "same_race",
        "same_race_point",
        "attractive_p",
        "sincere_p",
        "intelligence_p",
        "funny_p",
        "ambition_p",
        "shared_interests_p",
        "attractive_m",
        "sincere_m",
        "intelligence_m",
        "funny_m",
        "ambition_m",
        "shared_interests_m",
        "gender_male",
    ],
]

In [None]:
print(data_subset.shape,data.shape)

In [None]:
data_subset.describe()

In [None]:
rs_scaler = RobustScaler()
data_subset_scaled = rs_scaler.fit_transform(data_subset)
data_subset_scaled = pd.DataFrame(data_subset_scaled, columns=data_subset.columns)

In [None]:
data_subset_scaled.describe()

In [94]:
X_train, X_test, y_train, y_test = train_test_split(
    data_subset_scaled.drop("match", axis=1), data_subset["match"], test_size=0.3, random_state=20
)
model = xgb.XGBClassifier(n_estimators=1000, max_depth=5, random_state=20)
model.fit(X_train, y_train)
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

In [95]:
print(classification_report(y_test,test_pred))

              precision    recall  f1-score   support

           0       0.88      0.94      0.91      1686
           1       0.58      0.39      0.47       363

    accuracy                           0.84      2049
   macro avg       0.73      0.67      0.69      2049
weighted avg       0.83      0.84      0.83      2049



In [None]:
data.corr()

# 하이퍼파라미터 최적화 : 그리드 서치

In [97]:
help(xgb.XGBClassifier)

Help on class XGBClassifier in module xgboost.sklearn:

class XGBClassifier(sklearn.base.ClassifierMixin, XGBModel)
 |  XGBClassifier(
 |      *,
 |      objective: Union[str, xgboost.sklearn._SklObjWProto, Callable[[Any, Any], Tuple[numpy.ndarray, numpy.ndarray]], NoneType] = 'binary:logistic',
 |      **kwargs: Any
 |  ) -> None
 |
 |  Implementation of the scikit-learn API for XGBoost classification.
 |  See :doc:`/python/sklearn_estimator` for more information.
 |
 |  Parameters
 |  ----------
 |
 |      n_estimators : Optional[int]
 |          Number of boosting rounds.
 |
 |      max_depth :  typing.Optional[int]
 |
 |          Maximum tree depth for base learners.
 |
 |      max_leaves : typing.Optional[int]
 |
 |          Maximum number of leaves; 0 indicates no limit.
 |
 |      max_bin : typing.Optional[int]
 |
 |          If using histogram-based algorithm, maximum number of bins per feature
 |
 |      grow_policy : typing.Optional[str]
 |
 |          Tree growing policy.
 |

In [101]:
parameters = {
    "learning_rate":[0.01,0.1,0.3,0.5],
    "max_depth" : [5,7,9,10],
    "subsample" : [0.3,0.5,0.7,1],
    "n_estimators" : [300,500,700,1000]
}

model = xgb.XGBClassifier()

In [102]:
from sklearn.model_selection import GridSearchCV
gs_model = GridSearchCV(model, parameters,n_jobs=1,scoring = "f1",cv=5)

In [103]:
gs_model.fit(X_train,y_train)

In [None]:
test_pred = gs_model.predict(X_test)
print(classification_report(y_test,test_pred))

In [104]:
!uv add lightgbm

[2mResolved [1m121 packages[0m [2min 933ms[0m[0m
[36m[1mDownloading[0m[39m lightgbm [2m(1.4MiB)[0m
 [32m[1mDownloaded[0m[39m lightgbm
[2mPrepared [1m1 package[0m [2min 386ms[0m[0m
[2mInstalled [1m1 package[0m [2min 13ms[0m[0m
 [32m+[39m [1mlightgbm[0m[2m==4.6.0[0m


In [105]:
import lightgbm as lgb


In [106]:
model1 = lgb.LGBMClassifier(random_state=20)
model1.fit(X_train,y_train)
test_pred1 = model1.predict(X_test)

[LightGBM] [Info] Number of positive: 815, number of negative: 3964
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000701 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2878
[LightGBM] [Info] Number of data points in the train set: 4779, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.170538 -> initscore=-1.581821
[LightGBM] [Info] Start training from score -1.581821


In [107]:
accuracy_score(y_test,test_pred)

0.8423621278672523

In [108]:
model1.predict_proba(X_test)

array([[0.9989934 , 0.0010066 ],
       [0.92071783, 0.07928217],
       [0.46245935, 0.53754065],
       ...,
       [0.98705352, 0.01294648],
       [0.95582662, 0.04417338],
       [0.9642382 , 0.0357618 ]], shape=(2049, 2))