In [1]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [2]:
import xgboost as xgb


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [4]:
import pandas as pd


In [5]:
# Remove pandas' display limits so DataFrames are rendered with every column and
# every row.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [7]:
# Load the training set. The dataset directory can be overridden with the
# XGB_DATA_DIR environment variable; when it is unset the original location is
# used, so existing runs behave exactly as before.
import os

df_data = pd.read_csv(os.path.join(
    os.environ.get('XGB_DATA_DIR', '/Users/nueaprae/Desktop/Competition/xgb_dataset'),
    'XGB_train.csv',
))

In [8]:
# Load the holdout set. The dataset directory can be overridden with the
# XGB_DATA_DIR environment variable; when it is unset the original location is
# used, so existing runs behave exactly as before.
import os

df_ho_data = pd.read_csv(os.path.join(
    os.environ.get('XGB_DATA_DIR', '/Users/nueaprae/Desktop/Competition/xgb_dataset'),
    'XGB_holdout.csv',
))

In [9]:


# Target column is 'preventive_visit_gap_ind'; the feature matrix is every other
# column except 'id'.
y = df_data['preventive_visit_gap_ind']
X = df_data.drop(columns=['id', 'preventive_visit_gap_ind'])


In [10]:

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


——————————————————————————————————————————————————————————————————————————————————————————

XGBoost model (random search)

In [10]:



# Fixed XGBoost hyperparameters for the first model (the section heading
# attributes them to a random search).
best_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=180,
    max_depth=9,
    learning_rate=0.12487806242613694,
    gamma=0.2806217129238506,
    subsample=0.9043140194467589,
    colsample_bytree=0.7098887171960256,
)


In [11]:


# Build the classifier from the fixed hyperparameters in best_params.
best_model = xgb.XGBClassifier(**best_params)


In [None]:


# Train the classifier on the training split.
best_model.fit(X_train, y_train)


In [13]:

# Predicted class labels for the test split.
y_pred = best_model.predict(X_test)


In [None]:


# Share of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.6f}')


In [None]:


# Text summary of the per-class classification metrics on the test split.
print(classification_report(y_test, y_pred))


In [9]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

In [None]:

# ROC-AUC measures how well the model ranks positives above negatives, so it has
# to be computed from the predicted probability of the positive class (column 1).
# Passing the thresholded 0/1 labels in y_pred reduces the ROC curve to a single
# operating point and understates the score.
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f'ROC-AUC Score: {roc_auc:.6f}')


——————————————————————————————————————————————————————————————————————————————————————

Feature Importance

In [17]:
import matplotlib.pyplot as plt
import numpy as np


In [18]:


# Column labels of the feature matrix; used to name the importance scores.
feature_names = X.columns


In [19]:

# Importance scores reported by the fitted model.
feature_importance = best_model.feature_importances_


In [20]:

# Pair each feature name with its importance score in a two-column table.
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})


In [21]:

# Order the table from the highest importance to the lowest.
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)


In [24]:
# Keep the first 50 rows of the importance table for plotting.
top_n = 50
top_features_df = feature_importance_df.head(top_n)

# Preview the first 30 rows. A notebook cell only renders its last expression,
# so the preview must be the final statement to be displayed at all.
feature_importance_df.head(30)


In [None]:

# Horizontal bar chart of the selected features and their importance scores.
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(top_features_df['Feature'], top_features_df['Importance'])
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Features')
ax.set_title(f'Top {top_n} Feature Importance')
ax.invert_yaxis()  # flip the y-axis so the first row of the table is drawn at the top
plt.show()



__________________________________________

XGBoost (PCA)

In [73]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score


In [53]:

# Three-stage pipeline: standardise the features, project them onto 20 principal
# components, then classify with XGBoost.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=20)),
    ('xgb', xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42,
        n_estimators=180,
        max_depth=9,
        learning_rate=0.12487806242613694,
        gamma=0.2806217129238506,
        subsample=0.9043140194467589,
        colsample_bytree=0.7098887171960256,
    )),
])


In [None]:

# Fit every pipeline stage on the training split.
pipeline.fit(X_train, y_train)


In [55]:

# Predicted class labels from the PCA pipeline (replaces the earlier y_pred).
y_pred = pipeline.predict(X_test)


In [None]:

# Share of test rows the PCA pipeline labels correctly.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.6f}')


In [None]:

# Text summary of the per-class classification metrics for the PCA pipeline.
print(classification_report(y_test, y_pred))


In [None]:

# ROC-AUC is scored on the predicted probability of the positive class
# (column 1 of predict_proba), not on the thresholded labels.
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f'ROC-AUC Score: {roc_auc:.6f}')


____________________________________________________________
XGBoost (Grid Search)

In [11]:
from sklearn.model_selection import GridSearchCV


In [13]:

# Base estimator for the grid search; the tuned hyperparameters are supplied by
# the search itself.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',  # metric XGBoost evaluates during training
    random_state=42
)


In [14]:

# Grid of candidate values, centred on the earlier best parameters.
# NOTE(review): the grid contains 3**6 * 6 * 6 * 7 = 183,708 combinations, so an
# exhaustive search over it is extremely expensive; consider narrowing it.
param_grid = {
    'colsample_bytree': [0.69, 0.71, 0.73],   # around 0.709
    'gamma': [0.26, 0.28, 0.30],              # around 0.280
    'learning_rate': [0.12, 0.125, 0.13],     # around 0.125
    'max_depth': [8, 9, 10],                  # around 9
    'n_estimators': [170, 180, 190],          # around 180
    'subsample': [0.89, 0.90, 0.91],          # around 0.904
    'reg_alpha': [0.0001, 0.001, 0.01, 0.1, 1, 100],
    'reg_lambda': [0.0001, 0.001, 0.01, 0.1, 1, 100],
    'min_child_weight': [2, 3, 4, 5, 6, 7, 8],
}
# Reference values the grid is centred on:
# 'colsample_bytree': 0.7098887171960256,
#     'gamma': 0.2806217129238506,
#     'learning_rate': 0.12487806242613694,
#     'max_depth': 9,
#     'n_estimators': 180,
#     'subsample': 0.9043140194467589,
#     'objective': 'binary:logistic',   # binary classification task
#     'eval_metric': 'logloss',         # evaluation metric
#     'random_state': 42


In [15]:

# Exhaustive search over param_grid, scored by ROC-AUC with 3-fold cross-validation.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=1,    # run a single job at a time
    verbose=1,
)



In [None]:

# Run the cross-validated grid search on the training split.
grid_search.fit(X_train, y_train)


In [None]:


# Report the best parameter combination and its cross-validated ROC-AUC.
print("Best Parameters:", grid_search.best_params_)
print("Best ROC-AUC Score:", grid_search.best_score_)


In [70]:

# Estimator for the best parameter combination found by the grid search.
best_model_grid = grid_search.best_estimator_


In [71]:

# Predict class labels and positive-class probabilities (column 1) on the test split.
y_pred = best_model_grid.predict(X_test)
y_pred_proba = best_model_grid.predict_proba(X_test)[:, 1]


In [None]:

# Accuracy is scored on the class labels, ROC-AUC on the positive-class probabilities.
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f'Accuracy: {accuracy:.6f}')
print(f'ROC-AUC Score: {roc_auc:.6f}')


_____________________________________________________________________________________________________________

random search 2.0

In [11]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score


In [22]:

# Classifier with hand-fixed hyperparameters (no search is attached to it).
xgb_model_random = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    # ensemble size and tree depth
    n_estimators=550,
    max_depth=9,
    # learning rate and row/column subsampling
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.67,
    # regularisation and split constraints
    gamma=0.27,
    min_child_weight=8,
    reg_alpha=1e-07,
    reg_lambda=100,
)

# xgb_model = xgb.XGBClassifier(
#     objective='binary:logistic',
#     eval_metric='auc',
#     random_state=77,
#     n_estimators = 500, #220 #260 #270
#     max_depth =  5, 
#     learning_rate = 0.125, 
#     gamma = 0.27,
#     subsample = 0.89,
#     colsample_bytree = 0.67,
#     reg_alpha = 1e-06,
#     reg_lambda = 100,
#     min_child_weight = 8
# )
# Best Parameters: {'subsample': 0.9, 'reg_lambda': 100, 'reg_alpha': 1e-07, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 9, 'learning_rate': 0.09, 'gamma': 0.23}

In [23]:
# define para
# param_dist = {                   
#     'n_estimators': [490,500,510],              
#     'subsample': [0.89,0.9,0.91],
#     'max_depth' : [7,8,9], 
#     'learning_rate' : [0.09,0.1,0.125],
#     "reg_alpha":[0.0000001, 0.000001], 
#     "reg_lambda":[10,15,50,100], 
#     "min_child_weight": [5,7,8],
#     "gamma" : [0.23, 0.25, 0.27]               
# }

# Best Parameters: {'subsample': 0.89, 'reg_lambda': 100, 'reg_alpha': 0.0001, 'n_estimators': 180, 'min_child_weight': 8, 'max_depth': 9, 'learning_rate': 0.125, 'gamma': 0.28, 'colsample_bytree': 0.69}
# Best ROC-AUC Score: 0.7590238414407036

# New Best
# Best Parameters: {'subsample': 0.89, 'reg_lambda': 100, 'reg_alpha': 1e-06, 'n_estimators': 190, 'min_child_weight': 8, 'gamma': 0.27, 'colsample_bytree': 0.67}
# Best ROC-AUC Score: 0.7592664892158751

In [35]:

# RandomizedSearchCV 
# random_search = RandomizedSearchCV(
#     estimator=xgb_model,
#     param_distributions=param_dist,
#     n_iter=10,  
#     scoring='roc_auc',  
#     cv=2, 
#     random_state=42,
#     n_jobs=1, 
#     verbose=2
# )


In [24]:

# Fit the fixed-parameter model directly on the training split. No randomized
# search runs here: the RandomizedSearchCV code in this section is commented out.
xgb_model_random.fit(X_train, y_train)


In [14]:

# print("Best Parameters:", random_search.best_params_)
# print("Best ROC-AUC Score:", random_search.best_score_)


Best Parameters: {'subsample': 0.9, 'reg_lambda': 100, 'reg_alpha': 1e-07, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 9, 'learning_rate': 0.09, 'gamma': 0.23}
Best ROC-AUC Score: 0.7624147877241043


In [None]:
# random_search.best_params_

In [20]:


# best_model_random = random_search.best_estimator_


In [25]:

# Predict class labels and positive-class probabilities (column 1) on the test
# split with the fixed-parameter model.
y_pred = xgb_model_random.predict(X_test)
y_pred_proba = xgb_model_random.predict_proba(X_test)[:, 1]


In [26]:

# Accuracy is scored on the class labels, ROC-AUC on the positive-class probabilities.
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f'Accuracy: {accuracy:.6f}')
print(f'ROC-AUC Score: {roc_auc:.6f}')

Accuracy: 0.701032
ROC-AUC Score: 0.768098


___________________________________________________________

Predict Holdout

In [62]:
# Keep the holdout ids so they can be attached to the predicted scores.
ids = df_ho_data['id']


In [63]:

# Build the holdout feature matrix from exactly the columns the model was
# trained on, in the same order. Selecting by X_train.columns raises a KeyError
# if the holdout file lacks a training feature, and leaves out 'id' and any
# other column the model never saw instead of passing it to predict_proba.
X_holdout = df_ho_data[list(X_train.columns)]


In [64]:

# Positive-class probability (column 1 of predict_proba) for every holdout row.
scores =xgb_model_random.predict_proba(X_holdout)[:, 1]


In [65]:
# Result table: one row per holdout id with its predicted score.
results = pd.DataFrame({'id': ids, 'SCORE': scores})
results.head()

Unnamed: 0,id,SCORE
0,5,0.331106
1,9,0.604227
2,11,0.38686
3,14,0.755401
4,22,0.790242


In [66]:

# Rank rows by descending score, so rank 1 is the highest score. With
# method='dense', tied scores share a rank and ranks increase without gaps.
results['RANK'] = results['SCORE'].rank(ascending=False, method='dense').astype('int')


In [67]:
# Inspect the first ten rows of the ranked table.
results.head(10)

Unnamed: 0,id,SCORE,RANK
0,5,0.331106,242362
1,9,0.604227,93115
2,11,0.38686,206331
3,14,0.755401,46583
4,22,0.790242,39102
5,23,0.550266,115836
6,30,0.17261,338415
7,43,0.153946,347238
8,48,0.218612,313818
9,50,0.670518,69599


In [68]:
# Count distinct values per column; fewer distinct SCORE values than ids means
# some rows have tied scores.
results.nunique()

id       381976
SCORE    377470
RANK     377470
dtype: int64

In [71]:

# Write the result table to CSV in the working directory, without the DataFrame index.
results.to_csv('2024CaseCompetition_Han_Bao_20241011.csv', index=False)

