## Final Project 
## Brainster DS x Parkinson's Disease Specifications

### Import libraries

In [306]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from functools import partial
import re
import pickle
import joblib
from scipy.stats import skew, kurtosis
from scipy import stats
from sklearn.model_selection import cross_validate, cross_val_score, train_test_split, KFold, StratifiedKFold

In [307]:
df_users = pd.read_csv("df_user.csv")
df_keys = pd.read_csv("df_keys.csv")

In [308]:
df_users

Unnamed: 0,BirthYear,ID,Gender,Parkinsons,Tremors,DiagnosisYear,Sided,UPDRS,Impact,Levadopa,DA,MAOB,Other
0,1959.0,0QAZFRHQHW,Female,False,False,,,Don't know,,False,False,False,False
1,1944.0,1HOEBIGASW,Male,False,False,,,Don't know,,False,False,False,False
2,1936.0,1XNJCXS3EY,Male,False,False,,,Don't know,,False,False,False,False
3,1936.0,3DIXPRIOSW,Male,False,False,,,Don't know,,False,False,False,False
4,1950.0,48DZPAJ5NS,Male,True,False,2010.0,,Don't know,Mild,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,1948.0,YQSGN9BMVK,Male,False,False,,,Don't know,,False,False,False,False
109,1953.0,YWMIQIQND3,Female,True,True,2016.0,Right,Don't know,Mild,False,False,False,False
110,1928.0,YYPKGX6B24,Male,False,False,,,Don't know,,False,False,False,False
111,1947.0,Z2UPVHHGBE,Female,True,True,2015.0,Right,Don't know,Mild,False,False,False,True


In [309]:
df_users.drop(columns=[ 'UPDRS'], inplace=True)

print(df_users.head())

   BirthYear          ID  Gender  Parkinsons  Tremors  DiagnosisYear Sided  \
0     1959.0  0QAZFRHQHW  Female       False    False            NaN   NaN   
1     1944.0  1HOEBIGASW    Male       False    False            NaN   NaN   
2     1936.0  1XNJCXS3EY    Male       False    False            NaN   NaN   
3     1936.0  3DIXPRIOSW    Male       False    False            NaN   NaN   
4     1950.0  48DZPAJ5NS    Male        True    False         2010.0   NaN   

  Impact  Levadopa     DA   MAOB  Other  
0    NaN     False  False  False  False  
1    NaN     False  False  False  False  
2    NaN     False  False  False  False  
3    NaN     False  False  False  False  
4   Mild     False  False  False   True  


In [310]:
df_keys

Unnamed: 0,ID,Date,TS,Hand,HoldTime,Direction,LatencyTime,FlightTime
0,0QAZFRHQHW,160916,19:20:43.891,L,78.1,LL,312.5,257.8
1,0QAZFRHQHW,160916,19:20:44.344,L,78.1,LL,453.1,375.0
2,0QAZFRHQHW,160916,19:20:44.742,L,62.5,LL,414.1,335.9
3,0QAZFRHQHW,160916,19:20:45.516,L,93.8,LL,742.2,679.7
4,0QAZFRHQHW,160916,19:20:46.047,R,101.6,LR,523.4,429.7
...,...,...,...,...,...,...,...,...
3662934,Z2UPVHHGBE,170303,20:57:34.102,R,109.4,LR,343.8,203.1
3662935,Z2UPVHHGBE,170303,21:03:38.312,L,125.0,LL,398.4,242.2
3662936,Z2UPVHHGBE,170303,21:03:38.703,R,171.9,LR,343.8,218.8
3662937,Z2UPVHHGBE,170303,21:03:38.977,L,132.8,RL,312.5,140.6


In [311]:
hold_by_user =  df_keys[df_keys['Hand'] != 'S'].groupby(['ID', 'Hand'])['HoldTime'].agg(['mean'])
latency_by_user = df_keys[np.in1d(df_keys['Direction'], ['LL', 'LR', 'RL', 'RR'])].groupby(['ID', 'Direction'])['LatencyTime'].agg(['mean'])

In [312]:
hold_by_user_flat = hold_by_user.unstack()
hold_by_user_flat.columns = ['_'.join(col).strip() for col in hold_by_user_flat.columns.values]

In [313]:
latency_by_user_flat = latency_by_user.unstack()
latency_by_user_flat.columns = ['_'.join(col).strip() for col in latency_by_user_flat.columns.values]

In [314]:
hold_by_user_flat['mean_hold_diff'] = hold_by_user_flat['mean_L'] - hold_by_user_flat['mean_R']

In [315]:
latency_by_user_flat['mean_LR_RL_diff'] = latency_by_user_flat['mean_LR'] - latency_by_user_flat['mean_RL']
latency_by_user_flat['mean_LL_RR_diff'] = latency_by_user_flat['mean_LL'] - latency_by_user_flat['mean_RR']

In [316]:
combined = pd.concat([hold_by_user_flat, latency_by_user_flat], axis=1)

In [317]:
full_set = pd.merge(combined.reset_index(), df_users[['ID']], on='ID')


In [318]:
full_set

Unnamed: 0,ID,mean_L,mean_R,mean_hold_diff,mean_LL,mean_LR,mean_RL,mean_RR,mean_LR_RL_diff,mean_LL_RR_diff
0,0QAZFRHQHW,98.931818,101.595749,-2.663930,406.716242,411.718182,430.258974,365.736471,-18.540793,40.979771
1,1HOEBIGASW,66.280645,65.036667,1.243978,390.058824,600.433333,536.407143,394.647059,64.026190,-4.588235
2,1XNJCXS3EY,153.702407,105.622423,48.079984,347.882547,313.541489,310.799454,322.170833,2.742036,25.711714
3,3DIXPRIOSW,147.626087,167.039039,-19.412952,528.670445,575.478761,501.274093,493.779630,74.204668,34.890816
4,48DZPAJ5NS,125.182493,126.045471,-0.862979,300.323155,335.508287,321.131506,332.621036,14.376781,-32.297880
...,...,...,...,...,...,...,...,...,...,...
76,YIA9DW5AGQ,74.926898,73.006689,1.920209,233.183499,249.604979,282.845343,262.600722,-33.240364,-29.417223
77,YQSGN9BMVK,101.932172,114.030694,-12.098522,284.901879,215.023762,255.127555,271.448199,-40.103793,13.453680
78,YWMIQIQND3,103.910159,143.413333,-39.503175,249.626144,262.962245,247.860360,350.363077,15.101885,-100.736933
79,YYPKGX6B24,148.072662,143.832754,4.239908,505.747519,456.021354,580.315603,545.699209,-124.294249,-39.951690


In [319]:

full_set.drop(columns=['ID'], inplace=True)

full_set.rename(columns={'Parkinsons': 'Parkinsons_y'}, inplace=True)


print(full_set.columns)

Index(['mean_L', 'mean_R', 'mean_hold_diff', 'mean_LL', 'mean_LR', 'mean_RL',
       'mean_RR', 'mean_LR_RL_diff', 'mean_LL_RR_diff'],
      dtype='object')


In [320]:
explorer_df = pd.concat([df_users, full_set], axis=1)

In [321]:
explorer_df

Unnamed: 0,BirthYear,ID,Gender,Parkinsons,Tremors,DiagnosisYear,Sided,Impact,Levadopa,DA,...,Other,mean_L,mean_R,mean_hold_diff,mean_LL,mean_LR,mean_RL,mean_RR,mean_LR_RL_diff,mean_LL_RR_diff
0,1959.0,0QAZFRHQHW,Female,False,False,,,,False,False,...,False,98.931818,101.595749,-2.663930,406.716242,411.718182,430.258974,365.736471,-18.540793,40.979771
1,1944.0,1HOEBIGASW,Male,False,False,,,,False,False,...,False,66.280645,65.036667,1.243978,390.058824,600.433333,536.407143,394.647059,64.026190,-4.588235
2,1936.0,1XNJCXS3EY,Male,False,False,,,,False,False,...,False,153.702407,105.622423,48.079984,347.882547,313.541489,310.799454,322.170833,2.742036,25.711714
3,1936.0,3DIXPRIOSW,Male,False,False,,,,False,False,...,False,147.626087,167.039039,-19.412952,528.670445,575.478761,501.274093,493.779630,74.204668,34.890816
4,1950.0,48DZPAJ5NS,Male,True,False,2010.0,,Mild,False,False,...,True,125.182493,126.045471,-0.862979,300.323155,335.508287,321.131506,332.621036,14.376781,-32.297880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,1948.0,YQSGN9BMVK,Male,False,False,,,,False,False,...,False,,,,,,,,,
109,1953.0,YWMIQIQND3,Female,True,True,2016.0,Right,Mild,False,False,...,False,,,,,,,,,
110,1928.0,YYPKGX6B24,Male,False,False,,,,False,False,...,False,,,,,,,,,
111,1947.0,Z2UPVHHGBE,Female,True,True,2015.0,Right,Mild,False,False,...,True,,,,,,,,,


In [322]:
explorer_df.shape

(113, 21)

In [323]:
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.graph_objects as go

 

In [369]:

total_male_count = male_parkinsons_count + male_non_parkinsons_count
total_female_count = female_parkinsons_count + female_non_parkinsons_count


total_labels = ['Male', 'Female']
total_values = [total_male_count, total_female_count]


fig_total = px.pie(names=total_labels, values=total_values, title='Total Distribution by Gender (Parkinsons and Non-Parkinsons)',
                   color=total_labels, color_discrete_sequence=['#d62728', '#1f77b4'], opacity=0.8)


labels = ['Male with Parkinsons', 'Female with Parkinsons', 'Male without Parkinsons', 'Female without Parkinsons']
values = [male_parkinsons_count, female_parkinsons_count, male_non_parkinsons_count, female_non_parkinsons_count]


color_map = {
    'Male with Parkinsons': '#1f77b4',
    'Male without Parkinsons': '#95b3d7',
    'Female with Parkinsons': '#d62728',
    'Female without Parkinsons': '#ff9896'
}


fig_parkinsons = px.pie(names=labels, values=values, title='Distribution of Parkinsons by Gender',
                        color=labels, color_discrete_map=color_map, opacity=0.8, hole=0.4)


fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'pie'}, {'type': 'pie'}]],
                    subplot_titles=['Gender Distribution %', 'Parkinsons Distribution by Gender %'])


fig.add_trace(fig_total.data[0], row=1, col=1)
fig.add_trace(fig_parkinsons.data[0], row=1, col=2)


fig.update_traces(rotation=-180, selector=dict(row=1, col=1, type='pie'))

fig.update_layout(title_text='Comparison of Gender Distributions')
fig.show()


In [325]:

def combine_drug_data_for_plotting():

    parkinsons_true_df = explorer_df[explorer_df['Parkinsons'] == True]
    
    
    combined_drug_data = parkinsons_true_df.melt(id_vars=['Tremors'], 
                                                 value_vars=['Levadopa', 'DA', 'MAOB', 'Other'], 
                                                 var_name='Drug', value_name='Usage')
    
    
    tremor_drug_count = combined_drug_data.groupby(['Tremors', 'Drug', 'Usage']).size().reset_index(name='Count')
    
    return tremor_drug_count
combined_drug_data = combine_drug_data_for_plotting()

fig_combined = px.bar(combined_drug_data, x='Tremors', y='Count', color='Usage', barmode='group',
                      facet_col='Drug', title='Eefficacy of separate drugs on Tremors',
                      category_orders={'Tremors': [False, True], 'Usage': [False, True]})


fig_combined.update_layout(xaxis_title='Tremor Status', yaxis_title='Count', legend_title='Drug Usage')
fig_combined.show()


In [387]:
explorer_df

Unnamed: 0,BirthYear,ID,Gender,Parkinsons,Tremors,DiagnosisYear,Sided,Impact,Levadopa,DA,...,mean_hold_diff,mean_LL,mean_LR,mean_RL,mean_RR,mean_LR_RL_diff,mean_LL_RR_diff,Age,Parkinsons_encoded,AnyDrug
0,1959.0,0QAZFRHQHW,Female,0,False,,,,False,False,...,2.663930,406.716242,411.718182,430.258974,365.736471,18.540793,40.979771,65.0,0,False
1,1944.0,1HOEBIGASW,Male,0,False,,,,False,False,...,1.243978,390.058824,600.433333,536.407143,394.647059,64.026190,4.588235,80.0,0,False
2,1936.0,1XNJCXS3EY,Male,0,False,,,,False,False,...,48.079984,347.882547,313.541489,310.799454,322.170833,2.742036,25.711714,88.0,0,False
3,1936.0,3DIXPRIOSW,Male,0,False,,,,False,False,...,19.412952,528.670445,575.478761,501.274093,493.779630,74.204668,34.890816,88.0,0,False
4,1950.0,48DZPAJ5NS,Male,1,False,2010.0,,Mild,False,False,...,0.862979,300.323155,335.508287,321.131506,332.621036,14.376781,32.297880,74.0,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,1948.0,YQSGN9BMVK,Male,0,False,,,,False,False,...,,,,,,,,76.0,0,False
109,1953.0,YWMIQIQND3,Female,1,True,2016.0,Right,Mild,False,False,...,,,,,,,,71.0,1,False
110,1928.0,YYPKGX6B24,Male,0,False,,,,False,False,...,,,,,,,,96.0,0,False
111,1947.0,Z2UPVHHGBE,Female,1,True,2015.0,Right,Mild,False,False,...,,,,,,,,77.0,1,True


: 

In [326]:




filtered_combined_group = combined_group[combined_group['Parkinsons'] == True]


fig_combined_stacked = px.bar(filtered_combined_group, x='Tremors', y='Count', color='AnyDrug', 
                              title='Total drug efficacy on tremors',
                              category_orders={'Tremors': [False, True]},
                              labels={'Tremors': 'Tremor Status', 'Count': 'Count'})

fig_combined_stacked.update_layout(barmode='stack', xaxis_title='Tremor Status', yaxis_title='Count', legend_title='Drug Usage')
fig_combined_stacked.show()


In [363]:




mean_age = parkinsons_data['Age'].mean()
parkinsons_data = explorer_df[explorer_df['Parkinsons'] == True]


fig = px.histogram(parkinsons_data, x='Age', title='Distribution of Age for Parkinsons Patients',
                   labels={'Age': 'Age', 'count': 'Number of Patients'},
                   nbins=10,  
                   marginal='rug',  
                   opacity=0.9,  
                   color_discrete_sequence=['#1f77b4'])  

fig.add_vline(x=mean_age, line_dash="dash", line_color="red", 
              annotation_text=f'Mean Age: {mean_age:.1f}', annotation_position="right", 
              annotation_font=dict(size=12, color='red'))

fig.update_layout(xaxis_title='Age', yaxis_title='Number of Patients',
                  bargap=0.1,  
                  showlegend=False)  

fig.show()


In [366]:

tappy_columns = ['mean_L', 'mean_R', 'mean_hold_diff', 'mean_LL', 'mean_LR', 'mean_RL', 'mean_RR', 'mean_LR_RL_diff', 'mean_LL_RR_diff']
# explorer_df[tappy_columns] = explorer_df[tappy_columns].abs()


parkinsons_means = explorer_df[explorer_df['Parkinsons'] == True][tappy_columns].mean()
non_parkinsons_means = explorer_df[explorer_df['Parkinsons'] == False][tappy_columns].mean()


area_data = pd.DataFrame({
    'Tappy Feature': tappy_columns,
    'Parkinsons Mean': parkinsons_means.values,
    'Non-Parkinsons Mean': non_parkinsons_means.values
})


melted_area_data = area_data.melt(id_vars='Tappy Feature', var_name='Condition', value_name='Mean Value')


fig = px.area(melted_area_data, x='Tappy Feature', y='Mean Value', color='Condition', 
              title='Mean Tappy Data Comparison Between Parkinsons and Non-Parkinsons Patients',
              labels={'Mean Value': 'Mean Value', 'Tappy Feature': 'Tappy Feature'},
              line_shape='linear')


fig.update_traces(mode='lines+markers', marker=dict(size=6))

fig.show()


In [368]:


explorer_df['Parkinsons_encoded'] = explorer_df['Parkinsons'].astype(int)

correlation_columns = tappy_columns + ['Parkinsons_encoded']

corr_matrix = explorer_df[correlation_columns].corr()


fig = px.imshow(
    corr_matrix,
    labels=dict(color="Correlation"),
    x=correlation_columns,
    y=correlation_columns,
    color_continuous_scale='RdBu',
    zmin=-1,
    zmax=1,
    title="Correlation Matrix for Tappy Data and Parkinsons Status"
)


fig.update_layout(
    xaxis_title='Features',
    yaxis_title='Features',
    coloraxis_colorbar=dict(title="Correlation"),
    width=800,  
    height=800  
)


fig.show()


In [379]:
explorer_df['Parkinsons'] = explorer_df['Parkinsons'].astype(int)

In [386]:
explorer_df

Unnamed: 0,BirthYear,ID,Gender,Parkinsons,Tremors,DiagnosisYear,Sided,Impact,Levadopa,DA,...,mean_hold_diff,mean_LL,mean_LR,mean_RL,mean_RR,mean_LR_RL_diff,mean_LL_RR_diff,Age,Parkinsons_encoded,AnyDrug
0,1959.0,0QAZFRHQHW,Female,0,False,,,,False,False,...,2.663930,406.716242,411.718182,430.258974,365.736471,18.540793,40.979771,65.0,0,False
1,1944.0,1HOEBIGASW,Male,0,False,,,,False,False,...,1.243978,390.058824,600.433333,536.407143,394.647059,64.026190,4.588235,80.0,0,False
2,1936.0,1XNJCXS3EY,Male,0,False,,,,False,False,...,48.079984,347.882547,313.541489,310.799454,322.170833,2.742036,25.711714,88.0,0,False
3,1936.0,3DIXPRIOSW,Male,0,False,,,,False,False,...,19.412952,528.670445,575.478761,501.274093,493.779630,74.204668,34.890816,88.0,0,False
4,1950.0,48DZPAJ5NS,Male,1,False,2010.0,,Mild,False,False,...,0.862979,300.323155,335.508287,321.131506,332.621036,14.376781,32.297880,74.0,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,1948.0,YQSGN9BMVK,Male,0,False,,,,False,False,...,,,,,,,,76.0,0,False
109,1953.0,YWMIQIQND3,Female,1,True,2016.0,Right,Mild,False,False,...,,,,,,,,71.0,1,False
110,1928.0,YYPKGX6B24,Male,0,False,,,,False,False,...,,,,,,,,96.0,0,False
111,1947.0,Z2UPVHHGBE,Female,1,True,2015.0,Right,Mild,False,False,...,,,,,,,,77.0,1,True


In [380]:
X = explorer_df['Parkinsons'] = explorer_df['Parkinsons'].astype(int).drop(columns=['Parkinsons', 'ID'], axis=1)
y = explorer_df['Parkinsons']

0      0
1      0
2      0
3      0
4      1
      ..
108    0
109    1
110    0
111    1
112    1
Name: Parkinsons, Length: 113, dtype: int32

In [381]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [382]:
lda_model = LDA()
lda_model.fit(X, y)

ValueError: Expected a 2-dimensional container but got <class 'pandas.core.series.Series'> instead. Pass a DataFrame containing a single row (i.e. single sample) or a single column (i.e. single feature) instead.

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, max_depth=5,random_state=42)
from sklearn.model_selection import cross_validate
rf.fit(X, y)
cv = cross_val_score(rf, X, y, cv=5)
print(cv.mean())

In [None]:
pip install shap

In [None]:
import shap

In [None]:
explainer = shap.TreeExplainer(rf)
shap_values = explainer(X)

In [None]:
shap.initjs()
shap.plots.beeswarm(shap_values[:,:,1], max_display=27)

In [None]:
shap.plots.waterfall(shap_values[9,:,0], max_display=27)

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier

In [None]:
from sklearn.model_selection import cross_validate, cross_val_score, train_test_split, KFold, StratifiedKFold

In [None]:
X_train,

In [None]:
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Sample data (replace with your actual data)
X_train = np.random.rand(1000, 20)
y_train = np.random.randint(2, size=1000)
X_test = np.random.rand(200, 20)

# Define XGBoost parameters
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',  # so that the log loss is recorded during training
    # Add other parameters as needed
}

# Convert X_train and y_train
dtrain = xgb.DMatrix(X_train, label=y_train)

# Perform cross-validation
cv_results = xgb.cv(params, dtrain, num_boost_round=100, nfold=5, metrics='logloss', early_stopping_rounds=10)

# Extract log loss values for training and validation sets
train_log_loss = cv_results['train-logloss-mean']
test_log_loss = cv_results['test-logloss-mean']

# Plot the learning curve for log loss
plt.figure(figsize=(10, 6))
plt.plot(train_log_loss, label='Train Log Loss')
plt.plot(test_log_loss, label='Test Log Loss')
plt.xlabel('Number of Boosting Rounds')
plt.ylabel('Log Loss')
plt.title('Log Loss Learning Curve')
plt.legend()
plt.show()

# Get the best number of boosting rounds from cross-validation
best_num_boost_rounds = len(cv_results)

# Split the training data to create a validation set
X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Create DMatrices for the training and validation data
dtrain_part = xgb.DMatrix(X_train_part, label=y_train_part)
dval = xgb.DMatrix(X_val, label=y_val)

# Specify validation sets to watch performance
evals = [(dtrain_part, 'train'), (dval, 'eval')]

# Train the final model using the training part and include early stopping
final_model = xgb.train(params, dtrain_part, num_boost_round=best_num_boost_rounds, evals=evals, early_stopping_rounds=10, verbose_eval=False)

# Make predictions on the validation set
y_val_pred_proba = final_model.predict(dval)
y_val_pred = (y_val_pred_proba > 0.5).astype(int)

# Calculate metrics on the validation set
accuracy = accuracy_score(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred)
recall = recall_score(y_val, y_val_pred)

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')

# Plot the learning curve for log loss, accuracy, F1 score, precision, and recall
metrics = ['logloss', 'accuracy', 'f1', 'precision', 'recall']
results = {
    'logloss': {
        'train': train_log_loss,
        'test': test_log_loss
    },
    'accuracy': {
        'train': [],
        'test': []
    },
    'f1': {
        'train': [],
        'test': []
    },
    'precision': {
        'train': [],
        'test': []
    },
    'recall': {
        'train': [],
        'test': []
    }
}

# Get metrics for each round during training
for i in range(1, best_num_boost_rounds + 1):
    temp_model = xgb.train(params, dtrain_part, num_boost_round=i)
    y_train_pred_proba_temp = temp_model.predict(dtrain_part)
    y_train_pred_temp = (y_train_pred_proba_temp > 0.5).astype(int)
    y_val_pred_proba_temp = temp_model.predict(dval)
    y_val_pred_temp = (y_val_pred_proba_temp > 0.5).astype(int)

    results['accuracy']['train'].append(accuracy_score(y_train_part, y_train_pred_temp))
    results['accuracy']['test'].append(accuracy_score(y_val, y_val_pred_temp))
    results['f1']['train'].append(f1_score(y_train_part, y_train_pred_temp))
    results['f1']['test'].append(f1_score(y_val, y_val_pred_temp))
    results['precision']['train'].append(precision_score(y_train_part, y_train_pred_temp))
    results['precision']['test'].append(precision_score(y_val, y_val_pred_temp))
    results['recall']['train'].append(recall_score(y_train_part, y_train_pred_temp))
    results['recall']['test'].append(recall_score(y_val, y_val_pred_temp))

# Plot the learning curves
fig, axs = plt.subplots(2, 2, figsize=(12, 12))

# Accuracy
axs[0, 0].plot(results['accuracy']['train'], label='Train Accuracy')
axs[0, 0].plot(results['accuracy']['test'], label='Test Accuracy')
axs[0, 0].set_xlabel('Number of Boosting Rounds')
axs[0, 0].set_ylabel('Accuracy')
axs[0, 0].set_title('Accuracy Learning Curve')
axs[0, 0].legend()

# F1 Score
axs[0, 1].plot(results['f1']['train'], label='Train F1 Score')
axs[0, 1].plot(results['f1']['test'], label='Test F1 Score')
axs[0, 1].set_xlabel('Number of Boosting Rounds')
axs[0, 1].set_ylabel('F1 Score')
axs[0, 1].set_title('F1 Score Learning Curve')
axs[0, 1].legend()

# Precision
axs[1, 0].plot(results['precision']['train'], label='Train Precision')
axs[1, 0].plot(results['precision']['test'], label='Test Precision')
axs[1, 0].set_xlabel('Number of Boosting Rounds')
axs[1, 0].set_ylabel('Precision')
axs[1, 0].set_title('Precision Learning Curve')
axs[1, 0].legend()

# Recall
axs[1, 1].plot(results['recall']['train'], label='Train Recall')
axs[1, 1].plot(results['recall']['test'], label='Test Recall')
axs[1, 1].set_xlabel('Number of Boosting Rounds')
axs[1, 1].set_ylabel('Recall')
axs[1, 1].set_title('Recall Learning Curve')
axs[1, 1].legend()

plt.tight_layout()
plt.show()

# Assuming X_test is your test data
dtest = xgb.DMatrix(X_test)

# Make predictions on the test set
test_predictions = final_model.predict(dtest)


In [None]:
pip install plotly