In [None]:
import pickle

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold, cross_val_score
from sklearn.model_selection import ParameterGrid, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Load the Higgs dataset. The file is read without a header row; column 0 is
# used as the target and every remaining column as a feature.
df = pd.read_csv('HIGGS_train.csv', header=None, low_memory=False)
df.dropna(inplace=True)

# Separate the target variable and features
X = df.iloc[:, 1:]
y = np.array(df.iloc[:, 0])

# Decide which rows belong to the training and testing sets BEFORE fitting any
# transformer, so that PCA and the scaler never see held-out rows (fitting them
# on the full dataset leaks test-set statistics into the features).
# Splitting row positions with the same test_size/random_state selects the same
# rows as splitting the data itself, because the shuffle depends only on the
# number of samples and the seed.
row_positions = np.arange(len(X))
train_idx, test_idx = train_test_split(row_positions, test_size=0.3, random_state=42)

# Dimensionality reduction: fit on training rows only, then project every row.
# NOTE(review): PCA is applied to unscaled features and the components are
# standardized afterwards; this order is kept from the original pipeline.
pca = PCA(n_components=10, random_state=42)
pca.fit(X.iloc[train_idx])
X_pca = pca.transform(X)

# Preprocessing: standardize using the training rows' mean and variance only
scaler = StandardScaler()
scaler.fit(X_pca[train_idx])
X_scaled = scaler.transform(X_pca)

# Split the data into training and testing sets
X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Hyper-parameter search spaces, one per model family. Every value is a list
# of candidates; single-element lists pin a setting (e.g. the random seed).

# Decision tree grid
dt_params = dict(
    max_depth=[5, 10, 15],
    criterion=['gini', 'entropy'],
    splitter=['random', 'best'],
    random_state=[42],
)

# Random forest grid
rf_params = dict(
    n_estimators=[5, 10, 15],
    max_depth=[5, 10],
    criterion=['entropy'],
    random_state=[42],
)

# XGBoost grid
xgb_params = dict(
    max_depth=[5, 10, 15],
    learning_rate=[0.01, 0.1],
    n_estimators=[5, 10],
    random_state=[42],
)

# Cross-validation splitters compared below: both use 5 shuffled folds with a
# fixed seed; one is plain K-fold, the other stratified K-fold.
skf = StratifiedKFold(random_state=42, shuffle=True, n_splits=5)
kf = KFold(random_state=42, shuffle=True, n_splits=5)

# Evaluate every hyper-parameter combination of every model family.
#
# For each combination: fit once on the training set, print its confusion
# matrix on the test set, score it with both cross-validation splitters on
# four metrics, and remember the combination with the highest StratifiedKFold
# accuracy in ``best_models`` (stored as its own fitted estimator).
#
# ``results`` maps model name -> list of (cv_method, metric, mean_score).
results = {}
best_models = {}

scoring_metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
cv_methods = {'KFold': kf, 'StratifiedKFold': skf}

for model_name, model, params in [('Decision Tree', DecisionTreeClassifier(), dt_params),
                                  ('Random Forest', RandomForestClassifier(), rf_params),
                                  ('XGBoost', XGBClassifier(), xgb_params)]:
    model_results = []
    best_accuracy = float('-inf')
    print(f'{model_name}:')
    # ParameterGrid yields the same parameter dicts, in the same order, that a
    # fitted GridSearchCV exposes in cv_results_['params'], without paying
    # for a complete grid search whose scores were never used.
    for param_set in ParameterGrid(params):
        print(f'Parameters: {param_set}')

        # Work on a fresh clone for every candidate. set_params() returns the
        # estimator itself, so storing `model.set_params(...)` would make every
        # "best model" an alias of the one shared object, silently overwritten
        # by the next candidate.
        clf = clone(model).set_params(**param_set).fit(X_train, y_train)

        # The confusion matrix depends only on the candidate, not on the CV
        # method or metric, so compute it once and reuse it for each report.
        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)

        mean_scores = {}
        for cv_method, cv in cv_methods.items():
            # One cross-validation run per splitter scores all metrics at once
            # (cross_validate clones the estimator internally for each fold).
            cv_scores = cross_validate(clf, X_train, y_train, cv=cv,
                                       scoring=scoring_metrics, n_jobs=-1)
            for metric in scoring_metrics:
                mean_score = cv_scores[f'test_{metric}'].mean()
                mean_scores[(cv_method, metric)] = mean_score
                model_results.append((cv_method, metric, mean_score))

                print(f'Cross-validation method: {cv_method}')
                print(f'Scoring metric: {metric}')

                # Print the confusion matrix
                print(f'Confusion matrix: \n{cm}')

                print(f'{metric} score: {mean_score}')
                print("\n")

        # Save the best model for this category, judged by StratifiedKFold
        # accuracy. The first candidate is always kept so that every category
        # ends up with a model even if a score is NaN.
        candidate_accuracy = mean_scores[('StratifiedKFold', 'accuracy')]
        if model_name not in best_models or candidate_accuracy > best_accuracy:
            best_models[model_name] = clf
            best_accuracy = candidate_accuracy

    results[model_name] = model_results
    print('\n')
    
# Write each category's selected model to "<model name>_best_model.pkl"
for model_name in best_models:
    model = best_models[model_name]
    target_path = f'{model_name}_best_model.pkl'
    with open(target_path, 'wb') as f:
        pickle.dump(model, f)
        print(f'Saved {model_name} best model')

# Summary: one line per recorded (CV method, metric, mean score) entry,
# grouped by model category
print('\n')
print('Results:')
for model_name in results:
    model_results = results[model_name]
    print(f'{model_name}:')
    for entry in model_results:
        cv_method, metric, score = entry
        print(f'CV method: {cv_method}, Metric: {metric}, Score: {score:.6f}')


Decision Tree:
Parameters: {'criterion': 'gini', 'max_depth': 5, 'random_state': 42, 'splitter': 'random'}
Cross-validation method: KFold
Scoring metric: accuracy
Confusion matrix: 
[[23689 60861]
 [20844 74606]]
accuracy score: 0.5473372520874682


Cross-validation method: KFold
Scoring metric: precision_macro
Confusion matrix: 
[[23689 60861]
 [20844 74606]]
precision_macro score: 0.5440363908214219


Cross-validation method: KFold
Scoring metric: recall_macro
Confusion matrix: 
[[23689 60861]
 [20844 74606]]
recall_macro score: 0.5331093878144889


Cross-validation method: KFold
Scoring metric: f1_macro
Confusion matrix: 
[[23689 60861]
 [20844 74606]]
f1_macro score: 0.5090128368652571


Cross-validation method: StratifiedKFold
Scoring metric: accuracy
Confusion matrix: 
[[23689 60861]
 [20844 74606]]
accuracy score: 0.5457991333230157


Cross-validation method: StratifiedKFold
Scoring metric: precision_macro
Confusion matrix: 
[[23689 60861]
 [20844 74606]]
precision_macro score: 