In [23]:
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from lime import lime_tabular
from ucimlrepo import fetch_ucirepo
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV

Carichiamo il Dataset   

In [24]:
# Carica il dataset con il separatore corretto
file_path = '/home/lollo/Thesis/Python/data/data_mapped.csv'
data = pd.read_csv(file_path, delimiter=';')

Encoding del dataset

In [25]:
# Encode categorical features
data_encoded = data.copy()

# List categorical columns
categorical_cols = data_encoded.select_dtypes(include=['object']).columns

# Initialize the label encoder
label_encoders = {col: LabelEncoder() for col in categorical_cols}

# Apply label encoding to each categorical column
for col, encoder in label_encoders.items():
    data_encoded[col] = encoder.fit_transform(data_encoded[col])

# Define the features (X) and the target (y)
X = data_encoded.drop('Target', axis=1)
y = data_encoded['Target']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Definizione delle colonne da Includere nel dataset

In [26]:
columns_to_include = [
    'Marital status', 'Application order', 'Admission grade',
    'Gender', 'Age at enrollment', 'Curricular units 1st sem (evaluations)',
    'Curricular units 1st sem (approved)', 'GDP', 'Target'
]

# Extract the subset of the dataset
data_subset = data_encoded[columns_to_include]

# Define the features (X) and the target (y) for the subset
X_subset = data_subset.drop('Target', axis=1)
y_subset = data_subset['Target']

# Split the subset dataset into training and test sets
X_train_subset, X_test_subset, y_train_subset, y_test_subset = train_test_split(
    X_subset, y_subset, test_size=0.2, random_state=42
)

RandomForest su tutte le colonne del dataset

In [27]:
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"RandomForest Accuracy (All Columns): {accuracy_rf:.2%}")


RandomForest Accuracy (All Columns): 76.38%


RandomForest su un subset delle colonne del dataset

In [28]:
rf_classifier_subset = RandomForestClassifier(random_state=42)
rf_classifier_subset.fit(X_train_subset, y_train_subset)
y_pred_rf_subset = rf_classifier_subset.predict(X_test_subset)
accuracy_rf_subset = accuracy_score(y_test_subset, y_pred_rf_subset)
print(f"RandomForest Accuracy (Subset Columns): {accuracy_rf_subset:.2%}")

RandomForest Accuracy (Subset Columns): 67.12%


DecisionTreeClassifier su tutte le colonne del datset

In [29]:
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)
y_pred_dt = dt_classifier.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"DecisionTree Accuracy (All Columns): {accuracy_dt:.2%}")

DecisionTree Accuracy (All Columns): 71.19%


DecisionTreeClassifier su un subset delle colonne del datset

In [30]:
dt_classifier_subset = DecisionTreeClassifier(random_state=42)
dt_classifier_subset.fit(X_train_subset, y_train_subset)
y_pred_dt_subset = dt_classifier_subset.predict(X_test_subset)
accuracy_dt_subset = accuracy_score(y_test_subset, y_pred_dt_subset)
print(f"DecisionTree Accuracy (Subset Columns): {accuracy_dt_subset:.2%}")

DecisionTree Accuracy (Subset Columns): 58.19%


XGBoost su tutte le colonne

In [31]:
xgb_classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', n_estimators=100, random_state=42)
xgb_classifier.fit(X_train, y_train)
y_pred_xgb = xgb_classifier.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy (All Columns): {accuracy_xgb:.2%}")

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy (All Columns): 74.58%


XGBoost su un subset delle colonne

In [32]:
xgb_classifier_subset = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', n_estimators=100, random_state=42)
xgb_classifier_subset.fit(X_train_subset, y_train_subset)
y_pred_xgb_subset = xgb_classifier_subset.predict(X_test_subset)
accuracy_xgb_subset = accuracy_score(y_test_subset, y_pred_xgb_subset)
print(f"XGBoost Accuracy (Subset Columns): {accuracy_xgb_subset:.2%}")

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy (Subset Columns): 68.02%


Support Vector Machines (SVM) su tutte le colonne del dataset

In [33]:
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train, y_train)
y_pred_svm = svm_classifier.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy (All Columns): {accuracy_svm:.2%}")

SVM Accuracy (All Columns): 75.14%


Support Vector Machines (SVM) su un subset delle colonne del dataset

In [34]:
svm_classifier_subset = SVC(kernel='linear', random_state=42)
svm_classifier_subset.fit(X_train_subset, y_train_subset)
y_pred_svm_subset = svm_classifier_subset.predict(X_test_subset)
accuracy_svm_subset = accuracy_score(y_test_subset, y_pred_svm_subset)
print(f"SVM Accuracy (Subset Columns): {accuracy_svm_subset:.2%}")

SVM Accuracy (Subset Columns): 69.15%


In [40]:
# Decision Tree con GridSearchCV per il pruning
dt_params = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

dt_classifier = DecisionTreeClassifier(random_state=42)
dt_grid_search = GridSearchCV(estimator=dt_classifier, param_grid=dt_params, cv=5, scoring='accuracy')
dt_grid_search.fit(X_train, y_train)
best_dt = dt_grid_search.best_estimator_
y_pred_dt = best_dt.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"Optimized DecisionTree Accuracy on all Dataset: {accuracy_dt:.2%}")


Optimized DecisionTree Accuracy on all Dataset: 73.45%


In [39]:
# Decision Tree con GridSearchCV per il pruning sul subset
dt_params = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

dt_classifier = DecisionTreeClassifier(random_state=42)
dt_grid_search = GridSearchCV(estimator=dt_classifier, param_grid=dt_params, cv=5, scoring='accuracy')
dt_grid_search.fit(X_train_subset, y_train_subset)
best_dt = dt_grid_search.best_estimator_
y_pred_dt = best_dt.predict(X_test_subset)
accuracy_dt = accuracy_score(y_test_subset, y_pred_dt)
print(f"Optimized DecisionTree Accuracy on Subset: {accuracy_dt:.2%}")

Optimized DecisionTree Accuracy on Subset: 66.44%


In [37]:
# XGBoost con GridSearchCV per l'ottimizzazione degli iperparametri
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

xgb_classifier = XGBClassifier(random_state=42, use_label_encoder=False)
xgb_grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=xgb_params, cv=5, scoring='accuracy')
xgb_grid_search.fit(X_train, y_train)
best_xgb = xgb_grid_search.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"Optimized XGBoost Accuracy: {accuracy_xgb:.2%}")


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

KeyboardInterrupt: 

In [38]:
# Parametri da ottimizzare (stessi di prima)
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Creazione del modello XGBoost
xgb_classifier = XGBClassifier(random_state=42, use_label_encoder=False)

# Configurazione di RandomizedSearchCV
xgb_random_search = RandomizedSearchCV(
    estimator=xgb_classifier,
    param_distributions=xgb_params,
    n_iter=50,  # Numero di iterazioni, ossia quante combinazioni casuali provare
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',
    random_state=42,  # Per riproducibilità
    n_jobs=-1  # Usa tutti i processori disponibili per velocizzare la ricerca
)

# Addestramento del modello con RandomizedSearchCV
xgb_random_search.fit(X_train, y_train)

# Miglior modello trovato e predizione
best_xgb = xgb_random_search.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)

# Calcolo dell'accuratezza
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"Optimized XGBoost Accuracy with RandomizedSearchCV: {accuracy_xgb:.2%}")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Optimized XGBoost Accuracy with RandomizedSearchCV: 75.25%
