# MLP

In [14]:
import sys
sys.path.append("..")
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from data_prepocess import load_training_data
import numpy as np
import time
import joblib

# Get the data
X_train, y_train = load_training_data('../data/train.csv')

# Convert y_train to binary
# y_train = np.where(y_train > 6, 1, 0)


# Select only the specified features
features = ['alcohol', 'sulphates', 'total sulfur dioxide', 'volatile acidity', 'density', 'chlorides']
X_train = X_train[features]

# Standardize features instead of just normalizing
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)



# Standardize features instead of just normalizing
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# Initialize variables
hidden_layer_sizes = [(32, 64, 128)]
alpha_values = [0.001]
learning_rate_init_values = [0.005] 
max_iter_values = [500]
results = {}

# Define 5-fold cross validation test harness
kfold = KFold(n_splits=5, shuffle=True)

# Perform 5-fold cross validation
for hidden_layer_size in hidden_layer_sizes:
    for alpha in alpha_values:
        for learning_rate_init in learning_rate_init_values:
            for max_iter in max_iter_values:
                start_time = time.time()

                # Define the model with early stopping
                model = MLPClassifier(hidden_layer_sizes=hidden_layer_size, activation='relu', solver='adam',
                                      max_iter=max_iter, alpha=alpha, learning_rate_init=learning_rate_init,
                                      early_stopping=True, n_iter_no_change=10)

                # Perform cross-validation manually to get the number of iterations
                cv_results = []
                for train_index, test_index in kfold.split(X_train):
                    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
                    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]
                    model.fit(X_train_fold, y_train_fold)
                    score = model.score(X_test_fold, y_test_fold)
                    cv_results.append(score)

                # Save the model
                model_directory = "model_validation"
                model_filename = f"{model_directory}/model_{hidden_layer_size}_{alpha}_{learning_rate_init}_{max_iter}.joblib"
                joblib.dump(model, model_filename)

                end_time = time.time()
                elapsed_time = end_time - start_time
                config = (hidden_layer_size, alpha, learning_rate_init, max_iter)
                print("Config: {}\nCross-validation mean accuracy: {:.2f}%\nElapsed time: {:.2f} seconds\nNumber of iterations: {}\n".format(
                    config, np.mean(cv_results)*100, elapsed_time, model.n_iter_))

                results[config] = np.mean(cv_results)

print("Results:", results)

Config: ((32, 64, 128), 0.001, 0.005, 500)
Cross-validation mean accuracy: 58.44%
Elapsed time: 0.60 seconds
Number of iterations: 19

Results: {((32, 64, 128), 0.001, 0.005, 500): 0.5844490711082639}


## Model's paramemters with the best accuracy

In [28]:
# Find the configuration with the highest accuracy
best_config = max(results, key=results.get)
best_accuracy = results[best_config]

# Print the best configuration and its accuracy
print("Best configuration: ", best_config)
print("Best cross-validation mean accuracy: {:.2f}%".format(best_accuracy*100))

Best configuration:  ((32, 64, 128), 0.001, 0.005, 500)
Best cross-validation mean accuracy: 58.62%


## Acuracy on the test set

In [29]:
from data_prepocess import load_test_data
from sklearn.metrics import accuracy_score
# from sklearn.preprocessing import StandardScaler
# import joblib

# Load the test data
X_test, y_test = load_test_data('../data/test.csv')

# Preprocess the test data
scaler = StandardScaler()
X_test = scaler.fit_transform(X_test)

# Load the trained model
model_directory = "model_validation"
model_filename = f"{model_directory}/model_{hidden_layer_size}_{alpha}_{learning_rate_init}_{max_iter}.joblib"
model = joblib.load(model_filename)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test data: {:.2f}%".format(accuracy * 100))

Accuracy on test data: 60.62%


# PCA

In [16]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Apply PCA
n_features = X_train_scaled.shape[1]
pca = PCA(n_components=n_features//2)  # Set the number of components to half the number of features
X_train_pca = pca.fit_transform(X_train_scaled)

# Get the explained variance ratios
explained_variance_ratios = pca.explained_variance_ratio_

# Feature Importance Analysis with Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Load the training data
X_train, y_train = load_training_data('../data/train.csv', target='quality')

# Train a random forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)

# Get the feature importances
importances = clf.feature_importances_

# Sort the feature importances in descending order and get the indices
indices = np.argsort(importances)[::-1]

# Print the features in order of importance
print('Features in order of importance:')
for i in indices:
    print(X_train.columns[i])

Features in order of importance:
alcohol
sulphates
total sulfur dioxide
volatile acidity
density
chlorides
residual sugar
fixed acidity
pH
citric acid
free sulfur dioxide
