In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from scipy import stats

# Read the banana-quality table; the file is decoded as ISO-8859-1.
file_path = './datasets/banana_quality.csv'
data = pd.read_csv(file_path, encoding='ISO-8859-1')

# Z-score every column except the 'Quality' target, then flag a row as an
# outlier when at least one of its features has an absolute z-score above 3.
z_scores = stats.zscore(data.drop(columns='Quality'))
outliers = (np.abs(z_scores) > 3).any(axis=1)
print(f'Original Data: {data.shape}')
print(f'Outlier Data: {data.loc[outliers].shape}')

# Keep only the rows that were not flagged.
data_selected = data[~outliers]

# Feature matrix: every column but the target. The feature names are also
# kept as a standalone array in `numerical_cols`.
X = data_selected.drop(columns='Quality')
numerical_cols = X.columns.values.copy()

# Encode the 'Quality' labels as integers; the fitted encoder stays in `le`
# so encoded predictions can be mapped back to the original labels.
le = LabelEncoder()
y = le.fit_transform(data_selected['Quality'])

# Hold out 20% of the rows for evaluation; the seed is fixed so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Number of rows per 'Quality' label, counted on the full `data` frame
# (not on the outlier-filtered `data_selected`).
quality_counts = data.loc[:, 'Quality'].value_counts()
print(quality_counts)

In [None]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Calculate VIF for each feature in your dataset.
#
# variance_inflation_factor regresses column i of the matrix it is given on
# all the other columns and does not add an intercept itself. A constant
# column is therefore prepended here; without it the auxiliary regressions are
# forced through the origin and the VIFs are distorted whenever the features
# are not exactly mean-zero.
vif_design = np.column_stack([np.ones(len(X)), X.to_numpy(dtype=float)])

vif = pd.DataFrame()
# Column 0 of `vif_design` is the constant, so feature j sits at column j + 1.
# The constant's own VIF is not reported.
vif["VIF Factor"] = [
    variance_inflation_factor(vif_design, j + 1) for j in range(X.shape[1])
]
vif["features"] = X.columns

print(vif)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Fit a Logistic Regression model (default hyper-parameters) on the training
# split and predict the held-out split.
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)
logreg_predictions = logreg_model.predict(X_test)

# Calculate metrics for Logistic Regression
logreg_accuracy = accuracy_score(y_test, logreg_predictions)
logreg_precision = precision_score(y_test, logreg_predictions)
logreg_recall = recall_score(y_test, logreg_predictions)
logreg_f1 = f1_score(y_test, logreg_predictions)

print("Logistic Regression: Accuracy = %.3f, Precision = %.3f, Recall = %.3f, F1 = %.3f" % (logreg_accuracy, logreg_precision, logreg_recall, logreg_f1))

# Compute the confusion matrix with the class order pinned to the encoded
# labels 0 and 1, so it is always 2x2 (rows = actual, columns = predicted) and
# lines up with the TN/FP/FN/TP annotations below even if one class happens to
# be absent from y_test.
cnf_matrix = confusion_matrix(y_test, logreg_predictions, labels=[0, 1])

# Cell names in row-major order; encoded class 1 is the positive class.
cell_names = ['TN', 'FP', 'FN', 'TP']

# Combine cell names and counts into the per-cell annotation text.
labels = np.asarray(
    ["{0}\n{1}".format(cell_name, count)
     for cell_name, count in zip(cell_names, cnf_matrix.flatten())]
).reshape(2, 2)

# Plot confusion matrix with seaborn. The ticks show the original class names
# (le.classes_[k] is the label that was encoded as k) instead of bare 0/1, and
# the figure carries a title so it can be read on its own.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    cnf_matrix,
    annot=labels,
    fmt='',
    cmap='Blues',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)
ax.set_title('Logistic Regression: confusion matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
from sklearn.svm import SVC

# Fit a Support Vector Classifier (default hyper-parameters) on the training
# split and predict the held-out split.
svc_model = SVC()
svc_model.fit(X_train, y_train)
svc_predictions = svc_model.predict(X_test)

# Calculate metrics for Support Vector Classifier
svc_accuracy = accuracy_score(y_test, svc_predictions)
svc_precision = precision_score(y_test, svc_predictions)
svc_recall = recall_score(y_test, svc_predictions)
svc_f1 = f1_score(y_test, svc_predictions)

print("Support Vector Classifier: Accuracy = %.3f, Precision = %.3f, Recall = %.3f, F1 = %.3f" % (svc_accuracy, svc_precision, svc_recall, svc_f1))

# A single hand-written sample to classify, keyed by feature name.
dummy_data = {
    'Size': 0.326839,
    'Weight': -0.878966,
    'Sweetness': 3.84747,
    'Softness': -3.056569,
    'HarvestTime': 1.026182,
    'Ripeness': 4.707825,
    'Acidity': 1.216529,
}

# Build a one-row frame and put its columns in the same order as the training
# features, so the prediction does not depend on the key order of the dict
# literal above. A feature missing from the dict raises KeyError here.
dummy_data_df = pd.DataFrame([dummy_data])[list(X_train.columns)]
svc_dummy_pred = svc_model.predict(dummy_data_df)

# Map the encoded prediction back to its original 'Quality' label.
print(le.inverse_transform(svc_dummy_pred))

In [None]:

from sklearn.ensemble import RandomForestClassifier

# Create a Random Forest Classifier model.
# The forest is built with random bootstrap samples and random feature
# subsets, so without a fixed seed the metrics below change on every run.
# 42 is the same seed the train/test split uses.
rfc_model = RandomForestClassifier(random_state=42)
rfc_model.fit(X_train, y_train)
rfc_predictions = rfc_model.predict(X_test)

# Calculate metrics for Random Forest Classifier on the held-out split
rfc_accuracy = accuracy_score(y_test, rfc_predictions)
rfc_precision = precision_score(y_test, rfc_predictions)
rfc_recall = recall_score(y_test, rfc_predictions)
rfc_f1 = f1_score(y_test, rfc_predictions)

print("Random Forest Classifier: Accuracy = %.3f, Precision = %.3f, Recall = %.3f, F1 = %.3f" % (rfc_accuracy, rfc_precision, rfc_recall, rfc_f1))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 3-nearest-neighbour classifier on the training split (fit returns the
# estimator itself) and predict the held-out split.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
knn_predictions = knn.predict(X_test)

# Score the same held-out predictions with each metric in turn.
knn_accuracy, knn_precision, knn_recall, knn_f1 = (
    metric(y_test, knn_predictions)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    "K-Nearest Neighbor Classifier: Accuracy = %.3f, Precision = %.3f, Recall = %.3f, F1 = %.3f"
    % (knn_accuracy, knn_precision, knn_recall, knn_f1)
)