# Data Preprocessing

## Loading the dataset

In [None]:
import pandas as pd

# Read the colour samples from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='data.csv')

# Show the first five rows (the default for head()) as a quick sanity check.
first_rows = data.head(n=5)
print(first_rows)

   red  green  blue   label
0  226    158    50  Orange
1   76     21    88  Purple
2  146     70   179  Purple
3   18    240   118   Green
4  146    253   227   Green


## Preprocess the data

In [None]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Rescale each RGB channel with MinMaxScaler (default output range [0, 1]).
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# train/validation/test split performed later in the notebook.
rgb_columns = ['red', 'green', 'blue']
scaler = MinMaxScaler()
data[rgb_columns] = scaler.fit_transform(data[rgb_columns])

# Map every colour name to an integer id and keep it alongside the name.
label_encoder = LabelEncoder()
label_encoder.fit(data['label'])
data['encoded_label'] = label_encoder.transform(data['label'])

print(f"Preprocessed Data:\n{data.head()}")

Preprocessed Data:
        red     green      blue   label  encoded_label
0  0.886275  0.619608  0.196078  Orange              5
1  0.298039  0.082353  0.345098  Purple              7
2  0.572549  0.274510  0.701961  Purple              7
3  0.070588  0.941176  0.462745   Green              3
4  0.572549  0.992157  0.890196   Green              3


## Split the data

In [None]:
from sklearn.model_selection import train_test_split

# Features are the three colour channels; the target is the colour name
# (the string label, not the encoded id).
X = data.loc[:, ['red', 'green', 'blue']].to_numpy()
y = data.loc[:, 'label'].to_numpy()

# Hold out 30% of the rows first, then cut that hold-out in half:
# 70% training / 15% validation / 15% test overall.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Report how many samples ended up in each split.
for size_label, split_features in (
    ("Training set size:", X_train),
    ("Validation set size:", X_val),
    ("Test set size:", X_test),
):
    print(size_label, len(split_features))

Training set size: 3536
Validation set size: 758
Test set size: 758


# Model

## 1 - KNN

### Baseline model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Baseline KNN classifier; it is only constructed here and is fitted in the
# following cell.
baseline_k = 5
knn = KNeighborsClassifier(n_neighbors=baseline_k)

# Bare expression so the notebook displays the estimator.
knn


In [None]:
import numpy as np

# Stack the training and validation splits row-wise into one training set;
# these combined arrays are reused by later cells.
X_combined_train = np.vstack((X_train, X_val))
y_combined_train = np.concatenate((y_train, y_val))

# fit() returns the estimator itself, so fitting and predicting can be chained.
y_pred = knn.fit(X_combined_train, y_combined_train).predict(X_test)

# Accuracy of the baseline on the test split.
print(f'KNN Accuracy: {accuracy_score(y_test, y_pred)}')

KNN Accuracy: 0.8390501319261213


### KNN fine-tuning
Experiment with different hyperparameter values (here, `n_neighbors` for KNN) using a cross-validated grid search to improve performance.

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate neighbourhood sizes: every odd k from 3 to 21.
param_grid = {'n_neighbors': list(range(3, 22, 2))}

# Exhaustive search scored with 5-fold cross-validation on the combined
# train + validation data.
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_combined_train, y_combined_train)

print(f'Best KNN Parameters: {grid_search.best_params_}')


Best KNN Parameters: {'n_neighbors': 9}


### Final KNN model using the best hyperparameters

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Rebuild KNN with the neighbour count picked by the grid search and fit it on
# the combined train + validation data (fit() returns the estimator).
best_k = grid_search.best_params_['n_neighbors']
knn = KNeighborsClassifier(n_neighbors=best_k).fit(X_combined_train, y_combined_train)

# Accuracy of the tuned model on the test split.
y_pred = knn.predict(X_test)
print(f'KNN Accuracy: {accuracy_score(y_test, y_pred)}')


KNN Accuracy: 0.8548812664907651


### KNN Evaluation

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the current `knn` model on the test
# split; target_names labels the report rows with the encoder's class names.
print("KNN Classification Report:")
knn_report = classification_report(
    y_test,
    knn.predict(X_test),
    target_names=label_encoder.classes_,
)
print(knn_report)

KNN Classification Report:
              precision    recall  f1-score   support

       Black       1.00      0.91      0.95        11
        Blue       0.93      0.90      0.91       169
       Brown       0.72      0.92      0.81        52
       Green       0.91      0.94      0.93       212
        Grey       0.78      0.81      0.79        26
      Orange       0.82      0.82      0.82        34
        Pink       0.82      0.80      0.81        79
      Purple       0.79      0.73      0.76        90
         Red       0.71      0.77      0.74        35
       White       0.75      0.38      0.50         8
      Yellow       0.91      0.71      0.80        42

    accuracy                           0.85       758
   macro avg       0.83      0.79      0.80       758
weighted avg       0.86      0.85      0.85       758



## 2 - Random Forest

### Baseline model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline Random Forest with hand-picked regularization parameters.
rf = RandomForestClassifier(
    n_estimators=100,          # Number of trees
    max_depth=10,              # Maximum depth of the trees
    min_samples_split=5,       # Minimum number of samples required to split a node
    min_samples_leaf=3,        # Minimum number of samples required at a leaf node
    random_state=42
)

# Fit on the training split only and score on the validation split.
# Why: the tuned Random Forest in the next section is also fitted on X_train
# and judged on X_val, so the two numbers are directly comparable. Scoring the
# baseline on X_test (after training on train + validation) mixed a larger
# training set into the comparison and consulted the test split before model
# selection was finished.
rf.fit(X_train, y_train)

# Evaluate the baseline on the validation split
y_val_pred_rf_baseline = rf.predict(X_val)
print(f'Random Forest Validation Accuracy: {accuracy_score(y_val, y_val_pred_rf_baseline)}')
print("Random Forest Validation Classification Report:")
print(classification_report(y_val, y_val_pred_rf_baseline, target_names=label_encoder.classes_))


Random Forest Accuracy: 0.8654353562005277
Random Forest Classification Report:
              precision    recall  f1-score   support

       Black       1.00      0.91      0.95        11
        Blue       0.93      0.92      0.93       169
       Brown       0.75      0.88      0.81        52
       Green       0.89      0.97      0.93       212
        Grey       0.79      0.58      0.67        26
      Orange       0.77      0.79      0.78        34
        Pink       0.80      0.87      0.84        79
      Purple       0.87      0.79      0.83        90
         Red       0.79      0.66      0.72        35
       White       0.75      0.38      0.50         8
      Yellow       0.91      0.76      0.83        42

    accuracy                           0.87       758
   macro avg       0.84      0.77      0.80       758
weighted avg       0.87      0.87      0.86       758



### Random Forest Hyperparameter Tuning

In [None]:
# Search space for the Random Forest: 3 * 4 * 3 * 3 * 2 = 216 combinations.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# 5-fold cross-validated grid search; n_jobs=-1 runs the fits in parallel.
grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    cv=5,
    n_jobs=-1,
)

# The search only sees the training split; validation data stays held out.
grid_search_rf.fit(X_train, y_train)

# Estimator with the best cross-validation score found by the search.
best_rf = grid_search_rf.best_estimator_

# Check the selected model against the validation split.
y_val_pred_rf = best_rf.predict(X_val)
print(f'Random Forest Validation Accuracy: {accuracy_score(y_val, y_val_pred_rf)}')
print("Random Forest Validation Classification Report:")
rf_val_report = classification_report(
    y_val,
    y_val_pred_rf,
    target_names=label_encoder.classes_,
)
print(rf_val_report)


Random Forest Validation Accuracy: 0.8482849604221636
Random Forest Validation Classification Report:
              precision    recall  f1-score   support

       Black       0.86      1.00      0.92         6
        Blue       0.85      0.94      0.89       153
       Brown       0.81      0.82      0.82        68
       Green       0.92      0.93      0.92       230
        Grey       0.83      0.47      0.60        32
      Orange       0.76      0.91      0.83        32
        Pink       0.81      0.84      0.82        92
      Purple       0.81      0.69      0.74        78
         Red       0.85      0.62      0.72        37
       White       0.40      1.00      0.57         2
      Yellow       0.75      0.86      0.80        28

    accuracy                           0.85       758
   macro avg       0.79      0.82      0.79       758
weighted avg       0.85      0.85      0.84       758



### Final evaluation - Random Forest

In [None]:
# Score the grid-search winner once on the held-out test split.
y_test_pred_rf = best_rf.predict(X_test)
print(f'Random Forest Test Accuracy: {accuracy_score(y_test, y_test_pred_rf)}')
print("Random Forest Test Classification Report:")
rf_test_report = classification_report(
    y_test,
    y_test_pred_rf,
    target_names=label_encoder.classes_,
)
print(rf_test_report)

# Show which hyperparameter combination the search selected.
print("Best parameters found:", grid_search_rf.best_params_, sep="\n")

Random Forest Test Accuracy: 0.8548812664907651
Random Forest Test Classification Report:
              precision    recall  f1-score   support

       Black       1.00      0.91      0.95        11
        Blue       0.90      0.92      0.91       169
       Brown       0.79      0.87      0.83        52
       Green       0.88      0.94      0.91       212
        Grey       0.77      0.65      0.71        26
      Orange       0.80      0.82      0.81        34
        Pink       0.79      0.85      0.82        79
      Purple       0.85      0.77      0.81        90
         Red       0.73      0.69      0.71        35
       White       0.80      0.50      0.62         8
      Yellow       0.91      0.71      0.80        42

    accuracy                           0.85       758
   macro avg       0.84      0.78      0.81       758
weighted avg       0.86      0.85      0.85       758

Best parameters found:
{'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_s

## 3 - Gradient Boosting Model (Boosting)

### Baseline model

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Baseline Gradient Boosting classifier with hand-picked settings.
gb_model = GradientBoostingClassifier(
    random_state=42,
    n_estimators=100,        # number of boosting stages
    learning_rate=0.1,       # shrinkage applied to each tree's contribution
    subsample=0.9,           # fraction of samples drawn for each tree
    max_depth=3,             # depth limit of the individual trees
    min_samples_split=2,     # samples needed to split an internal node
    min_samples_leaf=1,      # samples needed at a leaf node
)

# Train on the training split and predict the validation split
# (fit() returns the estimator, so the two calls are chained).
y_val_pred_gb = gb_model.fit(X_train, y_train).predict(X_val)

# Validation accuracy followed by the per-class report.
print(f'Gradient Boosting Validation Accuracy: {accuracy_score(y_val, y_val_pred_gb)}')
print("Gradient Boosting Validation Classification Report:")
gb_val_report = classification_report(
    y_val,
    y_val_pred_gb,
    target_names=label_encoder.classes_,
)
print(gb_val_report)

Gradient Boosting Validation Accuracy: 0.8403693931398417
Gradient Boosting Validation Classification Report:
              precision    recall  f1-score   support

       Black       0.86      1.00      0.92         6
        Blue       0.86      0.94      0.90       153
       Brown       0.78      0.82      0.80        68
       Green       0.92      0.91      0.92       230
        Grey       0.76      0.50      0.60        32
      Orange       0.78      0.88      0.82        32
        Pink       0.80      0.83      0.81        92
      Purple       0.80      0.65      0.72        78
         Red       0.83      0.68      0.75        37
       White       0.50      1.00      0.67         2
      Yellow       0.68      0.82      0.74        28

    accuracy                           0.84       758
   macro avg       0.78      0.82      0.79       758
weighted avg       0.84      0.84      0.84       758



### Gradient Boosting Hyperparameter Tuning

In [None]:
# Search space for Gradient Boosting: 3 * 3 * 3 = 27 combinations. The last
# three entries are single-valued, i.e. held fixed during the search.
param_grid_gb = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_samples_split=[2],     # samples needed to split an internal node
    min_samples_leaf=[1],      # samples needed at a leaf node
    subsample=[0.9],
)

# 5-fold cross-validated grid search; n_jobs=-1 runs the fits in parallel.
grid_search_gb = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=param_grid_gb,
    cv=5,
    n_jobs=-1,
)

# The search only sees the training split; validation data stays held out.
grid_search_gb.fit(X_train, y_train)

# Estimator with the best cross-validation score found by the search.
best_gb = grid_search_gb.best_estimator_

# Check the selected model against the validation split.
y_val_pred_gb = best_gb.predict(X_val)
print(f'Gradient Boosting Validation Accuracy: {accuracy_score(y_val, y_val_pred_gb)}')
print("Gradient Boosting Validation Classification Report:")
gb_val_report = classification_report(
    y_val,
    y_val_pred_gb,
    target_names=label_encoder.classes_,
)
print(gb_val_report)

Gradient Boosting Validation Accuracy: 0.8403693931398417
Gradient Boosting Validation Classification Report:
              precision    recall  f1-score   support

       Black       0.86      1.00      0.92         6
        Blue       0.86      0.94      0.90       153
       Brown       0.78      0.82      0.80        68
       Green       0.92      0.91      0.92       230
        Grey       0.76      0.50      0.60        32
      Orange       0.78      0.88      0.82        32
        Pink       0.80      0.83      0.81        92
      Purple       0.80      0.65      0.72        78
         Red       0.83      0.68      0.75        37
       White       0.50      1.00      0.67         2
      Yellow       0.68      0.82      0.74        28

    accuracy                           0.84       758
   macro avg       0.78      0.82      0.79       758
weighted avg       0.84      0.84      0.84       758



### Final evaluation - Boosting

In [None]:
# Score the grid-search winner once on the held-out test split.
y_test_pred_gb = best_gb.predict(X_test)
print(f'Gradient Boosting Test Accuracy: {accuracy_score(y_test, y_test_pred_gb)}')
print("Gradient Boosting Test Classification Report:")
gb_test_report = classification_report(
    y_test,
    y_test_pred_gb,
    target_names=label_encoder.classes_,
)
print(gb_test_report)

# Show which hyperparameter combination the search selected.
print("Best parameters found:", grid_search_gb.best_params_, sep="\n")

Gradient Boosting Test Accuracy: 0.8509234828496042
Gradient Boosting Test Classification Report:
              precision    recall  f1-score   support

       Black       0.90      0.82      0.86        11
        Blue       0.91      0.91      0.91       169
       Brown       0.79      0.88      0.84        52
       Green       0.88      0.92      0.90       212
        Grey       0.67      0.62      0.64        26
      Orange       0.76      0.82      0.79        34
        Pink       0.80      0.87      0.84        79
      Purple       0.87      0.81      0.84        90
         Red       0.78      0.71      0.75        35
       White       1.00      0.50      0.67         8
      Yellow       0.82      0.64      0.72        42

    accuracy                           0.85       758
   macro avg       0.83      0.77      0.79       758
weighted avg       0.85      0.85      0.85       758

Best parameters found:
{'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 1, 'min

## 4 - Support Vector Machine (SVM)

### Baseline model

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Baseline SVM with hand-picked initial parameters.
svm = SVC(
    C=1.0,                    # Regularization parameter
    kernel='rbf',             # Type of kernel
    gamma='scale',            # Kernel coefficient
    random_state=42
)

# Fit on the training split only and score on the validation split.
# Why: the tuned SVM in the next section is also fitted on X_train and judged
# on X_val, so the two numbers are directly comparable. Scoring the baseline
# on X_test (after training on train + validation) mixed a larger training set
# into the comparison and consulted the test split before model selection was
# finished.
svm.fit(X_train, y_train)

# Evaluate the baseline on the validation split
y_val_pred_svm_baseline = svm.predict(X_val)
print(f'SVM Validation Accuracy: {accuracy_score(y_val, y_val_pred_svm_baseline)}')
print("SVM Validation Classification Report:")
print(classification_report(y_val, y_val_pred_svm_baseline, target_names=label_encoder.classes_))


SVM Accuracy: 0.8812664907651715
SVM Classification Report:
              precision    recall  f1-score   support

       Black       1.00      0.82      0.90        11
        Blue       0.95      0.95      0.95       169
       Brown       0.76      0.87      0.81        52
       Green       0.93      0.97      0.95       212
        Grey       0.69      0.77      0.73        26
      Orange       0.79      0.76      0.78        34
        Pink       0.82      0.87      0.85        79
      Purple       0.89      0.80      0.84        90
         Red       0.76      0.74      0.75        35
       White       0.75      0.38      0.50         8
      Yellow       0.91      0.76      0.83        42

    accuracy                           0.88       758
   macro avg       0.84      0.79      0.81       758
weighted avg       0.88      0.88      0.88       758



### SVM Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for SVM as one sub-grid per kernel, so that each
# kernel is only crossed with the parameters it actually uses:
#   - linear: neither gamma nor degree applies
#   - rbf:    gamma applies, degree does not
#   - poly:   both gamma and degree apply
# A single flat grid would cross degree with every kernel and gamma with the
# linear kernel, fitting 54 candidates of which 27 are exact duplicates.
# This list yields the 3 + 6 + 18 = 27 distinct candidates.
svm_c_values = [0.1, 1, 10]            # Regularization parameter
svm_gamma_values = ['scale', 'auto']   # Kernel coefficient
param_grid_svm = [
    {'kernel': ['linear'], 'C': svm_c_values},
    {'kernel': ['rbf'], 'C': svm_c_values, 'gamma': svm_gamma_values},
    {
        'kernel': ['poly'],
        'C': svm_c_values,
        'gamma': svm_gamma_values,
        'degree': [3, 4, 5],           # Degree for polynomial kernel
    },
]

# Initialize GridSearchCV for SVM (5-fold CV; n_jobs=-1 runs fits in parallel)
grid_search_svm = GridSearchCV(SVC(random_state=42), param_grid_svm, cv=5, n_jobs=-1)

# Fit GridSearchCV on the training data only; validation data stays held out
grid_search_svm.fit(X_train, y_train)

# Get the best model from GridSearchCV
best_svm = grid_search_svm.best_estimator_

# Evaluate the best model on the validation set
y_val_pred_svm = best_svm.predict(X_val)
print(f'SVM Validation Accuracy: {accuracy_score(y_val, y_val_pred_svm)}')
print("SVM Validation Classification Report:")
print(classification_report(y_val, y_val_pred_svm, target_names=label_encoder.classes_))


SVM Validation Accuracy: 0.8707124010554089
SVM Validation Classification Report:
              precision    recall  f1-score   support

       Black       1.00      1.00      1.00         6
        Blue       0.88      0.96      0.92       153
       Brown       0.81      0.87      0.84        68
       Green       0.94      0.92      0.93       230
        Grey       0.83      0.59      0.69        32
      Orange       0.79      0.84      0.82        32
        Pink       0.87      0.83      0.85        92
      Purple       0.86      0.78      0.82        78
         Red       0.82      0.76      0.79        37
       White       0.50      1.00      0.67         2
      Yellow       0.68      0.82      0.74        28

    accuracy                           0.87       758
   macro avg       0.82      0.85      0.82       758
weighted avg       0.87      0.87      0.87       758



### Final evaluation - SVM

In [None]:
# Score the grid-search winner once on the held-out test split.
y_test_pred_svm = best_svm.predict(X_test)
print(f'SVM Test Accuracy: {accuracy_score(y_test, y_test_pred_svm)}')
print("SVM Test Classification Report:")
svm_test_report = classification_report(
    y_test,
    y_test_pred_svm,
    target_names=label_encoder.classes_,
)
print(svm_test_report)

# Show which hyperparameter combination the search selected.
print("Best parameters found for SVM:", grid_search_svm.best_params_, sep="\n")


SVM Test Accuracy: 0.8707124010554089
SVM Test Classification Report:
              precision    recall  f1-score   support

       Black       0.90      0.82      0.86        11
        Blue       0.92      0.93      0.92       169
       Brown       0.79      0.88      0.84        52
       Green       0.93      0.95      0.94       212
        Grey       0.73      0.73      0.73        26
      Orange       0.78      0.82      0.80        34
        Pink       0.81      0.86      0.83        79
      Purple       0.86      0.80      0.83        90
         Red       0.76      0.71      0.74        35
       White       1.00      0.38      0.55         8
      Yellow       0.89      0.76      0.82        42

    accuracy                           0.87       758
   macro avg       0.85      0.79      0.80       758
weighted avg       0.87      0.87      0.87       758

Best parameters found for SVM:
{'C': 10, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
