In [3]:
from PIL import Image
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt


In [4]:
# Read the three CSV files this notebook works with (same order as before:
# data, train, test) and bind them in a single assignment.
data, train, test = (
    pd.read_csv("../data/datasets/data.csv"),
    pd.read_csv("../data/datasets/train.csv"),
    pd.read_csv("../data/datasets/test.csv"),
)

def transform(X):
    """Load the image named by a row and return its pixels as a 1-D array.

    Parameters
    ----------
    X : mapping-like (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must provide a ``'filename'`` entry; the file is looked up under
        ``../data/datasets/samples/``.

    Returns
    -------
    numpy.ndarray
        The decoded image data flattened to one dimension.
    """
    # Open through a context manager so the underlying file is closed
    # deterministically for every row instead of whenever the image object
    # happens to be garbage collected.
    with Image.open(f"../data/datasets/samples/{X['filename']}") as img:
        # np.array makes its own copy of the pixel data, so the result stays
        # valid after the image is closed.
        pixels = np.array(img)
    return pixels.flatten()

# Attach a 'flattened_image' column to every frame, then preview each one.
_frames_by_name = {"data": data, "train": train, "test": test}

# First pass: run `transform` row-wise on each frame (all three are processed
# before anything is printed, as in the original ordering).
for _frame in _frames_by_name.values():
    _frame['flattened_image'] = _frame.apply(transform, axis=1)

# Second pass: show the first 10 values of the first row's flattened image.
for _name, _frame in _frames_by_name.items():
    print(f"\nSample flattened image from '{_name}':", _frame['flattened_image'][0][:10])


Sample flattened image from 'data': [248 248 248 248 248 248 247 247 247 247]

Sample flattened image from 'train': [252 252 252 255 255 255 248 248 248 250]

Sample flattened image from 'test': [245 245 245 245 245 245 244 244 244 243]


In [5]:
def _features_and_labels(frame):
    """Return ``(X, y)`` for a frame.

    ``X`` stacks the per-row 'flattened_image' arrays into one 2-D array
    (num_samples, flattened_image_length); ``y`` is the 'label' column as an
    array of shape (num_samples,).
    """
    return np.stack(frame['flattened_image'].values), frame['label'].values


X_train, y_train = _features_and_labels(train)
X_test, y_test = _features_and_labels(test)

# Print an identical summary for both splits: shapes, the first 10 pixels of
# the first image, and the first 5 labels.
for _heading, _suffix, _X, _y in (
    ("Training", "train", X_train, y_train),
    ("Test", "test", X_test, y_test),
):
    print(f"\n--- {_heading} Data ---")
    print(f"X_{_suffix} shape: {_X.shape}")
    print(f"Sample X_{_suffix}[0] (first 10 pixels): {_X[0][:10]}")
    print(f"y_{_suffix} shape: {_y.shape}")
    print(f"Sample y_{_suffix} values: {_y[:5]}")


--- Training Data ---
X_train shape: (160, 2352)
Sample X_train[0] (first 10 pixels): [252 252 252 255 255 255 248 248 248 250]
y_train shape: (160,)
Sample y_train values: [8 6 5 0 5]

--- Test Data ---
X_test shape: (40, 2352)
Sample X_test[0] (first 10 pixels): [245 245 245 245 245 245 244 244 244 243]
y_test shape: (40,)
Sample y_test values: [1 2 5 6 1]


In [6]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Baseline: always predict the most frequent training label. Any real model
# should beat the accuracy reported here.
clf = DummyClassifier(strategy="most_frequent")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("\n--- Dummy Classifier ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
# The baseline predicts a single class, so precision for every other class is
# undefined. zero_division=0 reports those entries as 0.0 explicitly instead of
# emitting an UndefinedMetricWarning for each averaged metric.
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))


--- Dummy Classifier ---
Accuracy: 0.1

Classification Report:
               precision    recall  f1-score   support

           0       0.10      1.00      0.18         4
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         4
           5       0.00      0.00      0.00         4
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         4
           8       0.00      0.00      0.00         4
           9       0.00      0.00      0.00         4

    accuracy                           0.10        40
   macro avg       0.01      0.10      0.02        40
weighted avg       0.01      0.10      0.02        40



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees trained on the flattened pixel features.
xgb_params = {
    "n_estimators": 200,
    "learning_rate": 0.1,
    "max_depth": 5,
    "random_state": 0,
}
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

# Evaluate on the test split.
y_pred_xgb = xgb.predict(X_test)
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)

print(f"XGBoost Accuracy: {xgb_accuracy:.2f}")
print(classification_report(y_test, y_pred_xgb))

ModuleNotFoundError: No module named 'lightgbm'

In [8]:
from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, classification_report

# Fit a random forest with default settings; the fixed seed keeps runs repeatable.
rf_clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Score the test split and compute both metrics before printing.
y_pred_rf = rf_clf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)

print("\n--- Random Forest Classifier ---")
print(f"Accuracy: {rf_accuracy:.2f}")  # two decimal places
print("\nClassification Report:\n", rf_report)


--- Random Forest Classifier ---
Accuracy: 0.78

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.75      0.60         4
           1       0.75      0.75      0.75         4
           2       0.60      0.75      0.67         4
           3       1.00      0.50      0.67         4
           4       0.75      0.75      0.75         4
           5       1.00      0.50      0.67         4
           6       1.00      0.75      0.86         4
           7       1.00      1.00      1.00         4
           8       1.00      1.00      1.00         4
           9       0.67      1.00      0.80         4

    accuracy                           0.78        40
   macro avg       0.83      0.78      0.78        40
weighted avg       0.83      0.78      0.78        40



In [None]:

# Train a single decision tree on the same pixel features so its score can be
# compared against the other classifiers in this notebook.
dt_clf = DecisionTreeClassifier(
    random_state=0  # fixed seed for reproducibility
)
dt_clf.fit(X_train, y_train)

# Predict and evaluate. The predictions get their own name so they no longer
# overwrite `y_pred_rf` produced by the random forest cell.
y_pred_dt = dt_clf.predict(X_test)

# Header corrected: this cell reports the decision tree, not the random forest.
print("\n--- Decision Tree Classifier ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_dt):.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_dt))


--- Random Forest Classifier ---
Accuracy: 0.45

Classification Report:
               precision    recall  f1-score   support

           0       0.25      0.50      0.33         4
           1       0.67      0.50      0.57         4
           2       0.50      0.50      0.50         4
           3       0.00      0.00      0.00         4
           4       0.60      0.75      0.67         4
           5       0.20      0.25      0.22         4
           6       1.00      0.75      0.86         4
           7       0.40      0.50      0.44         4
           8       0.50      0.25      0.33         4
           9       1.00      0.50      0.67         4

    accuracy                           0.45        40
   macro avg       0.51      0.45      0.46        40
weighted avg       0.51      0.45      0.46        40

