In [1]:
import numpy as np
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
def load_data(filename):
    """Load pickled feature buckets and labels as aligned NumPy arrays.

    The pickle file must hold a 2-item ``(x, y)`` pair. ``x`` is a sequence of
    bucket objects exposing ``.shape`` and ``.to_numpy()`` (presumably pandas
    DataFrames -- confirm against the preprocessing step) and ``y`` holds one
    label per bucket.

    Buckets whose row count differs from the first bucket's are dropped so
    that all flattened samples share one length. A dropped bucket's label is
    dropped together with it, so ``x[i]`` and ``y[i]`` always describe the
    same bucket even when the discarded bucket is not the last one.

    Args:
        filename: Path to the pickle file to read.

    Returns:
        Tuple ``(x, y)``: ``x`` is an array with one flattened bucket per row,
        ``y`` is the array of labels for exactly those buckets.

    Raises:
        IndexError: If ``x`` is an empty sequence (there is no first bucket to
            take the reference row count from).
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only point this at files produced by a trusted pipeline.
    with open(filename, 'rb') as f:
        x, y = pickle.load(f)

    # The first bucket defines the expected number of rows per sample.
    rows_count = x[0].shape[0]

    # Filter buckets and labels as pairs. Truncating y to len(x) after
    # filtering x alone would shift every label that follows a dropped bucket.
    kept = [
        (bucket, label)
        for bucket, label in zip(x, y)
        if bucket.shape[0] == rows_count
    ]

    x = np.array([bucket.to_numpy().flatten() for bucket, _ in kept])
    y = np.array([label for _, label in kept])

    return x, y


In [3]:
x_data_np, y_data_np = load_data('preprocessed_data/processed_data.pkl')

# Split data into training and test sets.
# stratify keeps the class proportions identical in both splits, so the rare
# class is not under- or over-represented in the test set by chance.
x_train, x_test, y_train, y_test = train_test_split(
    x_data_np,
    y_data_np,
    test_size=0.2,
    random_state=42,
    stratify=y_data_np,
)

# Train a classifier (example with RandomForest).
# A fixed random_state makes the fitted forest, and therefore the report
# below, reproducible under Restart Kernel -> Run All.
clf = RandomForestClassifier(random_state=42)
clf.fit(x_train, y_train)

# Predict and evaluate on the held-out test split
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3263
           1       1.00      0.99      1.00       157
           2       1.00      1.00      1.00      3071

    accuracy                           1.00      6491
   macro avg       1.00      1.00      1.00      6491
weighted avg       1.00      1.00      1.00      6491



In [4]:
# Score the classifier fitted in the previous cell on a separately stored
# dataset, loaded with the same bucket-flattening helper as the training data.
x_val, y_val = load_data('validation_data/processed_data.pkl')

# NOTE: y_pred is reused here, replacing the test-split predictions.
y_pred = clf.predict(x_val)
print(classification_report(y_true=y_val, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       157
           1       0.99      0.99      0.99       157
           2       0.99      1.00      0.99       154

    accuracy                           0.99       468
   macro avg       0.99      0.99      0.99       468
weighted avg       0.99      0.99      0.99       468

