## Supervised (Random Forest)




In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Load the raw catalogue and work on a copy so `stars_data` stays untouched.
file_path = 'data/SDSS_DR18.csv'
stars_data = pd.read_csv(file_path)

preprocessed_data = stars_data.copy()

# Encode the target labels as integer ids. The fitted encoders are kept so
# the integer ids can later be mapped back to the original labels.
label_encoders = {}
for column in ['class']:
    le = LabelEncoder()
    preprocessed_data[column] = le.fit_transform(preprocessed_data[column])
    label_encoders[column] = le

# Columns that get standardised (zero mean / unit variance).
numerical_features = [
    "ra", "dec", "u", "g", "r", "i", "z",
    "run", "rerun", "camcol", "field", "plate", "mjd", "fiberid",
    "petroRad_u", "petroRad_g", "petroRad_i", "petroRad_r", "petroRad_z",
    "petroFlux_u", "petroFlux_g", "petroFlux_i", "petroFlux_r", "petroFlux_z",
    "petroR50_u", "petroR50_g", "petroR50_i", "petroR50_r", "petroR50_z",
    "psfMag_u", "psfMag_r", "psfMag_g", "psfMag_i", "psfMag_z",
    "expAB_u", "expAB_g", "expAB_r", "expAB_i", "expAB_z",
    "redshift",
]

# Features are every remaining column once the target and the objid /
# specobjid columns are dropped.
target_column = 'class'
X = preprocessed_data.drop(columns=[target_column, "objid", "specobjid"])
y = preprocessed_data[target_column].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split ONLY and reuse its statistics for the
# test split. Fitting on the full dataset before splitting would leak the
# test set's mean/variance into the training features (and into the exported
# CSV files). Explicit copies keep `X` unscaled and avoid chained-assignment
# warnings on the split frames.
scaler = StandardScaler()
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Export both splits (features first, target as the last column) so that
# code outside this notebook can read exactly the same data.
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

train_df.to_csv('train_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)

# 'balanced' class weights computed from the training labels, keyed by the
# encoded class id.
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight_dict = dict(zip(classes, class_weights))


In [3]:
# Class-weighted random forest, trained on the training split.
rf_classifier_weighted = RandomForestClassifier(n_estimators=100, random_state=42, class_weight=class_weight_dict)
rf_classifier_weighted.fit(X_train, y_train)

# 5-fold cross-validation on the training split. cross_val_score fits clones
# of the estimator, so the model fitted above is left untouched.
rf_cv_scores_weighted = cross_val_score(rf_classifier_weighted, X_train, y_train, cv=5, scoring='accuracy')
accuracy_rf = np.mean(rf_cv_scores_weighted)

# Hold-out evaluation on the test split.
y_pred = rf_classifier_weighted.predict(X_test)

# Not used further in this cell; kept because it is a notebook-level name.
unique_classes = np.unique(y_test)
accuracy = accuracy_score(y_test, y_pred)

# Macro averaging weights every class equally, regardless of its support.
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f_score = f1_score(y_test, y_pred, average='macro')

# Report the cross-validated score as well: it costs five extra forest fits
# and was previously computed but never shown.
print("Cross-validation accuracy (5-fold mean):", accuracy_rf)
print("Accuracy:", accuracy)
print("Precision (macro avg):", precision)
print("Recall (macro avg):", recall)
print("F1 Score (macro avg):", f_score)


Random Forest Accuracy: 0.9903000000000001
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     10373
           1       0.99      0.95      0.97      2115
           2       0.99      1.00      1.00      7512

    accuracy                           0.99     20000
   macro avg       0.99      0.98      0.99     20000
weighted avg       0.99      0.99      0.99     20000



## Setting up the C++ wrapper for Random Forest

In [2]:
import random_forest_wrapper

# Copy the class weights into the wrapper's own map type so they can be
# handed to the C++ implementation.
# NOTE(review): IntDoubleMap is presumably a binding of an int -> double
# C++ map; confirm against the wrapper's interface definition.
class_weight_map = random_forest_wrapper.IntDoubleMap()
for label, weight in class_weight_dict.items():
    # Keys are converted to plain Python ints before insertion.
    class_weight_map[int(label)] = weight

## Single Thread Execution

In [4]:
# Run the wrapper's single-threaded entry point on the exported CSV splits.
# NOTE(review): the third argument (100) presumably is the number of trees,
# mirroring n_estimators=100 in the scikit-learn model; confirm against the
# wrapper's signature.
random_forest_wrapper.execute_single_thread(
    "train_data.csv",
    "test_data.csv",
    100,
    class_weight_map,
)

Accuracy: 0.8441
Precision: 0.402678
Recall: 0.981087
F1-score: 0.570996


## Parallel Execution

In [6]:
# Run the wrapper's parallel entry point with the same inputs as the
# single-threaded run.
# NOTE(review): the third argument (100) presumably is the number of trees,
# mirroring n_estimators=100 in the scikit-learn model; confirm against the
# wrapper's signature.
random_forest_wrapper.execute_parallel(
    "train_data.csv",
    "test_data.csv",
    100,
    class_weight_map,
)


Accuracy: 0.7777
Precision: 0.32116
Recall: 0.989598
F1-score: 0.484939
