In [1]:
!pip install imbalanced-learn

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Load the data
file_path = "./data/train.csv"
data = pd.read_csv(file_path)

# Separate features and target
train_data = data.drop(columns=["target"])
train_label = data["target"]

# Remove columns with missing values and columns with only one unique value
train_data = train_data.dropna(axis=1)
train_data = train_data.loc[:, train_data.nunique() > 1]

# Identify categorical and numerical columns
categorical_cols = train_data.select_dtypes(include=['object']).columns
numerical_cols = train_data.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                                ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))]), categorical_cols)
    ])

# Apply preprocessing
train_data_preprocessed = preprocessor.fit_transform(train_data)

# Split the data into training and validation sets
df_train, df_val, train_y, val_y = train_test_split(
    train_data_preprocessed, train_label, test_size=0.1, stratify=train_label, random_state=110
)

# Apply TruncatedSVD to keep 99% variance
svd = TruncatedSVD(n_components=200, random_state=110)  # Use n_components as a starting point
df_train_svd = svd.fit_transform(df_train)
df_val_svd = svd.transform(df_val)

# Balance the dataset using SMOTE
smote = SMOTE(random_state=110)
df_train_resampled, train_y_resampled = smote.fit_resample(df_train_svd, train_y)

# Initialize and train the model
model = RandomForestClassifier(n_estimators=200, random_state=110, class_weight='balanced')
model.fit(df_train_resampled, train_y_resampled)

# Validate the model
val_pred = model.predict(df_val_svd)

# Evaluate the model
accuracy = accuracy_score(val_y, val_pred)
f1 = f1_score(val_y, val_pred, pos_label='AbNormal')
precision = precision_score(val_y, val_pred, pos_label='AbNormal')
recall = recall_score(val_y, val_pred, pos_label='AbNormal')

# Output results
print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation F1 Score: {f1:.4f}")
print(f"Validation Precision: {precision:.4f}")
print(f"Validation Recall: {recall:.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(val_y, val_pred))

print("\nClassification Report:")
print(classification_report(val_y, val_pred))




Validation Accuracy: 0.9129
Validation F1 Score: 0.1885
Validation Precision: 0.2050
Validation Recall: 0.1745

Confusion Matrix:
[[  41  194]
 [ 159 3657]]

Classification Report:
              precision    recall  f1-score   support

    AbNormal       0.20      0.17      0.19       235
      Normal       0.95      0.96      0.95      3816

    accuracy                           0.91      4051
   macro avg       0.58      0.57      0.57      4051
weighted avg       0.91      0.91      0.91      4051



In [5]:
file_path = "./data/test.csv"
test_data = pd.read_csv(file_path)

# Preprocess the test data similarly to the training data
test_data_preprocessed = preprocessor.transform(test_data.drop(columns=['target','Set ID']))

# Apply TruncatedSVD (or PCA if needed) to the test data
test_data_svd = svd.transform(test_data_preprocessed)

# Predict the target for the test data
test_predictions = model.predict(test_data_svd)
test_predictions

array(['Normal', 'Normal', 'Normal', ..., 'Normal', 'AbNormal', 'Normal'],
      dtype=object)

In [6]:
# Update the submission dataframe with the predictions
submission_data = pd.read_csv("./submission.csv")
submission_data['target'] = test_predictions

# Save the updated submission file
submission_data.to_csv("./submission.csv", index=False)