In [None]:
import warnings
# Suppress every warning category for the rest of the session. Note that this
# also hides deprecation and library warnings emitted by later cells.
warnings.filterwarnings('ignore')

In [None]:
# Install seaborn at runtime through micropip. The top-level `await` requires a
# kernel that supports it (presumably a Pyodide/JupyterLite kernel — confirm).
import micropip
await micropip.install('seaborn')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Load the training data from the notebook-relative dataset folder.
train = pd.read_csv(filepath_or_buffer='dataset/train.csv')

In [None]:
# Remove fully duplicated rows; surviving rows keep their original index labels.
train = train.drop_duplicates()

In [None]:
# Number of distinct values in each column.
train.nunique()

In [None]:
# Count of missing values in each column.
train.isna().sum()

In [None]:
# Print the frame summary: column dtypes and non-null counts.
train.info()

In [None]:
# Names of the int64/float64 columns; reused by the plotting, imputation,
# outlier-count and scaling cells below.
num_col = list(train.select_dtypes(include=['int64', 'float64']).columns)

In [None]:
# Distribution of every numeric column: one histogram (with KDE overlay) per figure.
for col in num_col:
    plt.figure(figsize=(12, 6))
    sns.histplot(data=train, x=col, kde=True)
    plt.xlabel(col)
    # histplot draws per-bin counts by default, so the y-axis is a count and
    # not the target variable (it was previously mislabelled 'Class').
    plt.ylabel('Count')
    plt.title(f'Distribution of {col}')
    plt.show()

In [None]:
# Bar chart of the number of rows per value of the 'Class' column.
sns.countplot(data=train, x='Class')

In [None]:
# Impute missing values: the column median for every numeric column, and the
# most frequent value for the 'Class' column.
for col in num_col:
    median_val = train[col].median()
    train = train.fillna({col: median_val})
    print(f"Filled missing in {col} with median: {median_val}")

# Use an explicit name for the target column. The log line previously reused the
# leaked loop variable `col`, so it reported the last numeric column instead
# of 'Class'.
target_col = 'Class'
mode_val = train[target_col].mode()[0]
train = train.fillna({target_col: mode_val})
print(f"Filled missing in {target_col} with mode: {mode_val}")

In [None]:
# Count of missing values in each column (re-checked at this point of the notebook).
train.isna().sum()

In [None]:
# One horizontal box plot per numeric column to eyeball outliers.
for col in num_col:
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=train, x=col)
    plt.xlabel(col)
    # Only `x` is plotted, so there is no y variable; the axis was previously
    # mislabelled 'Class'. Leave it blank and describe the figure in the title.
    plt.ylabel('')
    plt.title(f'Box plot of {col}')
    plt.show()

In [None]:
# For each numeric column, count the rows that fall outside the
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] fences.
outlier_count = {}

for col in num_col:
    values = train[col]
    Q1, Q3 = values.quantile(0.25), values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    below_fence = values < lower_bound
    above_fence = values > upper_bound
    # int(...) keeps the stored count a plain Python int, like DataFrame.shape[0].
    outlier_count[col] = int((below_fence | above_fence).sum())

outlier_count

In [None]:
# Current (rows, columns) of the training frame.
train.shape

In [None]:
# for col in num_col:
#     Q1 = train[col].quantile(0.25)
#     Q3 = train[col].quantile(0.75)
#     IQR = Q3 - Q1
#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR
#     train[col] = train[col].clip(lower=lower_bound, upper=upper_bound)

In [None]:
from sklearn.preprocessing import RobustScaler

# Fit a RobustScaler on the numeric columns and overwrite them with the scaled
# values. `scaler` is kept so the same transformation can be reused later.
# NOTE(review): the scaler is fitted on all of `train`, i.e. before the
# train/validation split further down — confirm this is acceptable.
scaler = RobustScaler()
scaler.fit(train[num_col])
train[num_col] = scaler.transform(train[num_col])

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Pairwise relationships between the columns of `train`, coloured by 'Class'.
sns.pairplot(data=train, hue='Class')
plt.show()

In [None]:
# Encode the 'Class' labels as integers. `le` is kept because later cells use
# it to inspect the classes and to encode the test labels.
le = LabelEncoder().fit(train['Class'])
train['Class'] = le.transform(train['Class'])

In [None]:
# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
corr = train.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(14, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap", fontsize=14)
plt.show()

In [None]:
# Separate the target column from the feature matrix.
y = train['Class']
X = train.drop(columns=['Class'])

In [None]:
# Hold out 20% of the rows for validation, stratified on the target so both
# parts keep the same class proportions; the seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint


def _report_metrics(split_name, y_true, y_pred):
    """Print accuracy, weighted F1 and the confusion matrix for one data split.

    Parameters
    ----------
    split_name : str
        Prefix used in every printed line (e.g. "Training", "Validation").
    y_true, y_pred : array-like
        True and predicted class labels for the split.

    Returns
    -------
    tuple
        ``(accuracy, weighted_f1, confusion_matrix)`` as computed by
        ``accuracy_score``, ``f1_score(average='weighted')`` and
        ``confusion_matrix``.
    """
    accuracy = accuracy_score(y_true, y_pred)
    print(f"{split_name} Accuracy: {accuracy:.4f}")
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    print(f"{split_name} F1 Score: {weighted_f1:.4f}")
    conf_matrix = confusion_matrix(y_true, y_pred)
    print(f"{split_name} Confusion Matrix:")
    print(conf_matrix)
    return accuracy, weighted_f1, conf_matrix


rf = RandomForestClassifier(random_state=42)

# Search space: integer ranges are sampled by scipy's randint, lists are
# sampled uniformly from their entries.
param_dist = {
    "n_estimators": randint(50, 300),
    "max_depth": randint(5, 50),
    "min_samples_split": randint(2, 20),
    "min_samples_leaf": randint(1, 20),
    "max_features": ["sqrt", "log2"],
    "class_weight": ['balanced']
}

# 100 sampled configurations, each evaluated with 5-fold cross-validation and
# ranked by accuracy.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

print("Best Hyperparameters:", random_search.best_params_)
# best_score_ is the mean cross-validated value of `scoring`, i.e. accuracy.
# The label previously claimed it was a weighted F1 score.
print("Best CV Accuracy:", random_search.best_score_)

best_model = random_search.best_estimator_

# Metrics on the data the model was fitted on.
y_pred_train = best_model.predict(X_train)
train_accuracy, train_f1score, train_conf_matrix = _report_metrics(
    "Training", y_train, y_pred_train
)

# Metrics on the held-out validation split.
y_val_pred = best_model.predict(X_val)
val_accuracy, val_f1, val_conf_matrix = _report_metrics(
    "Validation", y_val, y_val_pred
)

In [None]:
# Take the tuned estimator from the search and fit it on the training split.
# NOTE(review): RandomizedSearchCV refits the best estimator by default, so this
# looks like a repeat of that fit on the same data — confirm whether fitting on
# the full X / y was intended instead.
rf_final = random_search.best_estimator_.fit(X_train, y_train)
rf_final

In [None]:
# Load the test features and the file holding their true labels.
test = pd.read_csv(filepath_or_buffer='dataset/test.csv')
sol = pd.read_csv(filepath_or_buffer='dataset/sol.csv')

In [None]:
# Print the test frame summary: column dtypes and non-null counts.
test.info()

In [None]:
# Move the 'id' column out of the feature frame, keeping it in `test_ids`.
test_ids = test.pop('id')

In [None]:
# Apply the scaler that was fitted on the training features. Calling
# fit_transform here would re-fit it on the test set, so the test features
# would be centred and scaled with different statistics than the ones the
# model was trained on.
# transform() raises if the test columns do not match the columns seen during
# fit, which surfaces any schema mismatch instead of scaling the wrong columns.
# The result is wrapped in a DataFrame so the model receives the same feature
# names it was fitted with.
X_test_scaled = pd.DataFrame(
    scaler.transform(test),
    columns=test.columns,
    index=test.index,
)

In [None]:
# Predicted (integer-encoded) classes for the scaled test features.
pred = rf_final.predict(X_test_scaled)

In [None]:
# True test labels.
# NOTE(review): assumes the rows of `sol` are in the same order as `test` — confirm.
y_test = sol['Class']

In [None]:
# Original class labels known to the encoder; position i corresponds to code i.
le.classes_

In [None]:
# Encode the true test labels with the encoder fitted on the training labels,
# so they are comparable with `pred`.
y_test_encoded = le.transform(y_test)

In [None]:
# Final evaluation on the test set, using the same metrics as the
# training/validation report.
test_accuracy = accuracy_score(y_test_encoded, pred)
print(f"Test Accuracy: {test_accuracy:.4f}")
# Use the weighted average, as for the training and validation F1 scores. The
# default average='binary' is not comparable with those numbers and raises an
# error when the target has more than two classes.
test_f1score = f1_score(y_test_encoded, pred, average='weighted')
print(f"Test F1 Score: {test_f1score:.4f}")
test_conf_matrix = confusion_matrix(y_test_encoded, pred)
print("Test Confusion Matrix:")
print(test_conf_matrix)