In [45]:
# Importing libraries
import pandas as pd
import joblib

In [46]:
# Machine-learning libraries (scikit-learn)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score , mean_squared_error
from sklearn.ensemble import RandomForestClassifier , RandomForestRegressor

In [47]:
# Load the cleaned dataset.
# The last column is treated as the prediction target; every column before
# it is treated as an input feature.
df = pd.read_csv("SP_cleaned.csv")

FEATURES, TARGET = df.columns[:-1], df.columns[-1]
X, y = df[FEATURES], df[TARGET]

In [48]:
# joblib is already imported in the first cell; the re-import here is a no-op.
import joblib
# Restore the previously saved estimator from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load "final_model.pkl" if it comes from a trusted source.
model = joblib.load("final_model.pkl")

In [49]:
# Feature importance
#
# The following cells read `model.feature_importances_`, so stop here with a
# clear error if the loaded model does not expose that attribute.
# Raising an exception (instead of calling exit()) halts execution at this
# cell without ending the whole interpreter/kernel session, so the loaded
# data and variables stay available for inspection.
if not hasattr(model, "feature_importances_"):
    raise AttributeError("Feature importance not available for this model")

In [50]:
# Pair every feature name with the importance reported by the loaded model,
# then rank the rows from highest to lowest importance.
importance = model.feature_importances_

feature_importance_df = pd.DataFrame(
    {"Feature": FEATURES, "Importance": importance}
)
feature_importance_df = feature_importance_df.sort_values(
    by="Importance", ascending=False
)

print("Feature Importance:", feature_importance_df, sep="\n")

Feature Importance:
                       Feature  Importance
6                reading score    0.927233
5                   math score    0.028547
2  parental level of education    0.012213
1               race/ethnicity    0.011120
0                       gender    0.009246
4      test preparation course    0.008322
3                        lunch    0.003320


In [51]:
# Keep only the features whose importance reaches the cut-off.
THRESHOLD = 0.05

is_important = feature_importance_df["Importance"] >= THRESHOLD
selected_features = feature_importance_df["Feature"][is_important].values

# Fail early with a clear message instead of letting an empty feature set
# reach the split / scale / fit cells below.
if selected_features.size == 0:
    raise ValueError(
        f"No feature has importance >= {THRESHOLD}; "
        "lower THRESHOLD to keep at least one feature"
    )

print("Selected Features:")
print(selected_features)

Selected Features:
['reading score']


In [52]:
# Restrict the feature matrix to the columns that passed the importance cut-off.
selected_columns = list(selected_features)
X_selected = df[selected_columns]

In [53]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=42,
)

In [54]:
# Scale the data
#
# The scaler's statistics are learned from the training rows only and then
# applied to both the training and the held-out rows.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [55]:
# Detect problem type
#
# Heuristic: a numeric target with more than 15 distinct values is treated as
# a regression target; anything else is treated as classification.
# is_numeric_dtype accepts any numeric dtype (e.g. int32 or float32), not only
# the 'float' / 'int64' spellings, so such targets are no longer misrouted to
# classification.
# NOTE(review): the "Regresion" spelling is kept unchanged in case other code
# compares against this exact string.
is_continuous_target = pd.api.types.is_numeric_dtype(y) and len(y.unique()) > 15
problem_type = "Regresion" if is_continuous_target else "Classification"

In [56]:
# Retrain on the selected, scaled features and report a hold-out metric.
# The forests are seeded (same seed as the train/test split) so the fitted
# model and the printed score are reproducible across runs.
if problem_type == "Classification":
    new_model = RandomForestClassifier(random_state=42)
    metric_name, metric_fn = "Accuracy", accuracy_score
else:
    new_model = RandomForestRegressor(random_state=42)
    metric_name, metric_fn = "MSE", mean_squared_error

# Fitting, predicting and scoring are identical for both problem types; only
# the estimator and the metric differ.
new_model.fit(X_train, y_train)
y_pred = new_model.predict(X_test)
score = metric_fn(y_test, y_pred)
print(f"\n {metric_name} after feature selection:", score)


 MSE after feature selection: 0.10673982006463816


In [57]:
# Save the model
#
# Persist the retrained estimator and the fitted scaler side by side.
for artifact, path in (
    (new_model, "improved_model.pkl"),
    (scaler, "improved_scaler.pkl"),
):
    joblib.dump(artifact, path)

print(
    "\n Improved model saved as improved_model.pkl",
    " Day 12 Completed!",
    sep="\n",
)


 Improved model saved as improved_model.pkl
 Day 12 Completed!
