In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import joblib

df = pd.read_csv("student_exam_data.csv")
print("Dataset loaded successfully!")
print(df.head())

TARGET = "Study Hours"     

if TARGET not in df.columns:
    raise ValueError(f"Target column '{TARGET}' not found. Please check dataset columns.")


X = df.drop(columns=[TARGET])
y = df[TARGET]


numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

print("Numeric columns:", numeric_features)
print("Categorical columns:", categorical_features)


numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)


model = RandomForestRegressor(
    n_estimators=300,
    max_depth=10,
    random_state=42
)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

pipeline.fit(X_train, y_train)
print("Model training completed!")

preds = pipeline.predict(X_test)

mae = mean_absolute_error(y_test, preds)
r2 = r2_score(y_test, preds)

print(f"MAE: {mae:.3f}")
print(f"R² Score: {r2:.3f}")

import warnings
from pandas.api.types import is_numeric_dtype

preproc = pipeline.named_steps["preprocessor"]
rf = pipeline.named_steps["model"]


try:
    numeric_features = list(preproc.transformers_[0][2])  
except Exception:
 
    numeric_features = [c for c in X.columns if is_numeric_dtype(X[c])]

cat_feature_names = []
try:

    cat_transformer = preproc.named_transformers_.get("cat")
    if cat_transformer is not None:

        try:
            cat_feature_names = cat_transformer.get_feature_names_out(categorical_features)
        except Exception:
            
            try:
                cat_feature_names = cat_transformer.get_feature_names(categorical_features)
            except Exception:
              
                cat_feature_names = []
    else:
        cat_feature_names = []
except Exception:
    cat_feature_names = []
    

if isinstance(cat_feature_names, np.ndarray):
    cat_feature_names = cat_feature_names.tolist()

all_features = list(numeric_features) + list(cat_feature_names)

importances = None
if hasattr(rf, "feature_importances_"):
    importances = rf.feature_importances_
else:
    warnings.warn("Model does not expose feature_importances_. Skipping importance extraction.")

if importances is None:
    print("No importances to show.")
else:
    if len(importances) != len(all_features):

        sample = X_train.iloc[:5].copy()
        transformed = preproc.transform(sample)
        transformed_dim = transformed.shape[1]
        if transformed_dim == len(importances):
     
            if len(all_features) != transformed_dim:
                all_features = [f"f{i}" for i in range(transformed_dim)]
        else:
  
            warnings.warn(
                f"Feature count mismatch: {len(all_features)} names vs {len(importances)} importances.\n"
                "Will display first min(len) pairs."
            )
    
    min_len = min(len(all_features), len(importances))
    feat_df = pd.DataFrame({
        "feature": all_features[:min_len],
        "importance": importances[:min_len]
    }).sort_values("importance", ascending=False)
    
    print("\nTop important features (showing up to first 50):")
    print(feat_df.head(50))

def study_plan(score):
    """
    Generates a simple study recommendation based on predicted score.
    Adjust thresholds to match your dataset.
    """

    if score < 40:
        return (
            "Your performance is low.\n"
            "- Study 3–4 hours per day\n"
            "- Focus first on weak subjects\n"
            "- Solve at least 30 practice questions daily\n"
            "- Revise weekly\n"
            "- Attend doubt-clearance sessions"
        )

    elif 40 <= score < 70:
        return (
            "Moderate performance.\n"
            "- Study 2 hours per day\n"
            "- Strengthen weak areas\n"
            "- Practice previous year papers\n"
            "- Revise every 3 days"
        )

    else:
        return (
            "Strong performance!\n"
            "- Maintain 1 hour revision daily\n"
            "- Solve mock tests weekly\n"
            "- Focus on accuracy and speed\n"
            "- Learn advanced topics"
        )
        
print("\n### StudyBuddy – Enter Your Details ###\n")

user_data = {}

for col in X.columns:
    value = input(f"Enter {col}: ")

    if isinstance(value, str):
        if value.strip().lower() == "pass":
            value = 1
        elif value.strip().lower() == "fail":
            value = 0

    try:
        if value.strip() == "":
            value = None
        else:
            value = float(value)
    except:
        pass  

    user_data[col] = value

user_df = pd.DataFrame([user_data])

user_df = user_df[X.columns]

try:
    user_pred = pipeline.predict(user_df)[0]
    print("\nPredicted Score:", user_pred)
    print("\nStudyBuddy Recommendation:\n", study_plan(user_pred))

except Exception as e:
    print("\nError during user prediction:")
    print(e)


Dataset loaded successfully!
   Study Hours  Previous Exam Score  Pass/Fail
0     4.370861            81.889703          0
1     9.556429            72.165782          1
2     7.587945            58.571657          0
3     6.387926            88.827701          1
4     2.404168            81.083870          0
Numeric columns: ['Previous Exam Score', 'Pass/Fail']
Categorical columns: []
Model training completed!
MAE: 1.787
R² Score: 0.185

Top important features (showing up to first 50):
               feature  importance
0  Previous Exam Score    0.611119
1            Pass/Fail    0.388881

### StudyBuddy – Enter Your Details ###



Enter Previous Exam Score:  75
Enter Pass/Fail:  Pass



Predicted Score: 6.457623684207761

StudyBuddy Recommendation:
 Your performance is low.
- Study 3–4 hours per day
- Focus first on weak subjects
- Solve at least 30 practice questions daily
- Revise weekly
- Attend doubt-clearance sessions
