Prepare Dataset for Training

In [26]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Load dataset
df = pd.read_csv("student_data.csv")  # Change this if using another file

# One-Hot Encode Project Domains
project_cols = ["Project 1", "Project 2", "Project 3", "Project 4"]
encoder = OneHotEncoder(sparse_output=False)
project_encoded = encoder.fit_transform(df[project_cols])
project_encoded_df = pd.DataFrame(project_encoded, columns=encoder.get_feature_names_out(project_cols))


# Define thresholds
strong_threshold = 85
weak_threshold = 65

# Encode Strong & Weak Subjects
subjects = ['Operating System', 'DSA', 'Frontend', 'Backend', 'Machine Learning', 'Data Analytics']

# Function to determine strong and weak subjects for a student
def categorize_subjects(row):
    strong = [subject for subject in subjects if row[subject] >= strong_threshold]
    weak = [subject for subject in subjects if row[subject] <= weak_threshold]
    return pd.Series([', '.join(strong), ', '.join(weak)])

# Apply function to dataset
df[['Strong Subjects', 'Weak Subjects']] = df.apply(categorize_subjects, axis=1)

# Initialize binary columns for strong/weak subjects
for subject in subjects:
    df[f"Strong_{subject}"] = df['Strong Subjects'].apply(lambda x: 1 if subject in x else 0)
    df[f"Weak_{subject}"] = df['Weak Subjects'].apply(lambda x: 1 if subject in x else 0)

# Normalize Marks
scaler = MinMaxScaler()
df[subjects] = scaler.fit_transform(df[subjects])

# Combine all features
df_final = pd.concat([df, project_encoded_df], axis=1)

# Drop unnecessary columns
df_final.drop(columns=project_cols + ['Strong Subjects', 'Weak Subjects'], inplace=True)

# Save the processed dataset
df_final.to_csv("processed_student_data.csv", index=False)

print("Feature Engineering Completed ✅")


Feature Engineering Completed ✅


Train a Classification Model

In [27]:
print(df.columns)


Index(['Student ID', 'Operating System', 'DSA', 'Frontend', 'Backend',
       'Machine Learning', 'Data Analytics', 'Attendance (Operating System)',
       'Attendance (DSA)', 'Attendance (Frontend)', 'Attendance (Backend)',
       'Attendance (Machine Learning)', 'Attendance (Data Analytics)',
       'Project 1', 'Project 2', 'Project 3', 'Project 4', 'Strong Subjects',
       'Weak Subjects', 'Strong_Operating System', 'Weak_Operating System',
       'Strong_DSA', 'Weak_DSA', 'Strong_Frontend', 'Weak_Frontend',
       'Strong_Backend', 'Weak_Backend', 'Strong_Machine Learning',
       'Weak_Machine Learning', 'Strong_Data Analytics',
       'Weak_Data Analytics'],
      dtype='object')


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load processed dataset
df = pd.read_csv("processed_student_data.csv")

# Drop Student ID (irrelevant for learning)
X = df.drop(columns=["Student ID"], errors="ignore")

# Select project columns
project_cols = [col for col in df.columns if col.startswith("Project")]

# Generate 'Most Common Project Domain' dynamically
df["Most Common Project Domain"] = df[project_cols].idxmax(axis=1).str.replace("Project \d+_", "", regex=True)

# Encode categorical target variable
label_encoder = LabelEncoder()
df["Most Common Project Domain"] = label_encoder.fit_transform(df["Most Common Project Domain"])

# Create new feature: Count of projects per domain
for domain in ["AI", "Cybersecurity", "Data Science", "Game Development", "Machine Learning", "Robotics", "Web Development"]:
    df[f"Project_Count_{domain}"] = df.filter(like=domain).sum(axis=1)

# Use these as input features
X = df.drop(columns=project_cols, errors="ignore")

# Define Target Variable
y = df["Most Common Project Domain"]

# Split into Train and Test Set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train a Random Forest Model
model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
model.fit(X_train, y_train)

# Predict on Test Set
y_pred = model.predict(X_test)

# Model Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}% ✅")

# Decode Predictions (Optional)
decoded_labels = label_encoder.inverse_transform(y_pred)
print("Predicted Categories:", decoded_labels[:10])


Model Accuracy: 99.00% ✅
Predicted Categories: ['Game Development' 'Robotics' 'Web Development' 'Robotics' 'Data Science'
 'Data Science' 'Robotics' 'Web Development' 'Data Science'
 'Cybersecurity']


In [29]:
from sklearn.model_selection import cross_val_score

# Perform 5-Fold Cross Validation
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean() * 100:.2f}%")


Cross-Validation Accuracy Scores: [0.98  1.    0.99  0.995 0.97 ]
Mean Accuracy: 98.70%


Ensuring Robustness

In [30]:
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': model.feature_importances_})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
print(feature_importances.head(10))


                           Feature  Importance
25      Most Common Project Domain    0.365477
29  Project_Count_Game Development    0.036429
28      Project_Count_Data Science    0.034796
27     Project_Count_Cybersecurity    0.034129
31          Project_Count_Robotics    0.032874
32   Project_Count_Web Development    0.032721
26                Project_Count_AI    0.031723
0                       Student ID    0.029398
3                         Frontend    0.029354
30  Project_Count_Machine Learning    0.029079


In [31]:
X = X.drop(columns=['Student ID'])
model.fit(X_train, y_train)

In [32]:
# Predict on Test Set
y_pred = model.predict(X_test)

# Evaluate Model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Model Accuracy: 99.00%


In [33]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(model, X_train, y_train, cv=5)
print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean() * 100:.2f}%")


Cross-Validation Accuracy Scores: [1.      0.99375 0.9875  0.9875  0.99375]
Mean Accuracy: 99.25%


In [34]:
import pandas as pd

# Get Feature Importance
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': model.feature_importances_})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
print(feature_importances.head(10))  # Show top 10 important features


ValueError: All arrays must be of the same length