Import Libraries

In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE


Load Dataset

In [28]:
# Load the dataset (Make sure the CSV file is in the same directory as the Jupyter notebook)
df = pd.read_csv("labeled_student_data_s.csv")
df.head()  # Display the first 5 rows

Unnamed: 0,Student ID,Operating System,DSA,Frontend,Backend,Machine Learning,Data Analytics,Project 1,Project 2,Project 3,Project 4,Interest Label
0,1,52,5,6,6,14,93,Cybersecurity,Cybersecurity,Cybersecurity,Robotics,Robotics
1,2,41,65,80,65,75,34,Web Development,AI,Data Science,Game Development,AI
2,3,53,65,100,98,11,55,Robotics,Game Development,Data Science,Machine Learning,Machine Learning
3,4,62,97,97,9,97,91,AI,AI,Cybersecurity,AI,AI
4,5,49,46,76,8,89,28,Robotics,Cybersecurity,Machine Learning,Robotics,Machine Learning


Define Subject and Project Columns

In [29]:
# Define subject and project columns
subject_cols = ['Operating System', 'DSA', 'Frontend', 'Backend', 'Machine Learning', 'Data Analytics']
project_cols = ['Project 1', 'Project 2', 'Project 3', 'Project 4']

Function to Label Student Interest

In [30]:
import pandas as pd
import numpy as np

# Ensure subject columns are numeric
df[subject_cols] = df[subject_cols].apply(pd.to_numeric, errors='coerce')

# Function to determine the student's interest
def determine_interest(row):
    # Select numeric values only
    top_subjects = row[subject_cols].astype(float).nlargest(3).index.tolist()  # Get top 3 subjects
    projects = row[project_cols].dropna().tolist()  # Get all projects (remove NaNs)
    
    for project in projects:
        if project in top_subjects:
            return project  # Assign domain directly if a project matches a strong subject
    
    return np.random.choice(projects) if projects else "Exploring"  # Handle empty projects

# Apply the function to label the dataset
df['Interest Label'] = df.apply(determine_interest, axis=1)
df.head()

Unnamed: 0,Student ID,Operating System,DSA,Frontend,Backend,Machine Learning,Data Analytics,Project 1,Project 2,Project 3,Project 4,Interest Label
0,1,52,5,6,6,14,93,Cybersecurity,Cybersecurity,Cybersecurity,Robotics,Robotics
1,2,41,65,80,65,75,34,Web Development,AI,Data Science,Game Development,Web Development
2,3,53,65,100,98,11,55,Robotics,Game Development,Data Science,Machine Learning,Machine Learning
3,4,62,97,97,9,97,91,AI,AI,Cybersecurity,AI,Cybersecurity
4,5,49,46,76,8,89,28,Robotics,Cybersecurity,Machine Learning,Robotics,Machine Learning


Encode Labels

In [31]:
# Encode categorical labels (interest domains)
label_encoder = LabelEncoder()
df['Interest Label Encoded'] = label_encoder.fit_transform(df['Interest Label'])
df[['Interest Label', 'Interest Label Encoded']].head()

Unnamed: 0,Interest Label,Interest Label Encoded
0,Robotics,5
1,Web Development,6
2,Machine Learning,4
3,Cybersecurity,1
4,Machine Learning,4


Define Features (X) and Target (y)

In [32]:
# Define input features and target
X = df[subject_cols]
y = df['Interest Label Encoded']

# Train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

Apply SMOTE for Balancing Data

In [33]:
# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check class distribution after SMOTE
pd.Series(y_train_resampled).value_counts()

Interest Label Encoded
3    944
6    944
4    944
5    944
1    944
2    944
0    944
Name: count, dtype: int64

Feature Scaling

In [34]:
# Scale features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

Train XGBoost Model

In [35]:
# Initialize XGBoost model
xgb_model = XGBClassifier(n_estimators=1000, max_depth=6, learning_rate=0.5, random_state=42)

# Train the model
xgb_model.fit(X_train_scaled, y_train_resampled)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test_scaled)

# Evaluate the model
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print("XGBoost Accuracy After SMOTE:", accuracy_xgb)
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb))

XGBoost Accuracy After SMOTE: 0.1875

Classification Report:
               precision    recall  f1-score   support

           0       0.11      0.12      0.11        95
           1       0.14      0.16      0.15        94
           2       0.09      0.09      0.09        91
           3       0.07      0.07      0.07        94
           4       0.40      0.37      0.38       236
           5       0.11      0.12      0.11        90
           6       0.12      0.11      0.11       100

    accuracy                           0.19       800
   macro avg       0.15      0.15      0.15       800
weighted avg       0.19      0.19      0.19       800



Hyperparameter Tuning (GridSearchCV)

In [36]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

# Ensure y_train_resampled has more than 2 classes
num_classes = len(set(y_train_resampled))

xgb = XGBClassifier(
    objective="multi:softprob" if num_classes > 2 else "binary:logistic",  # Automatically choose objective
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss'  # Log loss for multi-class classification
)

grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1  # Use all available processors
)

# Fit the model
grid_search.fit(X_train_scaled, y_train_resampled)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Get the best model and test accuracy
best_xgb = grid_search.best_estimator_
y_pred = best_xgb.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")


Parameters: { "use_label_encoder" } are not used.



Best Parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 300, 'subsample': 0.8}
Best Cross-Validation Score: 0.6493796818758081
Test Accuracy: 0.2125


Train Stacking Model

In [37]:
# Stacking model with RandomForest and XGBoost
stack_model = StackingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('xgb', XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.1, random_state=42))
    ],
    final_estimator=RandomForestClassifier(n_estimators=100, random_state=42)
)

stack_model.fit(X_train_scaled, y_train_resampled)
y_pred_stack = stack_model.predict(X_test_scaled)

# Evaluate Stacking Model
print("Stacking Model Accuracy:", accuracy_score(y_test, y_pred_stack))
print("\nClassification Report:\n", classification_report(y_test, y_pred_stack))

Stacking Model Accuracy: 0.245

Classification Report:
               precision    recall  f1-score   support

           0       0.14      0.12      0.12        95
           1       0.17      0.13      0.15        94
           2       0.04      0.03      0.04        91
           3       0.11      0.09      0.10        94
           4       0.39      0.60      0.47       236
           5       0.15      0.11      0.13        90
           6       0.15      0.11      0.13       100

    accuracy                           0.24       800
   macro avg       0.16      0.17      0.16       800
weighted avg       0.20      0.24      0.22       800



Voting Classifier

In [38]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

rf = RandomForestClassifier(n_estimators=200, random_state=42)
xgb = XGBClassifier(n_estimators=200, learning_rate=0.1, random_state=42)
svc = SVC(kernel='rbf', probability=True)

voting_clf = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb), ('svc', svc)],
    voting='soft'
)

voting_clf.fit(X_train_scaled, y_train_resampled)
print("Voting Classifier Accuracy:", voting_clf.score(X_test_scaled, y_test))

Voting Classifier Accuracy: 0.25
