Importing Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
import joblib

Load the dataset

In [2]:
# Load the labeled student dataset (path is relative to the working directory)
file_path = 'labeled_student_data_s.csv'
df = pd.read_csv(file_path)

# Remove the 'Exploring' category.
# .copy() detaches the filtered rows from the frame returned by read_csv, so
# the column assignments made later in the notebook (project-domain encoding)
# write into an independent DataFrame instead of a slice and do not trigger
# pandas' SettingWithCopyWarning.
df = df[df['Interest Label'] != 'Exploring'].copy()
df.head()

Unnamed: 0,Student ID,Operating System,DSA,Frontend,Backend,Machine Learning,Data Analytics,Project 1,Project 2,Project 3,Project 4,Interest Label
0,1,7,67,46,65,22,82,Game Development,Cybersecurity,Robotics,Robotics,Game Development
1,2,39,76,61,76,66,46,Game Development,Machine Learning,Cybersecurity,Web Development,Machine Learning
2,3,35,28,80,64,89,36,AI,Game Development,AI,Robotics,AI
3,4,47,61,40,90,26,43,Web Development,Data Science,Machine Learning,Robotics,Web Development
4,5,43,87,16,48,52,29,Data Science,Robotics,Robotics,Cybersecurity,Data Science


Mapping project domains to numeric codes

In [3]:
# Feature columns holding the marks obtained in each subject
subject_cols = ['Operating System', 'DSA', 'Frontend', 'Backend', 'Machine Learning', 'Data Analytics']

# Columns holding the domain of each project
project_cols = ['Project 1', 'Project 2', 'Project 3', 'Project 4']

# Numeric code assigned to each project domain name
domain_mapping = {
    'AI': 1,
    'Web Development': 2,
    'Machine Learning': 3,
    'Cybersecurity': 4,
    'Data Science': 5,
    'Robotics': 6,
    'Game Development': 7
}

# Replace the domain names in every project column with their numeric codes.
# A column that is already numeric has been encoded by a previous run of this
# cell; mapping it again would look the codes up as if they were domain names
# and turn the whole column into NaN, so such columns are skipped. This keeps
# the cell safe to re-run.
# Note: Series.map yields NaN for any value that is not a key of domain_mapping.
for col in project_cols:
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    df[col] = df[col].map(domain_mapping)


Combining the features

In [4]:
# Feature matrix: subject marks followed by the (now numeric) project codes.
# The index is rebuilt as 0..n-1 because filtering out 'Exploring' rows left
# gaps in the original index.
X = df[subject_cols + project_cols].reset_index(drop=True)

# Target column (Interest Label). It receives the same fresh 0..n-1 index as X
# so the two objects stay aligned row-for-row under any index-based operation
# (boolean masks, concat, joins), not only under positional access.
y = df['Interest Label'].reset_index(drop=True)


Splitting the dataset

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

Training and evaluating the XGBoost model

In [6]:
# Turn the string interest labels into integer class ids.
# The encoder learns its classes from the training labels only and is then
# reused, unchanged, for the test labels.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Hyper-parameters of the gradient-boosted classifier
xgb_params = {
    'n_estimators': 400,
    'learning_rate': 0.06,
    'max_depth': 10,
    'random_state': 42,
}

# Fit the classifier on the training split
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train_encoded)

# Predicted (encoded) class ids for the held-out rows
y_pred = model.predict(X_test)

# Same predictions expressed as the original label strings
y_pred_labels = label_encoder.inverse_transform(y_pred)

# Share of held-out rows whose predicted class matches the true class
accuracy = accuracy_score(y_test_encoded, y_pred)
print(f"XGBoost Model Accuracy: {accuracy:.2f}")

XGBoost Model Accuracy: 0.96


In [7]:
# Evaluate on the held-out split; classes appear as their encoded integer ids.
# In the confusion matrix, rows are true classes and columns are predicted classes.
test_confusion = confusion_matrix(y_test_encoded, y_pred)
test_report = classification_report(y_test_encoded, y_pred)

print(test_confusion)
print(test_report)

[[ 6205    29    28    58     0    43    99]
 [   76  3138    35     0    30    35    67]
 [   40    36  8473   107    84    47    61]
 [   67     0   107  8077   148    33    92]
 [    0    48    45    99 12638    74    99]
 [   22    24    68    52    72  4675    52]
 [   42    31    45    56    89     2 14009]]
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      6462
           1       0.95      0.93      0.94      3381
           2       0.96      0.96      0.96      8848
           3       0.96      0.95      0.95      8524
           4       0.97      0.97      0.97     13003
           5       0.95      0.94      0.95      4965
           6       0.97      0.98      0.97     14274

    accuracy                           0.96     59457
   macro avg       0.96      0.96      0.96     59457
weighted avg       0.96      0.96      0.96     59457



Testing the model on a new sample

In [8]:
# One hand-written student record with the same columns, in the same order,
# as the training features.

# Marks per subject
sample_marks = {
    'Operating System': [95],
    'DSA': [55],
    'Frontend': [80],
    'Backend': [95],
    'Machine Learning': [68],
    'Data Analytics': [92],
}

# Project domains as numeric codes from domain_mapping
# (6 = Robotics, 1 = AI, 4 = Cybersecurity)
sample_projects = {
    'Project 1': [6],
    'Project 2': [1],
    'Project 3': [6],
    'Project 4': [4],
}

# Subject marks first, project codes second
new_data = {**sample_marks, **sample_projects}

# Single-row DataFrame for the model
new_data_df = pd.DataFrame(new_data)

# Encoded class id predicted by the trained XGBoost model
predicted_class = model.predict(new_data_df)

# Translate the class id back into its label string
predicted_label = label_encoder.inverse_transform(predicted_class)
print(f"Predicted Interest: {predicted_label[0]}")

Predicted Interest: Robotics


In [9]:
# Persist the fitted classifier and its label encoder; both are needed to turn
# future predictions back into label strings.
artifacts = {
    "smartinterest_model.pkl": model,
    "label_encoder.pkl": label_encoder,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("Model saved successfully!")

Model saved successfully!
