Load the dataset

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

# Load labeled dataset
file_path = 'labeled_student_data_s.csv'  # Update with correct path
df = pd.read_csv(file_path)

# Remove 'Exploring' category
df = df[df['Interest Label'] != 'Exploring']

# Define feature columns (marks in subjects)
subject_cols = ['Operating System', 'DSA', 'Frontend', 'Backend', 'Machine Learning', 'Data Analytics']

# Define project columns
project_cols = ['Project 1', 'Project 2', 'Project 3', 'Project 4']

# Encode project domains using One-Hot Encoding
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
project_encoded = encoder.fit_transform(df[project_cols])

# Convert encoded projects into a DataFrame
project_feature_names = encoder.get_feature_names_out(project_cols)
df_projects = pd.DataFrame(project_encoded, columns=project_feature_names)

# Give more weight to projects
df_projects_weighted = df_projects * 1.3  # Adjust weight as needed

# Combine subject marks and weighted project features
X = pd.concat([df[subject_cols].reset_index(drop=True), df_projects_weighted.reset_index(drop=True)], axis=1)


# Define target column (Interest Label)
y = df['Interest Label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
accuracy = model.score(X_test, y_test)
print(f"Random Forest Model Accuracy: {accuracy:.2f}")


Random Forest Model Accuracy: 0.87


In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load labeled dataset
file_path = 'labeled_student_data_s.csv'  # Update with correct path
df = pd.read_csv(file_path)

# Remove 'Exploring' category
df = df[df['Interest Label'] != 'Exploring']

# Define feature columns (marks in subjects)
subject_cols = ['Operating System', 'DSA', 'Frontend', 'Backend', 'Machine Learning', 'Data Analytics']

# Define project columns
project_cols = ['Project 1', 'Project 2', 'Project 3', 'Project 4']

# Mapping of project domains to numeric values (1, 2, 3, 4, etc.)
domain_mapping = {
    'AI': 1,
    'Web Development': 2,
    'Machine Learning': 3,
    'Cybersecurity': 4,
    'Data Science': 5,
    'Robotics': 6,
    'Game Development': 7
}

# Encode project domains using the mapping (replace the domain names with numbers)
for col in project_cols:
    df[col] = df[col].map(domain_mapping)

# Combine subject marks and project features (now numeric)
X = pd.concat([df[subject_cols].reset_index(drop=True), df[project_cols].reset_index(drop=True)], axis=1)

# Define target column (Interest Label)
y = df['Interest Label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
accuracy = model.score(X_test, y_test)
print(f"Random Forest Model Accuracy: {accuracy:.2f}")


Random Forest Model Accuracy: 0.88


In [40]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Assuming you already have a trained Random Forest model `model`

# Mapping of project domains to numeric values (1, 2, 3, 4, etc.)
domain_mapping = {
    'AI': 1,
    'Web Development': 2,
    'Machine Learning': 3,
    'Cybersecurity': 4,
    'Data Science': 5,
    'Robotics': 6,
    'Game Development': 7
}

# Example: One row of new data (same encoding format as above)
new_data_rf = {
    'Operating System': [90],
    'DSA': [85],
    'Frontend': [80],
    'Backend': [95],
    'Machine Learning': [68],
    'Data Analytics': [92],
    'Project 1': [4],  # Numeric values for projects (as per the domain_mapping)
    'Project 2': [2],
    'Project 3': [2],
    'Project 4': [4]
}

# Convert to DataFrame
new_data_rf_df = pd.DataFrame(new_data_rf)

# Ensure the new data has the same columns as the training data
# Create a list of the exact columns used during training (same order)
training_columns = [
    'Operating System', 'DSA', 'Frontend', 'Backend', 
    'Machine Learning', 'Data Analytics', 
    'Project 1', 'Project 2', 'Project 3', 'Project 4'
]

# Reindex new data to match training columns
new_data_rf_df = new_data_rf_df[training_columns]

# Predict using the trained Random Forest model
predicted_class_rf = model.predict(new_data_rf_df)

# Mapping of predicted integer index back to domain
inverse_domain_mapping = {v: k for k, v in domain_mapping.items()}

# Output the predicted label
predicted_label_rf = inverse_domain_mapping.get(predicted_class_rf[0], "Unknown")
print(f"Predicted Interest: {predicted_label_rf}")


Predicted Interest: Robotics


In [21]:
df.head()

Unnamed: 0,Student ID,Operating System,DSA,Frontend,Backend,Machine Learning,Data Analytics,Project 1,Project 2,Project 3,Project 4,Interest Label
0,1,7,67,46,65,22,82,7,4,6,6,Game Development
1,2,39,76,61,76,66,46,7,3,4,2,Machine Learning
2,3,35,28,80,64,89,36,1,7,1,6,AI
3,4,47,61,40,90,26,43,2,5,3,6,Web Development
4,5,43,87,16,48,52,29,5,6,6,4,Data Science


In [35]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Encode categorical target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Train XGBoost model
model = XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=10, random_state=42)
model.fit(X_train, y_train_encoded)

# Predictions
y_pred = model.predict(X_test)

# Convert predictions back to original labels
y_pred_labels = label_encoder.inverse_transform(y_pred)

# Model Accuracy
accuracy = accuracy_score(y_test_encoded, y_pred)
print(f"XGBoost Model Accuracy: {accuracy:.2f}")

XGBoost Model Accuracy: 0.95


In [44]:
domain_mapping = {
    'AI': 1,
    'Web Development': 2,
    'Machine Learning': 3,
    'Cybersecurity': 4,
    'Data Science': 5,
    'Robotics': 6,
    'Game Development': 7
}
# Example: One row of new data (similar to your training data structure)
new_data = {
    'Operating System': [95],
    'DSA': [55],
    'Frontend': [80],
    'Backend': [95],
    'Machine Learning': [68],
    'Data Analytics': [92],
    'Project 1': [7],  # Numeric values for projects (as per the domain_mapping)
    'Project 2': [1],
    'Project 3': [6],
    'Project 4': [4]
}

# Convert to DataFrame
import pandas as pd
new_data_df = pd.DataFrame(new_data)

# Predict using the trained XGBoost model
predicted_class = model.predict(new_data_df)

# Decode the predicted label (if using Label Encoding)
predicted_label = label_encoder.inverse_transform(predicted_class)
print(f"Predicted Interest: {predicted_label[0]}")

Predicted Interest: Robotics


In [None]:
print("Encoded Project Features:")
print(df_projects.head())  # Check first few rows of encoded projects

print("\nColumn Names after One-Hot Encoding:")
print(df_projects.columns.tolist())  # List all new one-hot encoded columns

Encoded Project Features:
   Project 1_AI  Project 1_Cybersecurity  Project 1_Data Science  \
0           0.0                      0.0                     0.0   
1           0.0                      0.0                     0.0   
2           1.0                      0.0                     0.0   
3           0.0                      0.0                     0.0   
4           0.0                      0.0                     1.0   

   Project 1_Game Development  Project 1_Machine Learning  Project 1_Robotics  \
0                         1.0                         0.0                 0.0   
1                         1.0                         0.0                 0.0   
2                         0.0                         0.0                 0.0   
3                         0.0                         0.0                 0.0   
4                         0.0                         0.0                 0.0   

   Project 1_Web Development  Project 2_AI  Project 2_Cybersecurity  \
0      

In [24]:
# Create a dictionary mapping each domain to its one-hot encoded column index
domain_mapping = {col: idx for idx, col in enumerate(df_projects.columns)}
print("Domain to Code Mapping:")
print(domain_mapping)


Domain to Code Mapping:
{'Project 1_AI': 0, 'Project 1_Cybersecurity': 1, 'Project 1_Data Science': 2, 'Project 1_Game Development': 3, 'Project 1_Machine Learning': 4, 'Project 1_Robotics': 5, 'Project 1_Web Development': 6, 'Project 2_AI': 7, 'Project 2_Cybersecurity': 8, 'Project 2_Data Science': 9, 'Project 2_Game Development': 10, 'Project 2_Machine Learning': 11, 'Project 2_Robotics': 12, 'Project 2_Web Development': 13, 'Project 3_AI': 14, 'Project 3_Cybersecurity': 15, 'Project 3_Data Science': 16, 'Project 3_Game Development': 17, 'Project 3_Machine Learning': 18, 'Project 3_Robotics': 19, 'Project 3_Web Development': 20, 'Project 4_AI': 21, 'Project 4_Cybersecurity': 22, 'Project 4_Data Science': 23, 'Project 4_Game Development': 24, 'Project 4_Machine Learning': 25, 'Project 4_Robotics': 26, 'Project 4_Web Development': 27}
