# load dataset

In [1]:
import pandas as pd
df1 = pd.read_csv("student-scores.csv")
df = df1.copy()
df.head()

Unnamed: 0,id,first_name,last_name,email,gender,extracurricular_activities,riasec,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
0,1,Paul,Casey,paul.casey.1@gslingacademy.com,male,False,27,Bachelor of Statistics,73,81,93,97,63,80,87
1,2,Danielle,Sandoval,danielle.sandoval.2@gslingacademy.com,female,False,40,Bachelor of Supply Chain Management,90,86,96,100,90,88,90
2,3,Tina,Andrews,tina.andrews.3@gslingacademy.com,female,True,30,Bachelor of Corporate Communications,81,97,95,96,65,77,94
3,4,Tara,Clark,tara.clark.4@gslingacademy.com,female,False,40,Bachelor of Human Resouce Management,71,74,88,80,89,63,86
4,5,Anthony,Campos,anthony.campos.5@gslingacademy.com,male,False,25,Bachelor of Development Studies,84,77,65,65,80,74,76


# drop irrelevant columns

In [2]:
df.columns
df.drop(columns=['id','first_name','last_name','email'],axis=1, inplace=True)

# create new features from all score

In [3]:
df["total_score"] = df["math_score"] + df["history_score"] + df["physics_score"] + df["chemistry_score"] + df["biology_score"] + df["english_score"] + df["geography_score"]
df["average_score"] = df["total_score"] / 7
df.head()

Unnamed: 0,gender,extracurricular_activities,riasec,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,total_score,average_score
0,male,False,27,Bachelor of Statistics,73,81,93,97,63,80,87,574,82.0
1,female,False,40,Bachelor of Supply Chain Management,90,86,96,100,90,88,90,640,91.428571
2,female,True,30,Bachelor of Corporate Communications,81,97,95,96,65,77,94,605,86.428571
3,female,False,40,Bachelor of Human Resouce Management,71,74,88,80,89,63,86,551,78.714286
4,male,False,25,Bachelor of Development Studies,84,77,65,65,80,74,76,521,74.428571


In [4]:
df.head()

Unnamed: 0,gender,extracurricular_activities,riasec,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,total_score,average_score
0,male,False,27,Bachelor of Statistics,73,81,93,97,63,80,87,574,82.0
1,female,False,40,Bachelor of Supply Chain Management,90,86,96,100,90,88,90,640,91.428571
2,female,True,30,Bachelor of Corporate Communications,81,97,95,96,65,77,94,605,86.428571
3,female,False,40,Bachelor of Human Resouce Management,71,74,88,80,89,63,86,551,78.714286
4,male,False,25,Bachelor of Development Studies,84,77,65,65,80,74,76,521,74.428571


# Encoding Categorical Columns

In [5]:
# from sklearn.preprocessing import LabelEncoder

# # Create a LabelEncoder object
# label_encoder = LabelEncoder()

# # Encode categorical columns using label encoder
# df['gender'] = label_encoder.fit_transform(df['gender'])
# df['part_time_job'] = label_encoder.fit_transform(df['part_time_job'])
# df['riasec'] = label_encoder.fit_transform(df['riasec'])
# df['career_aspiration'] = label_encoder.fit_transform(df['career_aspiration'])
# Define mapping dictionaries for categorical features
gender_map = {'male': 0, 'female': 1}
extracurricular_activities_map = {False: 0, True: 1}
career_aspiration_map = {
    "Bachelor of Statistics": 0,
    "Bachelor of Supply Chain Management": 1,
    "Bachelor of Corporate Communications": 2,
    "Bachelor of Human Resouce Management": 3,
    "Bachelor of Development Studies": 4,
    "Bachelor of Procurement and Contract Management": 5,
    "Bachelor of Project Management": 6,
    "Bachelor of Business Administration": 7,
    "Bachelor of Journalism": 8,
    "Bachelor of Business and Office Management": 9,
    "Bachelor of Economics and Statistics": 10,
    "Bachelor of Mass Communication": 11,
    "Bachelor of Commerce": 12,
    "Bachelor of Procurement and Logistics": 13,
    "Bachelor of Finance": 14,
    "Bachelor of Business Information Technology": 15,
    "Bachelor of Technology and Entrepreneurship Management": 16,
}
# Apply mapping to the DataFrame
df['gender'] = df['gender'].map(gender_map)
df['extracurricular_activities'] = df['extracurricular_activities'].map(extracurricular_activities_map)
df['career_aspiration'] = df['career_aspiration'].map(career_aspiration_map)

In [6]:
df.head()

Unnamed: 0,gender,extracurricular_activities,riasec,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,total_score,average_score
0,0,0,27,0,73,81,93,97,63,80,87,574,82.0
1,1,0,40,1,90,86,96,100,90,88,90,640,91.428571
2,1,1,30,2,81,97,95,96,65,77,94,605,86.428571
3,1,0,40,3,71,74,88,80,89,63,86,551,78.714286
4,0,0,25,4,84,77,65,65,80,74,76,521,74.428571


# Balance Dataset

In [7]:
df['career_aspiration'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  9, 10, 11, 12, 15, 13, 16,  8, 14],
      dtype=int64)

In [8]:
df['career_aspiration'].value_counts()

career_aspiration
5     315
7     309
4     223
10    169
0     138
12    126
1     119
14     83
8      73
13     68
3      67
16     63
2      61
6      59
15     56
9      39
11     32
Name: count, dtype: int64

In [9]:
from imblearn.over_sampling import SMOTE

# Create SMOTE object
smote = SMOTE(random_state=42)

# Separate features and target variable
X = df.drop('career_aspiration', axis=1)
y = df['career_aspiration']

# Apply SMOTE to the data
X_resampled, y_resampled = smote.fit_resample(X, y)

In [10]:
print("Number of NaNs in y:", y.isna().sum())

Number of NaNs in y: 0


# Train test Split

In [11]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_resampled,y_resampled,test_size=0.2, random_state=42)

In [12]:
X_train.shape

(4284, 12)

# Feature Scalling

In [13]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler to the training data and transform both training and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
X_train_scaled.shape

(4284, 12)

# Models Training (Multiple Models)

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings("ignore")

# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Classifier": SVC(),
    "Random Forest Classifier": RandomForestClassifier(),
    "K Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "XGBoost Classifier": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
}

# Train and evaluate each model
for name, model in models.items():
    print("="*50)
    print("Model:", name)
    # Train the model
    model.fit(X_train_scaled, y_train)
    
    # Predict on test set
    y_pred = model.predict(X_test_scaled)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Print metrics
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_rep)
    print("Confusion Matrix:\n", conf_matrix)


Model: Logistic Regression
Accuracy: 0.34827264239028943
Classification Report:
               precision    recall  f1-score   support

           0       0.30      0.25      0.27        68
           1       0.40      0.71      0.51        72
           2       0.22      0.12      0.16        57
           3       0.24      0.16      0.19        58
           4       0.33      0.09      0.14        66
           5       0.29      0.25      0.27        76
           6       0.40      0.62      0.49        71
           7       0.47      0.46      0.46        61
           8       0.24      0.33      0.28        49
           9       0.33      0.37      0.35        63
          10       0.22      0.11      0.15        64
          11       0.35      0.66      0.46        50
          12       0.41      0.41      0.41        69
          13       0.35      0.56      0.43        55
          14       0.36      0.16      0.22        62
          15       0.12      0.05      0.07        65


# Model Selection (Random Forest)

In [16]:
model = RandomForestClassifier()

model.fit(X_train_scaled, y_train)
# Predict on test set
y_pred = model.predict(X_test_scaled)

# Calculate metrics
print("Accuracy: ",accuracy_score(y_test, y_pred))
print("Report: ",classification_report(y_test, y_pred))
print("Confusion Matrix: ",confusion_matrix(y_test, y_pred))

Accuracy:  0.7861811391223156
Report:                precision    recall  f1-score   support

           0       0.69      0.74      0.71        68
           1       0.82      0.96      0.88        72
           2       0.81      0.84      0.83        57
           3       0.84      0.91      0.88        58
           4       0.73      0.41      0.52        66
           5       0.41      0.21      0.28        76
           6       0.80      0.93      0.86        71
           7       0.94      0.74      0.83        61
           8       0.84      0.94      0.88        49
           9       0.81      0.95      0.88        63
          10       0.64      0.73      0.69        64
          11       0.85      1.00      0.92        50
          12       0.94      0.72      0.82        69
          13       0.68      0.98      0.81        55
          14       0.83      0.77      0.80        62
          15       0.89      0.88      0.88        65
          16       0.79      0.86      0.8

# Single Input Predictions

In [17]:
# test 1
print("Actual Label :", y_test.iloc[10])
print("Model Prediction :",model.predict(X_test_scaled[10].reshape(1,-1))[0])
if y_test.iloc[10]==model.predict(X_test_scaled[10].reshape(1,-1)):
    print("Wow! Model doing well.....")
else:
    print("not sure......")

Actual Label : 12
Model Prediction : 12
Wow! Model doing well.....


In [18]:
# test 2
print("Actual Label :", y_test.iloc[300])
print("Model Prediction :",model.predict(X_test_scaled[300].reshape(1,-1))[0])
if y_test.iloc[10]==model.predict(X_test_scaled[10].reshape(1,-1)):
    print("Wow! Model doing well.....")
else:
    print("not sure......")

Actual Label : 0
Model Prediction : 0
Wow! Model doing well.....


In [19]:
# test 2
print("Actual Label :", y_test.iloc[23])
print("Model Prediction :",model.predict(X_test_scaled[23].reshape(1,-1))[0])
if y_test.iloc[10]==model.predict(X_test_scaled[10].reshape(1,-1)):
    print("Wow! Model doing well.....")
else:
    print("not sure......")

Actual Label : 3
Model Prediction : 3
Wow! Model doing well.....


# Saving & Load Files

In [20]:

import pickle

# Save the scaler and the model
with open("Models/scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)
with open("Models/model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

# Recommendation System

In [21]:
import pickle
import numpy as np

# Load the scaler, label encoder, model, and class names
scaler = pickle.load(open("Models/scaler.pkl", 'rb'))
model = pickle.load(open("Models/model.pkl", 'rb'))

# Verify the model type
print(
    type(model)
)  # Should output something like <class 'sklearn.ensemble._forest.RandomForestClassifier'>

# Check if model supports predict_proba
if hasattr(model, "predict_proba"):
    print("Model supports predict_proba")
else:
    print("Model does NOT support predict_proba")
    
class_names = [
    "Bachelor of Supply Chain Management",
    "Bachelor of Statistics",
    "Bachelor of Corporate Communications",
    "Bachelor of Human Resouce Management",
    "Bachelor of Development Studies",
    "Bachelor of Procurement and Contract Management",
    "Bachelor of Project Management",
    "Bachelor of Business Administration",
    "Bachelor of Journalism",
    "Bachelor of Business and Office Management",
    "Bachelor of Economics and Statistics",
    "Bachelor of Mass Communication",
    "Bachelor of Commerce",
    "Bachelor of Procurement and Logistics",
    "Bachelor of Finance",
    "Bachelor of Business Information Technology",
    "Bachelor of Technology and Entrepreneurship Management",
]

def Recommendations(gender, extracurricular_activities,
                    riasec, math_score, history_score, physics_score,
                    chemistry_score, biology_score, english_score, geography_score,
                    total_score,average_score):
    
    # Encode categorical variables
    gender_encoded = 1 if gender.lower() == 'female' else 0
    extracurricular_activities_encoded = 1 if extracurricular_activities else 0
    
    # Create feature array
    feature_array = np.array([[gender_encoded, extracurricular_activities_encoded,
                               riasec, math_score, history_score, physics_score,
                               chemistry_score, biology_score, english_score, geography_score,total_score,average_score]])
    
    # Scale features
    scaled_features = scaler.transform(feature_array)
    
    # Predict using the model
    probabilities = model.predict_proba(scaled_features)
    
    # Get top five predicted classes along with their probabilities
    top_classes_idx = np.argsort(-probabilities[0])[:5]
    top_classes_names_probs = [(class_names[idx], probabilities[0][idx]) for idx in top_classes_idx]
    
    return top_classes_names_probs

<class 'sklearn.ensemble._forest.RandomForestClassifier'>
Model supports predict_proba


In [22]:
# Example usage 1
final_recommendations = Recommendations(gender='female',
                                        extracurricular_activities=False,
                                        riasec=32,
                                        math_score=65,
                                        history_score=60,
                                        physics_score=97,
                                        chemistry_score=94,
                                        biology_score=71,
                                        english_score=81,
                                        geography_score=66,
                                        total_score=534,
                                        average_score=76.285714)

print("Top recommended studies with probabilities:")
print("="*50)
for class_name, probability in final_recommendations:
    print(f"{class_name} with probability {probability}")

Top recommended studies with probabilities:
Bachelor of Project Management with probability 0.61
Bachelor of Development Studies with probability 0.13
Bachelor of Business Administration with probability 0.09
Bachelor of Finance with probability 0.04
Bachelor of Human Resouce Management with probability 0.03


In [23]:
# Example usage 2
final_recommendations = Recommendations(gender='female',
                                        extracurricular_activities=False,
                                        riasec=40,
                                        math_score=87,
                                        history_score=73,
                                        physics_score=67,
                                        chemistry_score=91,
                                        biology_score=79,
                                        english_score=60,
                                        geography_score=77,
                                        total_score=583,
                                        average_score=83.285714)

print("Top recommended studies with probabilities:")
print("="*50)
for class_name, probability in final_recommendations:
    print(f"{class_name} with probability {probability}")


Top recommended studies with probabilities:
Bachelor of Human Resouce Management with probability 0.43
Bachelor of Business Administration with probability 0.13
Bachelor of Supply Chain Management with probability 0.08
Bachelor of Finance with probability 0.07
Bachelor of Procurement and Contract Management with probability 0.07


In [24]:
# sklear version in pychar production 
import sklearn
print(sklearn.__version__)
# in pycharm env install
# pip install scikit-learn==1.3.2

1.4.1.post1
