# Installing Necessary Libraries

In [None]:
!pip install pandas
!pip install scikit-learn
!pip install matplotlib
!pip install seaborn
!pip install numpy
!pip install tensorflow

# Importing Libraries

In [None]:
import pandas as pd
import sklearn
pd.set_option('display.max_rows', None)       # Show all rows
pd.set_option('display.max_columns', None)    # Show all columns
pd.set_option('display.width', None)          # No line wrap
pd.set_option('display.max_colwidth', None)   # Show full content in each cell
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, ConfusionMatrixDisplay,
    roc_auc_score, roc_curve, auc, classification_report
)
from sklearn.preprocessing import label_binarize
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.utils import to_categorical
from collections import Counter

# Loading Dataset

In [None]:
# Read the career dataset from disk; the CSV is decoded with the cp1252 codec.
dataset = pd.read_csv(
    'age_interest_career_dataset.csv',
    encoding='cp1252',
)

# Emit an empty line and the banner, then show the frame as the cell output.
print(
    "",
    "----------------------------------------------------------------- Dataset ------------------------------------------------------------------",
    sep="\n",
)
dataset

# Understanding The Dataset & Its Structure

In [None]:
# Report how many rows and columns the loaded dataset has.
n_rows, n_cols = dataset.shape
print("-------------------- Dataset shape --------------------")
print("Rows:", n_rows, "and", "Columns:", n_cols)
print("-------------------------------------------------------")

In [None]:
# Summarise missing values and column dtypes of the raw dataset.
null_counts = dataset.isnull().sum()  # per-column null totals, computed once

print("---------------------- Dataset info ----------------------")
print("Total Null Entries in Each Column in Dataset:")
print(null_counts)
print("Total Null Entries: ", null_counts.sum())
print()
print("Data Types of Each Column in Dataset:")
print()
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
dataset.info()
print()
print ("-------------------------------------------------------")

In [None]:
# Show pandas' descriptive statistics for the dataset as the cell output.
dataset.describe()

# Dataset Preprocessing

<b> Handling Missing Values </b>

In [None]:
# Impute missing values column by column:
#   * numeric columns     -> column mean
#   * every other column  -> most frequent value (mode)
# The dtype test uses pandas' numeric check rather than `dtype == 'object'`,
# so string/category dtypes are never sent down the `.mean()` path.
for column in dataset.columns:
    series = dataset[column]
    if not series.isnull().any():
        continue  # nothing to impute in this column

    if pd.api.types.is_numeric_dtype(series):
        fill_value = series.mean()
    else:
        modes = series.mode()
        if modes.empty:
            # Column is entirely null: there is no mode to fill with.
            continue
        fill_value = modes[0]

    dataset[column] = series.fillna(fill_value)

print("-------------------------------------------------------- Dataset after filling Nulls -------------------------------------------------------")
dataset

<b> Dataset info after filling empty values </b>

In [None]:
# Re-check the null counts now that imputation has run.
remaining_nulls = dataset.isnull().sum()
grand_total = remaining_nulls.sum()

print("---------------------- Dataset info ----------------------")
print("Total Null Entries in Each Column in Dataset:")
print(remaining_nulls)
print("Total Null Entries: ", grand_total)
print()
print("-------------------------------------------------------")

<b> Feature Correlation with each other (Numerical Columns) </b>

In [None]:
# Step 1: Select every numeric column. 'number' covers all integer/float
# widths, not only float64/int64, so no numeric feature is silently dropped.
numeric_cols = dataset.select_dtypes(include='number').columns

# Step 2: Compute correlation matrix
corr_matrix = dataset[numeric_cols].corr()

# Step 3: Plot the heatmap
print("\n Feature Correlation Heatmap (Numerical Columns):")
plt.figure(figsize=(14, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title("Correlation Matrix of Numerical Features")
plt.xticks(rotation=57)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

<b> Dataset is imbalanced, so classes with two or fewer samples are removed for better model training </b>

In [None]:
# Keep only rows whose target class occurs more than twice in the dataset.
class_counts = dataset['predicted_job'].value_counts()
frequent_enough = (class_counts > 2).to_numpy()
valid_classes = class_counts.index[frequent_enough]

row_is_kept = dataset['predicted_job'].isin(valid_classes)
df_filtered = dataset.loc[row_is_kept]

<b> Encoding categorical features: </b>

In [None]:
# Target: integer-encode the job labels.
label_enc = LabelEncoder()
y = label_enc.fit_transform(df_filtered['predicted_job'])

# Features: everything except the target, with categorical columns expanded
# to dummy variables (first level of each dropped).
X = pd.get_dummies(
    df_filtered.drop(columns=['predicted_job']),
    drop_first=True,
)

print("-------------------------------------------------------- Target Variable after Label Encoding -------------------------------------------------------")
print(y)
print("-------------------------------------------------------- Description of Label Encoding -------------------------------------------------------")
# Print the integer code assigned to each original class name.
for code, job_title in enumerate(label_enc.classes_):
    print(f"{code} -> {job_title}")


# Scaling Features

In [None]:
# Standardise every feature column with a StandardScaler fitted on X.
# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split below, so test rows influence the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)
print(X_scaled)

# Dataset Splitting

In [None]:
# Hold out 25% of the scaled data for testing, stratified on the target so
# class proportions are preserved; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)
X = X_scaled  # from here on X refers to the scaled feature matrix

# Model Implementation

<b> 1. Logistic Regression </b>

In [None]:
# Fit a logistic-regression classifier (library defaults) and report its
# accuracy, per-class report and confusion matrix on the test split.
LogisticRegression_model = LogisticRegression().fit(X_train, y_train)
y_pred_log = LogisticRegression_model.predict(X_test)

log_accuracy = accuracy_score(y_test, y_pred_log)
log_report = classification_report(y_test, y_pred_log, zero_division=0)
log_matrix = confusion_matrix(y_test, y_pred_log)

print("-------------------------------------------------------- Logistic Regression Model -------------------------------------------------------")
print("Model Accuracy:", log_accuracy)
print("-------------------------------------------------------- Classification Report -------------------------------------------------------")
print(log_report)
print("-------------------------------------------------------- Confusion Matrix -------------------------------------------------------")
print(log_matrix)
print("---------------------------------------------------------------------------------------------------------------------------------")

<b> 2. Decision Tree </b>

In [None]:
# Fit a decision tree and report its test-set performance.
# random_state is fixed (same seed as the train/test split) so the tree and
# every metric derived from it are reproducible across runs.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

y_pred_dt = dtree.predict(X_test)
print ("-------------------------------------------------------- Decision Tree Model -------------------------------------------------------")
print("Model Accuracy:", dtree.score(X_test, y_test))
print("-------------------------------------------------------- Classification Report -------------------------------------------------------")
print(classification_report(y_test, y_pred_dt, zero_division=0))
print("-------------------------------------------------------- Confusion Matrix -------------------------------------------------------")
print(confusion_matrix(y_test, y_pred_dt))
print("---------------------------------------------------------------------------------------------------------------------------------")

# 3. KNN

In [None]:
# Fit a k-nearest-neighbours classifier (library defaults) and report its
# accuracy, per-class report and confusion matrix on the test split.
knn = KNeighborsClassifier().fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_report = classification_report(y_test, y_pred_knn, zero_division=0)
knn_matrix = confusion_matrix(y_test, y_pred_knn)

print("-------------------------------------------------------- K-Nearest Neighbors Model -------------------------------------------------------")
print("Model Accuracy:", knn_accuracy)
print("-------------------------------------------------------- Classification Report -------------------------------------------------------")
print(knn_report)
print("-------------------------------------------------------- Confusion Matrix -------------------------------------------------------")
print(knn_matrix)
print("---------------------------------------------------------------------------------------------------------------------------------")

# 4. Naive Bayes

In [None]:
# Fit a Gaussian naive-Bayes classifier and report its accuracy, per-class
# report and confusion matrix on the test split.
nb = GaussianNB().fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_report = classification_report(y_test, y_pred_nb, zero_division=0)
nb_matrix = confusion_matrix(y_test, y_pred_nb)

print("-------------------------------------------------------- Naive Bayes Model -------------------------------------------------------")
print("Model Accuracy:", nb_accuracy)
print("-------------------------------------------------------- Classification Report -------------------------------------------------------")
print(nb_report)
print("-------------------------------------------------------- Confusion Matrix -------------------------------------------------------")
print(nb_matrix)
print("---------------------------------------------------------------------------------------------------------------------------------")

# 5. Random Forest

In [None]:
# Fit a random forest and report its test-set performance.
# random_state is fixed (same seed as the train/test split) so the forest and
# every metric derived from it are reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
print ("-------------------------------------------------------- Random Forest Model -------------------------------------------------------")
print("Model Accuracy:", rf.score(X_test, y_test))
print("-------------------------------------------------------- Classification Report -------------------------------------------------------")
print(classification_report(y_test, y_pred_rf, zero_division=0))
print("-------------------------------------------------------- Confusion Matrix -------------------------------------------------------")
print(confusion_matrix(y_test, y_pred_rf))
print("---------------------------------------------------------------------------------------------------------------------------------")

# Evaluation

# 1. Logistic Regression

<b>a. Accuracy, Precision, Recall and F1-Score </b>

In [None]:
# Weighted-average metrics for the logistic-regression predictions.
# (The banner previously said "Random Forest", which mislabelled this output.)
print("-------------------------------------------------------- Logistic Regression Performance -------------------------------------------------------")
print("Accuracy:", accuracy_score(y_test, y_pred_log))
print("Precision:", precision_score(y_test, y_pred_log, average='weighted', zero_division=0))
print("Recall:", recall_score(y_test, y_pred_log, average='weighted', zero_division=0))
print("F1 Score:", f1_score(y_test, y_pred_log, average='weighted', zero_division=0))
print()

<b>b. Confusion Matrix (Heat-map) </b>

In [None]:
# Confusion-matrix heat map for logistic regression. An explicit figure size
# and tight_layout are used, matching the other models' heat maps, so the
# per-class annotations stay legible.
plt.figure(figsize=(12, 8))
sns.heatmap(confusion_matrix(y_test, y_pred_log), annot=True, fmt='d', cmap='Greens')
plt.title("Confusion Matrix - Logistic Regression")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()

<b> c. ROC Curve </b>

In [None]:
# For multiclass ROC
# One-vs-rest ROC curves for logistic regression.
# predict_proba returns one column per entry of the fitted model's classes_,
# so the indicator matrix is built from those same labels: column i of
# y_test_bin and column i of y_pred_prob always describe the same class
# (this also holds when there are only two classes).
log_classes = LogisticRegression_model.classes_
y_test_bin = (np.asarray(y_test)[:, None] == log_classes[None, :]).astype(int)
y_pred_prob = LogisticRegression_model.predict_proba(X_test)
n_classes = y_test_bin.shape[1]

# Plot each class's ROC
plt.figure(figsize=(10, 7))
for i, class_label in enumerate(log_classes):
    positives = y_test_bin[:, i].sum()
    if positives == 0 or positives == len(y_test_bin):
        # ROC is undefined without both positive and negative test samples.
        continue
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_pred_prob[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"Class {class_label} (AUC = {roc_auc:.2f})")

# Diagonal reference line, labelled like in the other models' ROC plots.
plt.plot([0, 1], [0, 1], 'k--', label="Random Guess")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Logistic Regression")
plt.legend(loc="upper center", bbox_to_anchor=(0.5, -0.25), ncol=4)
plt.tight_layout()
plt.show()

# 2. Decision Tree

<b>a. Accuracy, Precision, Recall and F1-Score </b>

In [None]:
# Weighted-average metrics for the decision-tree predictions.
weighted_kwargs = {'average': 'weighted', 'zero_division': 0}

print("-------------------------------------------------------- Decision Tree Performance -------------------------------------------------------")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred_dt, **weighted_kwargs))
print()

<b>b. Confusion Matrix (Heat-Map) </b>

In [None]:
# Confusion-matrix heat map for the decision tree.
dt_conf_matrix = confusion_matrix(y_test, y_pred_dt)

dt_fig = plt.figure(figsize=(12, 8))
dt_ax = sns.heatmap(dt_conf_matrix, annot=True, fmt='d', cmap='Blues')
dt_ax.set_title("Confusion Matrix - Decision Tree")
dt_ax.set_xlabel("Predicted")
dt_ax.set_ylabel("Actual")
dt_fig.tight_layout()
plt.show()

<b> c. Roc Curves </b>

In [None]:
# One-vs-rest ROC curves for the decision tree.
# predict_proba returns one column per entry of the fitted model's classes_,
# so the indicator matrix is built from those same labels: column i of
# y_test_bin and column i of y_pred_prob_dt always describe the same class
# (this also holds when there are only two classes).
dt_classes = dtree.classes_
y_test_bin = (np.asarray(y_test)[:, None] == dt_classes[None, :]).astype(int)
y_pred_prob_dt = dtree.predict_proba(X_test)
n_classes = y_test_bin.shape[1]

plt.figure(figsize=(10, 7))
for i, class_label in enumerate(dt_classes):
    positives = y_test_bin[:, i].sum()
    if positives == 0 or positives == len(y_test_bin):
        # ROC is undefined without both positive and negative test samples.
        continue
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_pred_prob_dt[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"Class {class_label} (AUC = {roc_auc:.2f})")

plt.plot([0, 1], [0, 1], 'k--', label="Random Guess")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Decision Tree")
plt.legend(loc="upper center", bbox_to_anchor=(0.5, -0.3), ncol=4, fontsize='small')
plt.tight_layout()
plt.show()

# 3. KNN

<b>a. Accuracy, Precision, Recall and F1-Score </b>

In [None]:
# Weighted-average metrics plus the full report for the KNN predictions.
weighted_kwargs = {'average': 'weighted', 'zero_division': 0}

print("-------------------------------------------------------- K-Nearest Neighbors Classification -------------------------------------------------------")
print("Accuracy:", accuracy_score(y_test, y_pred_knn))
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred_knn, **weighted_kwargs))

knn_report_text = classification_report(y_test, y_pred_knn, zero_division=0)
print("\nClassification Report:\n", knn_report_text)
print()

<b>b. Confusion Matrix (Heat-Map) </b>

In [None]:
# Confusion-matrix heat map for KNN.
knn_conf_matrix = confusion_matrix(y_test, y_pred_knn)

knn_fig = plt.figure(figsize=(12, 8))
knn_ax = sns.heatmap(knn_conf_matrix, annot=True, fmt='d', cmap='Purples')
knn_ax.set_title("Confusion Matrix - KNN")
knn_ax.set_xlabel("Predicted")
knn_ax.set_ylabel("Actual")
knn_fig.tight_layout()
plt.show()

<b> c. Roc Curves </b>

In [None]:
# One-vs-rest ROC curves for KNN.
# predict_proba returns one column per entry of the fitted model's classes_,
# so the indicator matrix is built from those same labels: column i of
# y_test_bin and column i of y_pred_prob_knn always describe the same class
# (this also holds when there are only two classes).
knn_classes = knn.classes_
y_test_bin = (np.asarray(y_test)[:, None] == knn_classes[None, :]).astype(int)
y_pred_prob_knn = knn.predict_proba(X_test)
n_classes = y_test_bin.shape[1]

plt.figure(figsize=(10, 7))
for i, class_label in enumerate(knn_classes):
    positives = y_test_bin[:, i].sum()
    if positives == 0 or positives == len(y_test_bin):
        # ROC is undefined without both positive and negative test samples.
        continue
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_pred_prob_knn[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"Class {class_label} (AUC = {roc_auc:.2f})")

plt.plot([0, 1], [0, 1], 'k--', label="Random Guess")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - KNN")
plt.legend(loc="upper center", bbox_to_anchor=(0.5, -0.3), ncol=4, fontsize='small')
plt.tight_layout()
plt.show()

# 4. Naive Bayes

<b>a. Accuracy, Precision, Recall and F1-Score </b>

In [None]:
# Weighted-average metrics plus the full report for the naive-Bayes predictions.
weighted_kwargs = {'average': 'weighted', 'zero_division': 0}

print("-------------------------------------------------------- Naive Bayes Classification -------------------------------------------------------")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred_nb, **weighted_kwargs))

nb_report_text = classification_report(y_test, y_pred_nb, zero_division=0)
print("\nClassification Report:\n", nb_report_text)
print()

<b> b. Confusion Matrix (Heat-map) </b>

In [None]:
# Confusion-matrix heat map for naive Bayes.
nb_conf_matrix = confusion_matrix(y_test, y_pred_nb)

nb_fig = plt.figure(figsize=(12, 8))
nb_ax = sns.heatmap(nb_conf_matrix, annot=True, fmt='d', cmap='Oranges')
nb_ax.set_title("Confusion Matrix - Naive Bayes")
nb_ax.set_xlabel("Predicted")
nb_ax.set_ylabel("Actual")
nb_fig.tight_layout()
plt.show()

<b> c. Roc Curves </b>

In [None]:
# One-vs-rest ROC curves for naive Bayes.
# predict_proba returns one column per entry of the fitted model's classes_,
# so the indicator matrix is built from those same labels: column i of
# y_test_bin and column i of y_pred_prob_nb always describe the same class
# (this also holds when there are only two classes).
nb_classes = nb.classes_
y_test_bin = (np.asarray(y_test)[:, None] == nb_classes[None, :]).astype(int)
y_pred_prob_nb = nb.predict_proba(X_test)
n_classes = y_test_bin.shape[1]

plt.figure(figsize=(10, 7))
for i, class_label in enumerate(nb_classes):
    positives = y_test_bin[:, i].sum()
    if positives == 0 or positives == len(y_test_bin):
        # ROC is undefined without both positive and negative test samples.
        continue
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_pred_prob_nb[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"Class {class_label} (AUC = {roc_auc:.2f})")

plt.plot([0, 1], [0, 1], 'k--', label="Random Guess")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Naive Bayes")
plt.legend(loc="upper center", bbox_to_anchor=(0.5, -0.3), ncol=4, fontsize='small')
plt.tight_layout()
plt.show()

# 5. Random forest

<b>a. Accuracy, Precision, Recall and F1-Score </b>

In [None]:
# Weighted-average metrics plus the full report for the random-forest predictions.
weighted_kwargs = {'average': 'weighted', 'zero_division': 0}

print("-------------------------------------------------------- Random Forest Classification -------------------------------------------------------")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred_rf, **weighted_kwargs))

rf_report_text = classification_report(y_test, y_pred_rf, zero_division=0)
print("\nClassification Report:\n", rf_report_text)
print()

<b> b. Confusion Matrix (Heat-map) </b>

In [None]:
# Confusion-matrix heat map for the random forest.
rf_conf_matrix = confusion_matrix(y_test, y_pred_rf)

rf_fig = plt.figure(figsize=(12, 8))
rf_ax = sns.heatmap(rf_conf_matrix, annot=True, fmt='d', cmap='Greens')
rf_ax.set_title("Confusion Matrix - Random Forest")
rf_ax.set_xlabel("Predicted")
rf_ax.set_ylabel("Actual")
rf_fig.tight_layout()
plt.show()

<b> c. Roc Curves </b>

In [None]:
# One-vs-rest ROC curves for the random forest.
# predict_proba returns one column per entry of the fitted model's classes_,
# so the indicator matrix is built from those same labels: column i of
# y_test_bin and column i of y_pred_prob_rf always describe the same class
# (this also holds when there are only two classes).
rf_classes = rf.classes_
y_test_bin = (np.asarray(y_test)[:, None] == rf_classes[None, :]).astype(int)
y_pred_prob_rf = rf.predict_proba(X_test)
n_classes = y_test_bin.shape[1]

plt.figure(figsize=(10, 7))
for i, class_label in enumerate(rf_classes):
    positives = y_test_bin[:, i].sum()
    if positives == 0 or positives == len(y_test_bin):
        # ROC is undefined without both positive and negative test samples.
        continue
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_pred_prob_rf[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"Class {class_label} (AUC = {roc_auc:.2f})")

plt.plot([0, 1], [0, 1], 'k--', label="Random Guess")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Random Forest")
plt.legend(loc="upper center", bbox_to_anchor=(0.5, -0.3), ncol=4, fontsize='small')
plt.tight_layout()
plt.show()

# Artificial Neural Network (ANN)

<b> a. one hot encoding target variable </b>

In [None]:
# One-hot encode the integer targets for the network.
# The width is taken from the label encoder and passed explicitly to both
# calls: without num_classes, to_categorical sizes each matrix from the
# largest label it sees, so train and test could end up with different
# column counts if the highest class is missing from one of the splits.
num_classes = len(label_enc.classes_)
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)
input_dim = X_train.shape[1]  # number of feature columns fed to the network

<b> b. Building ANN </b>

In [None]:
# Feed-forward classifier: input -> (Dense relu + Dropout 0.3) for 128 and 64
# units -> softmax output with one unit per class.
ann_layers = [Input(shape=(input_dim,))]
for hidden_units in (128, 64):
    ann_layers.append(Dense(hidden_units, activation='relu'))
    ann_layers.append(Dropout(0.3))
ann_layers.append(Dense(num_classes, activation='softmax'))

model = Sequential(ann_layers)

<b>b. Compiling Model </b>

In [None]:
# Configure training: Adam optimiser, categorical cross-entropy loss on the
# one-hot targets, accuracy tracked as the metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

<b>c. Training Model </b>

In [None]:
# Train for 50 epochs in mini-batches of 16; the test split is passed as
# validation data so its loss/accuracy is reported after every epoch.
history = model.fit(
    x=X_train,
    y=y_train_cat,
    batch_size=16,
    epochs=50,
    verbose=1,
    validation_data=(X_test, y_test_cat),
)

<b> d. Evaluating </b>

In [None]:
# Predict class probabilities on the test split and take the most probable
# class per row as the ANN's prediction.
y_pred_ann_probs = model.predict(X_test)
y_pred_ann = y_pred_ann_probs.argmax(axis=1)

ann_accuracy = accuracy_score(y_test, y_pred_ann)
ann_report = classification_report(y_test, y_pred_ann, zero_division=0)
ann_matrix = confusion_matrix(y_test, y_pred_ann)

print("-------------------------------------------------------- Artificial Neural Network Model -------------------------------------------------------")
print("Model Accuracy:", ann_accuracy)
print("-------------------------------------------------------- Classification Report -------------------------------------------------------")
print(ann_report)
print("-------------------------------------------------------- Confusion Matrix -------------------------------------------------------")
print(ann_matrix)
print("---------------------------------------------------------------------------------------------------------------------------------")

<b>a. Accuracy, Precision, Recall and F1-Score </b>

In [None]:
# Weighted-average metrics plus the full report for the ANN predictions.
# (The banner previously said "Random Forest", which mislabelled this output.)
print("-------------------------------------------------------- Artificial Neural Network Classification -------------------------------------------------------")
print("Accuracy:", accuracy_score(y_test, y_pred_ann))
print("Precision:", precision_score(y_test, y_pred_ann, average='weighted', zero_division=0))
print("Recall:", recall_score(y_test, y_pred_ann, average='weighted', zero_division=0))
print("F1 Score:", f1_score(y_test, y_pred_ann, average='weighted', zero_division=0))
print("\nClassification Report:\n", classification_report(y_test, y_pred_ann, zero_division=0))
print()

<b> b. Confusion Matrix (Heat-map) </b>

In [None]:
# Confusion-matrix heat map for the ANN.
ann_conf_matrix = confusion_matrix(y_test, y_pred_ann)

ann_fig = plt.figure(figsize=(12, 8))
ann_ax = sns.heatmap(ann_conf_matrix, annot=True, fmt='d', cmap='Purples')
ann_ax.set_title("Confusion Matrix - ANN")
ann_ax.set_xlabel("Predicted")
ann_ax.set_ylabel("Actual")
ann_fig.tight_layout()
plt.show()

<b> c. Roc Curves </b>

In [None]:
# One-vs-rest ROC curves for the ANN: one-hot test labels (fixed width
# num_classes) are compared column by column with the predicted probabilities.
y_test_bin = to_categorical(y_test, num_classes=num_classes)

roc_fig, roc_ax = plt.subplots(figsize=(10, 7))
for i in range(num_classes):
    truth_column = y_test_bin[:, i]
    score_column = y_pred_ann_probs[:, i]
    fpr, tpr, _ = roc_curve(truth_column, score_column)
    roc_auc = auc(fpr, tpr)
    roc_ax.plot(fpr, tpr, label=f"Class {i} (AUC = {roc_auc:.2f})")

# Diagonal reference line for a chance-level classifier.
roc_ax.plot([0, 1], [0, 1], 'k--', label="Random Guess")
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve - ANN")
roc_ax.legend(loc="upper center", bbox_to_anchor=(0.5, -0.3), ncol=4, fontsize='small')
roc_fig.tight_layout()
plt.show()