In [None]:
# ===== CELL 1: Preprocessing Raw Student Dataset with Extra Features =====
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import os
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

# Create output folder
os.makedirs('output', exist_ok=True)

# Load raw dataset
raw_file = 'raw_students_dataset.csv'
df = pd.read_csv(raw_file)

# Remove exact duplicate rows
df.drop_duplicates(inplace=True)

# Handle missing numeric values: coerce unparseable entries to NaN, then
# impute each column with its own mean. The result is assigned back to `df`
# rather than calling `df[col].fillna(..., inplace=True)`: that form is a
# chained assignment, which pandas 2.x warns about and which no longer
# modifies `df` once Copy-on-Write is enabled.
numeric_cols = ['CGPA', 'Aptitude_Score', 'Soft_Skills_Rating', 'Experience']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Correct typos / standardize spellings BEFORE imputing, so that variants of
# the same label (e.g. 'C.S.E' and 'CSE') are counted together when the most
# frequent value is computed below.
df['Branch'] = df['Branch'].replace({'C.S.E':'CSE','EC':'ECE','AI DS':'AI&DS'})
df['Job_Role'] = df['Job_Role'].replace({'Web Devloper':'Web Developer','Cyber Sec':'Cybersecurity Analyst'})

# Handle missing categorical values by filling with the column's mode
categorical_cols = ['Branch', 'Certifications', 'Project_Tech',
                    'Favorite_Language', 'Preferred_Work_Mode', 'Job_Role']
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Label encoding for categorical columns; the fitted encoders are kept so
# later cells can encode new data and decode predictions.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# -------------------- Extra Features --------------------
# NOTE(review): the four count/year features below are seeded random integers,
# not values derived from the dataset, so they carry no signal about the
# student. Confirm this placeholder data is intended before relying on them.
np.random.seed(42)
df['Num_Projects'] = np.random.randint(1, 6, size=len(df))  # 1–5 projects
df['Certifications_Count'] = np.random.randint(0, 5, size=len(df))
df['Internship_Experience_Years'] = np.random.randint(0, 3, size=len(df))
df['Work_Experience_Years'] = np.random.randint(0, 5, size=len(df))
# Aggregate scores are computed on the raw (unscaled) numeric columns.
df['Total_Score'] = df['CGPA'] + df['Aptitude_Score'] + df['Soft_Skills_Rating'] + df['Experience']
df['Weighted_Score'] = 0.4*df['CGPA'] + 0.3*df['Aptitude_Score'] + 0.2*df['Soft_Skills_Rating'] + 0.1*df['Experience']

# Scale numeric columns (original four plus the engineered ones)
numeric_cols += ['Num_Projects', 'Certifications_Count', 'Internship_Experience_Years',
                 'Work_Experience_Years', 'Total_Score', 'Weighted_Score']
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Save preprocessed data, label encoders and the fitted scaler; the encoders
# and scaler are needed to transform new data identically at prediction time.
df.to_csv('output/preprocessed_students_dataset.csv', index=False)
with open('output/label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)
with open('output/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print('Preprocessing complete. Data saved to output/preprocessed_students_dataset.csv')


# -------------------- Visualizations --------------------
# Bar chart of how many rows fall into each (label-encoded) job role.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Job_Role', ax=ax)
ax.set_title('Job Role Distribution')
ax.set_xlabel('Encoded Job Role')
ax.set_ylabel('Count')
fig.tight_layout()
fig.savefig('output/job_role_distribution.png')
plt.show()

# Pairwise correlation of every numeric column, drawn as an annotated heatmap.
numeric_df = df.select_dtypes(include=np.number)
corr_matrix = numeric_df.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
fig.tight_layout()
fig.savefig('output/correlation_heatmap.png')
plt.show()


In [None]:
# ===== CELL 2: Train Logistic Regression Model =====
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import os
import seaborn as sns
import matplotlib.pyplot as plt

# Load preprocessed data
df = pd.read_csv("output/preprocessed_students_dataset.csv")

# Features are every column except the target; the identifier columns are
# dropped only if they exist in the file (errors='ignore').
X = df.drop(columns=['Job_Role', 'Student_ID', 'Name'], errors='ignore')
y = df['Job_Role']

# Hold out 25% for evaluation, keeping class proportions equal in both splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Model initialization.
# The `multi_class` argument is deprecated since scikit-learn 1.5 and emits a
# FutureWarning; with the lbfgs solver a multinomial model is already what is
# fitted for targets with three or more classes, so it is simply omitted.
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

# Predictions and accuracy on the held-out split
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"✅ Model trained successfully! Accuracy: {acc*100:.2f}%")

# Save the trained model (the label encoders and scaler are written by the
# preprocessing cell, not here)
os.makedirs("output", exist_ok=True)
with open("output/logistic_model.pkl", "wb") as f:
    pickle.dump(model, f)

print("💾 Model saved successfully to: output/logistic_model.pkl")

# Heatmap of the confusion matrix for the held-out predictions
# (rows: actual class, columns: predicted class).
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix — Logistic Regression")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
fig.tight_layout()
fig.savefig("output/confusion_matrix.png")
plt.show()


In [None]:
# ===== CELL 3: Decode Predicted Job Roles =====
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

# Load preprocessed data
df = pd.read_csv('output/preprocessed_students_dataset.csv')

# Load trained model. The path must match the file written by the training
# cell ('output/logistic_model.pkl'); the previous name
# 'logistic_jobrole_model.pkl' is never created anywhere in this notebook.
with open('output/logistic_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Load label encoders and pick the one needed to turn class ids into names
with open('output/label_encoders.pkl', 'rb') as f:
    label_encoders = pickle.load(f)
job_role_le = label_encoders['Job_Role']

# Features for prediction: exactly the columns (and order) the model was
# fitted on. This avoids a KeyError when the optional 'Student_ID'/'Name'
# columns are absent, which the training cell tolerates.
X = df[list(model.feature_names_in_)]

# Predict encoded class ids, then map them back to the original role names
y_pred = model.predict(X)
y_pred_decoded = job_role_le.inverse_transform(y_pred)

# Add predicted column to dataframe
df['Predicted_Job_Role'] = y_pred_decoded

# Save dataframe with predictions
df.to_csv('output/students_predictions.csv', index=False)
print('Predictions saved to output/students_predictions.csv')

# Bar chart of how often each decoded job role was predicted.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Predicted_Job_Role', ax=ax)
ax.set_title('Predicted Job Role Distribution')
# Tilt the role names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig('output/predicted_job_role_distribution.png')
plt.show()


In [None]:
# =================== CELL 4: Predict on New Dataset ===================
import pandas as pd
import numpy as np
import pickle
import os

# Ensure output folder exists
os.makedirs("output", exist_ok=True)

# -------------------- 1️⃣ Load trained model and encoders --------------------
with open("output/logistic_model.pkl", "rb") as f:
    model = pickle.load(f)

with open("output/label_encoders.pkl", "rb") as f:
    label_encoders = pickle.load(f)

with open("output/scaler.pkl", "rb") as f:
    scaler = pickle.load(f)

job_role_encoder = label_encoders["Job_Role"]

# -------------------- 2️⃣ Load new unseen dataset --------------------
new_df = pd.read_csv("new_students_sample.csv")
print("✅ New test data loaded:", new_df.shape)
print(new_df.head())

# -------------------- 3️⃣ Fill missing values --------------------
numeric_cols = ['CGPA', 'Aptitude_Score', 'Soft_Skills_Rating', 'Experience']
categorical_cols = ['Branch', 'Certifications', 'Project_Tech', 'Favorite_Language', 'Preferred_Work_Mode']

# Numeric columns: impute with the TRAINING mean stored in the fitted scaler
# (the scaler was fitted after mean-imputation, so its per-column mean equals
# the training mean). Using the new batch's own mean yields NaN when a column
# is entirely missing or the batch has a single row, which then breaks predict.
train_means = dict(zip(scaler.feature_names_in_, scaler.mean_))
for col in numeric_cols:
    new_df[col] = pd.to_numeric(new_df[col], errors='coerce')
    # Assign back instead of chained `inplace=True`, which pandas 2.x warns
    # about and which is a no-op under Copy-on-Write.
    new_df[col] = new_df[col].fillna(train_means[col])

# Apply the same Branch spelling fixes used during training so that known
# variants are encoded as their canonical label rather than treated as unseen.
new_df['Branch'] = new_df['Branch'].replace({'C.S.E':'CSE','EC':'ECE','AI DS':'AI&DS'})

# Categorical columns
for col in categorical_cols:
    le = label_encoders[col]
    # Fallback for labels the encoder never saw. The encoder does not store
    # class frequencies, so this is simply its first class (the previous
    # bincount-based expression always evaluated to this same value).
    fallback_label = le.classes_[0]
    batch_modes = new_df[col].mode()
    fill_value = batch_modes.iloc[0] if not batch_modes.empty else fallback_label
    values = new_df[col].fillna(fill_value)
    values = values.where(values.isin(le.classes_), fallback_label)
    new_df[col] = le.transform(values)

# -------------------- 4️⃣ Generate extra features --------------------
# NOTE(review): as in preprocessing, these four features are seeded random
# integers rather than values taken from the data.
np.random.seed(42)
new_df['Num_Projects'] = np.random.randint(1, 6, size=len(new_df))
new_df['Certifications_Count'] = np.random.randint(0, 5, size=len(new_df))
new_df['Internship_Experience_Years'] = np.random.randint(0, 3, size=len(new_df))
new_df['Work_Experience_Years'] = np.random.randint(0, 5, size=len(new_df))
new_df['Total_Score'] = new_df['CGPA'] + new_df['Aptitude_Score'] + new_df['Soft_Skills_Rating'] + new_df['Experience']
new_df['Weighted_Score'] = 0.4*new_df['CGPA'] + 0.3*new_df['Aptitude_Score'] + 0.2*new_df['Soft_Skills_Rating'] + 0.1*new_df['Experience']

# -------------------- 5️⃣ Scale numeric features --------------------
# Column order must match the order the scaler was fitted with.
numeric_cols += ['Num_Projects','Certifications_Count','Internship_Experience_Years',
                 'Work_Experience_Years','Total_Score','Weighted_Score']
new_df[numeric_cols] = scaler.transform(new_df[numeric_cols])

# -------------------- 6️⃣ Align columns with model --------------------
# Any feature the model expects but the new file lacks is filled with 0.
expected_cols = model.feature_names_in_
for col in expected_cols:
    if col not in new_df.columns:
        new_df[col] = 0
X_new = new_df[expected_cols]

# -------------------- 7️⃣ Predict job roles --------------------
y_pred = model.predict(X_new)
new_df["Predicted_Job_Role"] = job_role_encoder.inverse_transform(y_pred)

# -------------------- 8️⃣ Save predictions --------------------
output_path = "output/new_students_predictions.csv"
new_df.to_csv(output_path, index=False)
print(f"✅ Predictions saved to: {output_path}")
# Preview only the columns that exist; 'Student_ID' is optional in training,
# so it should not be mandatory here either.
preview_cols = [c for c in ("Student_ID", "Branch", "Predicted_Job_Role") if c in new_df.columns]
print(new_df[preview_cols].head(10))


In [None]:
# =================== CELL 5: Visualizations for New Sample Predictions ===================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc, confusion_matrix

# -------------------- 1️⃣ Load predictions and model --------------------
# Read back the prediction file plus the pickled model and encoders.
pred_df = pd.read_csv("output/new_students_predictions.csv")
with open("output/logistic_model.pkl", "rb") as f:
    model = pickle.load(f)
with open("output/label_encoders.pkl", "rb") as f:
    label_encoders = pickle.load(f)

job_role_encoder = label_encoders["Job_Role"]

# -------------------- 2️⃣ Count plot: Predicted Job Roles --------------------
# One bar per predicted role, with tilted labels so names stay readable.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=pred_df, x='Predicted_Job_Role', ax=ax)
ax.set_title('Predicted Job Role Distribution')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig("output/plot_predicted_job_role_distribution.png")
plt.show()

# -------------------- 3️⃣ Predicted Probabilities Heatmap --------------------
# Re-score the saved feature columns to obtain per-class probabilities.
X_new = pred_df[model.feature_names_in_]
probs = model.predict_proba(X_new)  # array of shape (n_samples, n_classes)

# Column labels: decoded role name for each class index 0..n_classes-1.
role_labels = job_role_encoder.inverse_transform(range(probs.shape[1]))

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(probs, annot=False, cmap='coolwarm', xticklabels=role_labels, ax=ax)
ax.set_title('Predicted Probabilities Heatmap')
ax.set_xlabel('Job Role')
ax.set_ylabel('Student Index')
fig.tight_layout()
fig.savefig("output/plot_predicted_probabilities_heatmap.png")
plt.show()

# -------------------- 4️⃣ ROC Curves (One-vs-Rest) --------------------
# NOTE: the labels used here are the model's own predictions
# ('Predicted_Job_Role'), not ground truth. These curves therefore show how
# well each class probability separates the model's predicted classes; they
# are NOT a measure of predictive accuracy.
y_pred_encoded = job_role_encoder.transform(pred_df['Predicted_Job_Role'])
plt.figure(figsize=(12,8))
# Column j of `probs` belongs to model.classes_[j], so iterate over the
# model's classes rather than assuming column index == encoded label.
for col_idx, encoded_label in enumerate(model.classes_):
    class_name = job_role_encoder.classes_[encoded_label]
    # One-vs-rest indicator built directly; this also works for two classes,
    # where label_binarize would return a single column.
    is_class = (y_pred_encoded == encoded_label).astype(int)
    n_positive = int(is_class.sum())
    # ROC is undefined unless both positives and negatives are present
    # (e.g. a role never predicted in this sample), so skip such classes
    # instead of plotting a NaN curve.
    if n_positive == 0 or n_positive == len(is_class):
        continue
    fpr, tpr, _ = roc_curve(is_class, probs[:, col_idx])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{class_name} (AUC = {roc_auc:.2f})')
plt.plot([0,1],[0,1],'k--',label='Random')
plt.title('ROC Curves for Predicted Job Roles')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right', fontsize=9)
plt.tight_layout()
plt.savefig("output/plot_roc_curves.png")
plt.show()

# -------------------- 5️⃣ Odd Ratio Plot (Feature Importance) --------------------
# One horizontal bar chart per job role showing the model's raw coefficient
# for every feature, sorted from largest to smallest.
coefs = model.coef_  # shape (n_classes, n_features)
feature_names = model.feature_names_in_
for class_idx, class_name in enumerate(job_role_encoder.classes_):
    fig, ax = plt.subplots(figsize=(12, 6))
    ranked = pd.DataFrame(
        {'Feature': feature_names, 'Coefficient': coefs[class_idx]}
    ).sort_values(by='Coefficient', ascending=False)
    sns.barplot(data=ranked, x='Coefficient', y='Feature', palette='viridis', ax=ax)
    ax.set_title(f'Odd Ratios / Coefficients for {class_name}')
    fig.tight_layout()
    fig.savefig(f"output/plot_odds_ratio_{class_name}.png")
    plt.show()

# -------------------- 6️⃣ Feature Correlation Heatmap --------------------
# Pairwise correlation between the model's input features in the new sample.
feature_corr = pred_df[model.feature_names_in_].corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(feature_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlation Heatmap')
fig.tight_layout()
fig.savefig("output/plot_feature_correlation.png")
plt.show()
