In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv("../data/placement.csv")

# Show basic info
print("Shape of the data:", df.shape)
print("\nFirst 5 rows:")
print(df.head())

# Check column types and null values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\nData types and missing values:")
df.info()


Shape of the data: (10000, 14)

First 5 rows:
   Unnamed: 0  StudentId  CGPA  Major Projects  Workshops/Certificatios  \
0           0          1   7.5               1                        1   
1           1          2   8.9               0                        3   
2           2          3   7.3               1                        2   
3           3          4   7.5               1                        1   
4           4          5   8.3               1                        2   

   Mini Projects  Skills  Communication Skill Rating Internship Hackathon  \
0              1       6                         4.4         No        No   
1              2       9                         4.0        Yes       Yes   
2              2       8                         4.8        Yes        No   
3              2       8                         4.4        Yes       Yes   
4              2       8                         4.5        Yes       Yes   

   12th Percentage  10th Percentage  bac

In [2]:
# 1. Drop unwanted columns.
# Reassigning (instead of inplace=True) and ignoring already-missing columns
# keeps this cell safe to re-run: a second run no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0', 'StudentId'], errors='ignore')

# 2. Encode categorical columns as 0/1.
# Only columns that still hold non-numeric labels are mapped: re-applying
# .map() to an already-encoded column would turn every value into NaN,
# because 0 and 1 are not keys of the mapping.
label_encodings = {
    'Internship': {'Yes': 1, 'No': 0},
    'Hackathon': {'Yes': 1, 'No': 0},
    'PlacementStatus': {'Placed': 1, 'NotPlaced': 0},
}
for column, encoding in label_encodings.items():
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(encoding)

# 3. Check for missing values.
# Any label that is absent from the mappings above shows up here as NaN.
print("Missing values:\n", df.isnull().sum())

# 4. Show cleaned data
print("\nCleaned DataFrame (first 5 rows):")
print(df.head())


Missing values:
 CGPA                          0
Major Projects                0
Workshops/Certificatios       0
Mini Projects                 0
Skills                        0
Communication Skill Rating    0
Internship                    0
Hackathon                     0
12th Percentage               0
10th Percentage               0
backlogs                      0
PlacementStatus               0
dtype: int64

Cleaned DataFrame (first 5 rows):
   CGPA  Major Projects  Workshops/Certificatios  Mini Projects  Skills  \
0   7.5               1                        1              1       6   
1   8.9               0                        3              2       9   
2   7.3               1                        2              2       8   
3   7.5               1                        1              2       8   
4   8.3               1                        2              2       8   

   Communication Skill Rating  Internship  Hackathon  12th Percentage  \
0                         4

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 1. Separate the target column from the predictor columns
target_column = 'PlacementStatus'
y = df[target_column]
X = df.drop(columns=[target_column])

# 2. Hold out 20% of the rows for evaluation (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 3. Fit a random forest on the training portion (seeded for repeatability)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# 4. Predict on the held-out rows and report the share of correct predictions
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.94


In [4]:
import joblib
import os

# Define the output location once so the directory that is created, the file
# that is written, and the path that is reported cannot drift apart.
model_dir = "../saved_models"
model_path = f"{model_dir}/placement_model.pkl"

# Create path if not exists
os.makedirs(model_dir, exist_ok=True)

# Save the model
joblib.dump(model, model_path)

# Report the path that was actually written (it is relative to the current
# working directory); the previous message omitted the leading "../".
print(f"✅ Model saved to '{model_path}'")


✅ Model saved to 'saved_models/placement_model.pkl'
