# 🚀 College Placement Data Analysis & Prediction

## 📊 Domain: Education Analytics

### 🎯 Objective: 
Analyze student placement data and build a machine learning model to predict whether a student will be placed or not, based on academic and skill-related features.

### 📋 Project Overview:
- **Dataset**: `placementdata.csv`
- **Target**: Predict placement status (Placed/Not Placed)
- **Approach**: Multiple ML algorithms with comprehensive evaluation
- **Business Goal**: Identify key factors that influence student placement success

---

## 1️⃣ Import Required Libraries

Let's start by importing all the necessary libraries for data manipulation, visualization, and machine learning.

In [None]:
# Data handling and manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Model evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay

# Warnings
import warnings
# Suppress library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Plot appearance: reset matplotlib to its default style first, then apply
# seaborn's "husl" colour palette (the order matters, the style reset comes first).
plt.style.use('default')
sns.set_palette("husl")

# Confirmation banners
for banner in (
    "📚 All libraries imported successfully!",
    "🎯 Ready to start the placement prediction analysis!",
):
    print(banner)

## 2️⃣ Load and Explore Dataset

Now let's load the placement data and explore its structure to understand what we're working with.

In [None]:
# Load the dataset (expects placementdata.csv in the working directory)
df = pd.read_csv('placementdata.csv')

print("📊 Dataset loaded successfully!")
print(f"📐 Dataset shape: {df.shape}")
# One column is treated as the target, hence the "- 1"
print(f"📝 Features: {df.shape[1] - 1}")
print(f"👥 Students: {df.shape[0]}")
print("\n" + "="*50)

# Display first few rows
print("🔍 First 5 rows of the dataset:")
print(df.head())

print("\n" + "="*50)

# Display dataset information
print("ℹ️ Dataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() emits a stray "None".
df.info()

print("\n" + "="*50)

# Display basic statistics for the numeric columns
print("📈 Basic Statistics:")
print(df.describe())

In [None]:
# --- Missing values -------------------------------------------------------
print("🔍 Missing Values Analysis:")
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100

# Per-column count and share of missing entries
missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Missing Percentage': missing_percentage
})

# Show only the columns that actually have gaps
has_missing = missing_df['Missing Count'] > 0
print(missing_df[has_missing])

total_missing = missing_df['Missing Count'].sum()
if total_missing == 0:
    print("✅ Great! No missing values found in the dataset.")

# --- Categorical columns --------------------------------------------------
print("\n🏷️ Unique values in categorical columns:")
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    distinct_values = df[col].unique()
    print(f"{col}: {distinct_values}")

# --- Target distribution --------------------------------------------------
print(f"\n🎯 Target Variable Distribution:")
print(df['PlacementStatus'].value_counts())
is_placed = df['PlacementStatus'] == 'Placed'
print(f"\nPlacement Rate: {is_placed.mean():.2%}")

## 3️⃣ Data Preprocessing

Time to clean and prepare our data for machine learning models!

In [None]:
# Create a copy of the original dataset for preprocessing
df_processed = df.copy()

# 1. Encode the target variable (PlacementStatus)
print("🎯 Encoding target variable...")
label_encoder_target = LabelEncoder()
df_processed['PlacementStatus_encoded'] = label_encoder_target.fit_transform(df_processed['PlacementStatus'])

# Check the encoding
print("Target encoding mapping:")
for i, class_name in enumerate(label_encoder_target.classes_):
    print(f"  {class_name} -> {i}")

# 2. Encode categorical variables
print("\n🏷️ Encoding categorical variables...")
label_encoders = {}

# Each categorical feature gets its OWN encoder. Re-fitting the target encoder
# on these columns would overwrite label_encoder_target.classes_, so it could
# no longer map predictions back to the placement labels.
for column in ('ExtracurricularActivities', 'PlacementTraining'):
    column_encoder = LabelEncoder()
    df_processed[f'{column}_encoded'] = column_encoder.fit_transform(df_processed[column])
    # Keep the fitted encoder so the mapping can be inspected or reused later
    label_encoders[column] = column_encoder

print("✅ Categorical variables encoded successfully!")

# 3. Create a clean dataset with only numeric features
numeric_features = ['CGPA', 'Internships', 'Projects', 'Workshops/Certifications', 
                   'AptitudeTestScore', 'SoftSkillsRating', 'SSC_Marks', 'HSC_Marks',
                   'ExtracurricularActivities_encoded', 'PlacementTraining_encoded']

# Model-ready frame: the numeric features plus the encoded target
df_clean = df_processed[numeric_features + ['PlacementStatus_encoded']].copy()

print(f"\n📊 Clean dataset shape: {df_clean.shape}")
print(f"🔢 Numeric features: {len(numeric_features)}")

# Display the processed dataset
print("\n🔍 First 5 rows of processed dataset:")
print(df_clean.head())

## 4️⃣ Exploratory Data Analysis (EDA)

Let's dive deep into understanding our data through visualizations and statistical analysis!

In [None]:
# Set up the plotting environment: six panels on one 2x3 figure
plt.figure(figsize=(20, 15))

# 1. Pie chart of the Placement Status distribution
plt.subplot(2, 3, 1)
placement_counts = df['PlacementStatus'].value_counts()
colors = ['#FF6B6B', '#4ECDC4']
plt.pie(placement_counts.values, labels=placement_counts.index, autopct='%1.1f%%', 
        colors=colors, startangle=90)
plt.title('📊 Placement Status Distribution', fontsize=14, fontweight='bold')

# 2. CGPA vs Placement Status
plt.subplot(2, 3, 2)
sns.boxplot(data=df, x='PlacementStatus', y='CGPA', palette=['#FF6B6B', '#4ECDC4'])
plt.title('📚 CGPA vs Placement Status', fontsize=14, fontweight='bold')
plt.ylabel('CGPA')

# 3. Internships vs Placement Status (row-normalised to percentages)
ax_internships = plt.subplot(2, 3, 3)
internship_placement = pd.crosstab(df['Internships'], df['PlacementStatus'], normalize='index') * 100
# Pass the subplot's Axes explicitly: without `ax`, DataFrame.plot draws on a
# new figure of its own, leaving this panel empty and derailing the grid.
internship_placement.plot(kind='bar', color=['#FF6B6B', '#4ECDC4'], ax=ax_internships)
plt.title('💼 Internships vs Placement Rate', fontsize=14, fontweight='bold')
plt.xlabel('Number of Internships')
plt.ylabel('Percentage (%)')
plt.legend(title='Placement Status')
plt.xticks(rotation=0)

# 4. Workshops/Certifications vs Placement Status (row-normalised to percentages)
ax_certifications = plt.subplot(2, 3, 4)
cert_placement = pd.crosstab(df['Workshops/Certifications'], df['PlacementStatus'], normalize='index') * 100
# Same as above: draw into this panel rather than a new figure.
cert_placement.plot(kind='bar', color=['#FF6B6B', '#4ECDC4'], ax=ax_certifications)
plt.title('🏆 Certifications vs Placement Rate', fontsize=14, fontweight='bold')
plt.xlabel('Number of Workshops/Certifications')
plt.ylabel('Percentage (%)')
plt.legend(title='Placement Status')
plt.xticks(rotation=0)

# 5. Aptitude Test Score vs Placement Status
plt.subplot(2, 3, 5)
sns.boxplot(data=df, x='PlacementStatus', y='AptitudeTestScore', palette=['#FF6B6B', '#4ECDC4'])
plt.title('🧠 Aptitude Test Score vs Placement', fontsize=14, fontweight='bold')
plt.ylabel('Aptitude Test Score')

# 6. Soft Skills Rating vs Placement Status
plt.subplot(2, 3, 6)
sns.boxplot(data=df, x='PlacementStatus', y='SoftSkillsRating', palette=['#FF6B6B', '#4ECDC4'])
plt.title('🗣️ Soft Skills vs Placement', fontsize=14, fontweight='bold')
plt.ylabel('Soft Skills Rating')

plt.tight_layout()
plt.show()

# Print some key statistics
print("📈 Key Statistics:")
print("="*50)

# Split the raw data by outcome; the labels are compared as stored in the CSV
placed_students = df[df['PlacementStatus'] == 'Placed']
not_placed_students = df[df['PlacementStatus'] == 'NotPlaced']

# Group means side by side: (icon, printed label, source column)
for icon, label, column in (
    ("📊", "CGPA", 'CGPA'),
    ("💼", "Internships", 'Internships'),
    ("🏆", "Certifications", 'Workshops/Certifications'),
    ("🧠", "Aptitude Score", 'AptitudeTestScore'),
    ("🗣️", "Soft Skills", 'SoftSkillsRating'),
):
    print(f"{icon} Average {label} - Placed: {placed_students[column].mean():.2f}, Not Placed: {not_placed_students[column].mean():.2f}")

In [None]:
# Correlation Analysis
plt.figure(figsize=(12, 10))

# Pairwise correlations between every column of the model-ready frame
correlation_matrix = df_clean.corr()

# Annotated heatmap, colour scale centred on zero
heatmap_options = dict(
    annot=True,
    cmap='RdYlBu_r',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": .8},
)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('🔥 Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Rank every feature by its correlation with the encoded target
print("🎯 Correlation with Placement Status:")
print("="*50)
placement_correlations = correlation_matrix['PlacementStatus_encoded'].sort_values(ascending=False)

for feature, correlation in placement_correlations.items():
    # The target's correlation with itself is not informative
    if feature == 'PlacementStatus_encoded':
        continue

    if correlation > 0:
        direction = "📈 Positive"
    else:
        direction = "📉 Negative"

    magnitude = abs(correlation)
    if magnitude > 0.5:
        strength = "Strong"
    elif magnitude > 0.3:
        strength = "Moderate"
    else:
        strength = "Weak"

    print(f"{feature}: {correlation:.3f} ({direction}, {strength})")

## 5️⃣ Data Splitting

Now let's prepare our data for machine learning by splitting it into training and testing sets.

In [None]:
# Feature matrix (X) and target vector (y)
X = df_clean[numeric_features]
y = df_clean['PlacementStatus_encoded']

print("🎯 Feature and Target Selection:")
print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print(f"\nFeatures included: {list(X.columns)}")

# 80/20 train/test split, stratified on the target so both parts keep the
# same class balance; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

total_samples = len(X)
print(f"\n📊 Data Split Summary:")
print(f"Training set: {X_train.shape[0]} samples ({X_train.shape[0]/total_samples*100:.1f}%)")
print(f"Testing set: {X_test.shape[0]} samples ({X_test.shape[0]/total_samples*100:.1f}%)")

# Class shares (in percent) for each part of the split
train_dist = pd.Series(y_train).value_counts(normalize=True) * 100
test_dist = pd.Series(y_test).value_counts(normalize=True) * 100

print(f"\n📈 Class Distribution:")
for heading, distribution in (("Training set:", train_dist), ("Testing set:", test_dist)):
    print(heading)
    for class_val, percentage in distribution.items():
        # Encoded value 1 is reported as "Placed", anything else as "NotPlaced"
        class_name = "Placed" if class_val == 1 else "NotPlaced"
        print(f"  {class_name}: {percentage:.1f}%")

# Standardise features: the scaler is fitted on the training part only and
# then applied unchanged to the test part
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\n✅ Data preprocessing completed!")
print(f"📏 Features have been standardized for better model performance.")

## 6️⃣ Machine Learning Model Implementation

Time to build and train multiple machine learning models to predict student placement!

In [None]:
# Candidate classifiers, keyed by display name
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5)
}

# Fitted estimators and their test-set predictions, keyed by model name
trained_models = {}
predictions = {}

print("🚀 Training Multiple Machine Learning Models...")
print("="*60)

for model_name, model in models.items():
    print(f"\n🔄 Training {model_name}...")

    # KNN and Logistic Regression get the standardised features;
    # the tree-based models are trained on the raw features.
    if model_name in ['K-Nearest Neighbors', 'Logistic Regression']:
        fit_features, eval_features = X_train_scaled, X_test_scaled
    else:
        fit_features, eval_features = X_train, X_test

    model.fit(fit_features, y_train)
    y_pred = model.predict(eval_features)

    # Keep both the estimator and its predictions for the evaluation cells
    trained_models[model_name] = model
    predictions[model_name] = y_pred

    print(f"✅ {model_name} training completed!")

print(f"\n🎉 All models trained successfully!")
print(f"📊 Ready for evaluation and comparison!")

## 7️⃣ Model Evaluation and Comparison

Let's evaluate all our models using multiple metrics and compare their performance!

In [None]:
# One result row (dict) per model
evaluation_results = []

print("📊 Model Performance Evaluation")
print("="*80)

for model_name, y_pred in predictions.items():
    # Score the stored test-set predictions with each metric
    accuracy, precision, recall, f1 = (
        scorer(y_test, y_pred)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )

    evaluation_results.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1
    })

    print(f"\n🤖 {model_name}:")
    for label, value in (
        ("Accuracy", accuracy),
        ("Precision", precision),
        ("Recall", recall),
        ("F1-Score", f1),
    ):
        # Pad "<label>:" to 11 characters so the numbers line up in a column
        heading = f"{label}:"
        print(f"   {heading:<11}{value:.4f} ({value*100:.2f}%)")

# Comparison table, rounded to four decimals
results_df = pd.DataFrame(evaluation_results).round(4)

print(f"\n📋 Complete Model Comparison Table:")
print("="*80)
print(results_df.to_string(index=False))

# The highest-accuracy row comes first after a descending sort
results_df_sorted = results_df.sort_values('Accuracy', ascending=False)
top_row = results_df_sorted.iloc[0]
best_model_name = top_row['Model']
best_accuracy = top_row['Accuracy']

print(f"\n🏆 Best Performing Model: {best_model_name}")
print(f"🎯 Best Accuracy: {best_accuracy:.4f} ({best_accuracy*100:.2f}%)")

In [None]:
# Confusion matrices: one panel per model on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.ravel()

for idx, (model_name, y_pred) in enumerate(predictions.items()):
    panel = axes[idx]
    cm = confusion_matrix(y_test, y_pred)

    disp = ConfusionMatrixDisplay(
        confusion_matrix=cm,
        display_labels=['Not Placed', 'Placed'],
    )
    disp.plot(ax=panel, cmap='Blues', values_format='d')

    panel_accuracy = accuracy_score(y_test, y_pred)
    panel.set_title(
        f'{model_name}\nAccuracy: {panel_accuracy:.3f}',
        fontsize=12,
        fontweight='bold',
    )

plt.tight_layout()
plt.suptitle('🎯 Confusion Matrices for All Models', fontsize=16, fontweight='bold', y=1.02)
plt.show()

# Metric comparison: one horizontal bar chart per metric
plt.figure(figsize=(15, 10))

metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']

for idx, metric in enumerate(metrics):
    plt.subplot(2, 2, idx+1)

    # Ascending sort puts the best model at the top of the horizontal chart
    metric_data = results_df.sort_values(metric, ascending=True)

    bars = plt.barh(metric_data['Model'], metric_data[metric], color=colors[idx], alpha=0.8)
    plt.title(f'📊 {metric} Comparison', fontsize=14, fontweight='bold')
    plt.xlabel(f'{metric} Score')

    # Write each score just to the right of its bar
    for bar in bars:
        width = bar.get_width()
        label_y = bar.get_y() + bar.get_height()/2
        plt.text(width + 0.01, label_y, f'{width:.3f}',
                 ha='left', va='center', fontweight='bold')

    # Leave headroom past 1.0 for the value labels
    plt.xlim(0, 1.1)
    plt.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

## 8️⃣ Best Model Selection

Based on our evaluation metrics, let's select and analyze the best performing model.

In [None]:
# Locate the top row of the comparison table by F1-score and by accuracy
f1_winner_label = results_df['F1-Score'].idxmax()
accuracy_winner_label = results_df['Accuracy'].idxmax()
best_model_f1 = results_df.loc[f1_winner_label]
best_model_accuracy = results_df.loc[accuracy_winner_label]

print("🏆 Best Model Selection Analysis")
print("="*60)
print(f"\n📊 Best by Accuracy: {best_model_accuracy['Model']}")
print(f"   Accuracy: {best_model_accuracy['Accuracy']:.4f} ({best_model_accuracy['Accuracy']*100:.2f}%)")

print(f"\n⚖️ Best by F1-Score: {best_model_f1['Model']}")
print(f"   F1-Score: {best_model_f1['F1-Score']:.4f} ({best_model_f1['F1-Score']*100:.2f}%)")

# The F1 winner is the model carried forward for the rest of the notebook
final_model_name = best_model_f1['Model']
final_model = trained_models[final_model_name]

print(f"\n🎯 Selected Model: {final_model_name}")
print(f"📈 Justification: F1-Score provides a balanced measure of precision and recall")

# Per-class precision/recall/F1 for the selected model
print(f"\n📋 Detailed Classification Report - {final_model_name}:")
print("="*60)
y_pred_final = predictions[final_model_name]
final_report = classification_report(
    y_test,
    y_pred_final,
    target_names=['Not Placed', 'Placed'],
    digits=4,
)
print(final_report)

# Names used by the prediction and summary cells
best_model = final_model
best_model_name_final = final_model_name

print(f"✅ Best model ({best_model_name_final}) selected and ready for predictions!")

## 9️⃣ Feature Importance Analysis

Let's analyze which features contribute most to placement predictions, using the feature importances learned by the Random Forest model.

In [None]:
# Get feature importance from Random Forest model
rf_model = trained_models['Random Forest']
feature_importance = rf_model.feature_importances_

# Create feature importance DataFrame, most important feature first.
# Note: sort_values keeps the original index labels, so the index is NOT a rank.
importance_df = pd.DataFrame({
    'Feature': numeric_features,
    'Importance': feature_importance
}).sort_values('Importance', ascending=False)

print("🔍 Feature Importance Analysis - Random Forest")
print("="*60)
print(importance_df.to_string(index=False))

# Visualize feature importance
plt.figure(figsize=(12, 8))

# Create horizontal bar plot, one colour per feature
colors = plt.cm.viridis(np.linspace(0, 1, len(importance_df)))
bars = plt.barh(range(len(importance_df)), importance_df['Importance'], color=colors)

# Customize the plot
plt.xlabel('Feature Importance', fontsize=12, fontweight='bold')
plt.ylabel('Features', fontsize=12, fontweight='bold')
plt.title('🌟 Feature Importance - Random Forest Model', fontsize=16, fontweight='bold', pad=20)

# Add feature names to y-axis
plt.yticks(range(len(importance_df)), importance_df['Feature'])

# Add value labels just to the right of each bar
for bar, importance in zip(bars, importance_df['Importance']):
    plt.text(importance + 0.005, bar.get_y() + bar.get_height()/2, 
             f'{importance:.3f}', ha='left', va='center', fontweight='bold')

plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

# Analyze top features
print(f"\n🏆 Top 5 Most Important Features:")
print("="*40)
for idx, (_, row) in enumerate(importance_df.head().iterrows(), 1):
    percentage = row['Importance'] * 100
    print(f"{idx}. {row['Feature']}: {row['Importance']:.4f} ({percentage:.2f}%)")

# Feature importance insights
print(f"\n💡 Key Insights:")
print("="*40)
top_feature = importance_df.iloc[0]
print(f"🥇 Most Important: {top_feature['Feature']} ({top_feature['Importance']:.3f})")

# A feature's rank is its POSITION in the sorted frame. The index label is the
# feature's original position in numeric_features and must not be used here.
ranked_features = importance_df['Feature'].tolist()

if 'CGPA' in ranked_features:
    cgpa_rank = ranked_features.index('CGPA') + 1
    cgpa_importance = importance_df['Importance'].iloc[cgpa_rank - 1]
    print(f"📚 CGPA ranks #{cgpa_rank} with importance: {cgpa_importance:.3f}")

if 'Internships' in ranked_features:
    intern_rank = ranked_features.index('Internships') + 1
    intern_importance = importance_df['Importance'].iloc[intern_rank - 1]
    print(f"💼 Internships rank #{intern_rank} with importance: {intern_importance:.3f}")

# Running total of importance down the sorted list
importance_df['Cumulative_Importance'] = importance_df['Importance'].cumsum()
top_3_cumulative = importance_df.head(3)['Cumulative_Importance'].iloc[-1]
print(f"📊 Top 3 features explain {top_3_cumulative:.1%} of the model's decisions")

## 🔟 New Student Placement Prediction

Let's create a prediction system for new students entering the placement process!

In [None]:
# Create function to predict placement for new students
def predict_placement(cgpa, internships, projects, certifications, aptitude_score, 
                     soft_skills, ssc_marks, hsc_marks, extracurricular, placement_training):
    """
    Predict the placement outcome for a new student with the selected model.

    Relies on notebook globals set by earlier cells: ``best_model``,
    ``best_model_name_final``, ``scaler`` and ``numeric_features``.

    Args:
        cgpa, internships, projects, certifications, aptitude_score,
        soft_skills, ssc_marks, hsc_marks: numeric feature values, given in
            the same order as ``numeric_features``.
        extracurricular: 'Yes'/'No' (case-insensitive); anything other than
            'yes' is encoded as 0.
        placement_training: 'Yes'/'No', encoded the same way.

    Returns:
        tuple: ``(prediction, probability)`` where ``prediction`` is the
        predicted class label and ``probability`` is the model's
        ``predict_proba`` row for this student.
    """
    # Encode categorical variables (yes -> 1, everything else -> 0)
    extracurricular_encoded = 1 if extracurricular.lower() == 'yes' else 0
    placement_training_encoded = 1 if placement_training.lower() == 'yes' else 0

    # Single-row frame with the training column names and order. The scaler
    # and the tree-based models were fitted on a DataFrame with these columns,
    # so inference should present the same named columns, not a bare array.
    new_student = pd.DataFrame(
        [[cgpa, internships, projects, certifications, aptitude_score,
          soft_skills, ssc_marks, hsc_marks, extracurricular_encoded,
          placement_training_encoded]],
        columns=numeric_features,
    )

    # KNN and Logistic Regression were trained on standardised features,
    # so the new row must pass through the same fitted scaler first.
    if best_model_name_final in ['K-Nearest Neighbors', 'Logistic Regression']:
        model_input = scaler.transform(new_student)
    else:
        model_input = new_student

    prediction = best_model.predict(model_input)[0]
    probability = best_model.predict_proba(model_input)[0]

    return prediction, probability

def _report_prediction(details):
    """Predict for one student profile, print its summary lines, and return
    (prediction, probabilities, verdict text, confidence)."""
    prediction, probabilities = predict_placement(**details)
    verdict = "PLACED" if prediction == 1 else "NOT PLACED"
    # Confidence is the probability of whichever class was predicted
    confidence = probabilities[1] if prediction == 1 else probabilities[0]

    print(f"📊 Profile: CGPA: {details['cgpa']}, Internships: {details['internships']}, Projects: {details['projects']}")
    print(f"🧠 Aptitude: {details['aptitude_score']}, Soft Skills: {details['soft_skills']}")
    print(f"🎯 Prediction: {verdict} (Confidence: {confidence:.2%})")
    return prediction, probabilities, verdict, confidence


# Worked examples covering three different student profiles
print("🎓 Placement Predictions for New Students")
print("="*70)

# Example 1: High-performing student
print("\n👨‍🎓 Student Profile 1: High Performer")
print("-" * 40)
student1_details = dict(
    cgpa=8.5, internships=2, projects=3, certifications=3,
    aptitude_score=90, soft_skills=4.5, ssc_marks=85, hsc_marks=88,
    extracurricular='Yes', placement_training='Yes',
)
prediction1, prob1, result1, confidence1 = _report_prediction(student1_details)

# Example 2: Average student
print("\n👩‍🎓 Student Profile 2: Average Performer")
print("-" * 40)
student2_details = dict(
    cgpa=7.2, internships=1, projects=2, certifications=1,
    aptitude_score=75, soft_skills=4.0, ssc_marks=70, hsc_marks=72,
    extracurricular='No', placement_training='Yes',
)
prediction2, prob2, result2, confidence2 = _report_prediction(student2_details)

# Example 3: Below-average student
print("\n👨‍🎓 Student Profile 3: Below Average Performer")
print("-" * 40)
student3_details = dict(
    cgpa=6.8, internships=0, projects=1, certifications=0,
    aptitude_score=65, soft_skills=3.5, ssc_marks=60, hsc_marks=65,
    extracurricular='No', placement_training='No',
)
prediction3, prob3, result3, confidence3 = _report_prediction(student3_details)

# Closing banner: which model backs predict_placement() and its test accuracy
selected_rows = results_df[results_df['Model'] == best_model_name_final]
print(f"\n🔮 The prediction system is ready!")
print(f"🤖 Model Used: {best_model_name_final}")
print(f"📈 Model Accuracy: {selected_rows['Accuracy'].iloc[0]:.2%}")
print(f"✅ You can now use the predict_placement() function for any new student!")

## 🧩 Business Analytics Insights

Let's extract actionable business insights from our analysis to help improve placement rates!

In [None]:
# Generate comprehensive business insights from the raw (unencoded) data
print("💼 BUSINESS ANALYTICS INSIGHTS")
print("="*80)

# 1. CGPA Analysis: mean CGPA per outcome group
placed_cgpa = df[df['PlacementStatus'] == 'Placed']['CGPA'].mean()
not_placed_cgpa = df[df['PlacementStatus'] == 'NotPlaced']['CGPA'].mean()
cgpa_diff = placed_cgpa - not_placed_cgpa

print(f"\n📚 CGPA INSIGHTS:")
print(f"   • Average CGPA of placed students: {placed_cgpa:.2f}")
print(f"   • Average CGPA of not placed students: {not_placed_cgpa:.2f}")
print(f"   • CGPA difference: {cgpa_diff:.2f} points higher for placed students")

# CGPA threshold analysis (rates fall back to 0 when a group is empty)
high_cgpa_threshold = 8.0
high_cgpa_placed = len(df[(df['CGPA'] >= high_cgpa_threshold) & (df['PlacementStatus'] == 'Placed')])
high_cgpa_total = len(df[df['CGPA'] >= high_cgpa_threshold])
high_cgpa_rate = (high_cgpa_placed / high_cgpa_total) * 100 if high_cgpa_total > 0 else 0

low_cgpa_placed = len(df[(df['CGPA'] < high_cgpa_threshold) & (df['PlacementStatus'] == 'Placed')])
low_cgpa_total = len(df[df['CGPA'] < high_cgpa_threshold])
low_cgpa_rate = (low_cgpa_placed / low_cgpa_total) * 100 if low_cgpa_total > 0 else 0

print(f"   • Placement rate for CGPA ≥ {high_cgpa_threshold}: {high_cgpa_rate:.1f}%")
print(f"   • Placement rate for CGPA < {high_cgpa_threshold}: {low_cgpa_rate:.1f}%")

# 2. Internship Analysis: placement rate per internship count
print(f"\n💼 INTERNSHIP INSIGHTS:")
internship_analysis = df.groupby('Internships')['PlacementStatus'].apply(lambda x: (x == 'Placed').mean() * 100)
for internships, rate in internship_analysis.items():
    print(f"   • {internships} internships: {rate:.1f}% placement rate")

# Students with vs without internships
with_internships = df[df['Internships'] > 0]['PlacementStatus']
without_internships = df[df['Internships'] == 0]['PlacementStatus']

with_internship_rate = (with_internships == 'Placed').mean() * 100
without_internship_rate = (without_internships == 'Placed').mean() * 100
# Ratio of the two rates; 0 when the baseline rate is not positive
internship_multiplier = with_internship_rate / without_internship_rate if without_internship_rate > 0 else 0

print(f"   • With internships: {with_internship_rate:.1f}% placement rate")
print(f"   • Without internships: {without_internship_rate:.1f}% placement rate")
print(f"   • Internships increase placement chances by {internship_multiplier:.1f}x")

# 3. Certification Analysis: placement rate per certification count
print(f"\n🏆 CERTIFICATION INSIGHTS:")
cert_analysis = df.groupby('Workshops/Certifications')['PlacementStatus'].apply(lambda x: (x == 'Placed').mean() * 100)
for certs, rate in cert_analysis.items():
    print(f"   • {certs} certifications: {rate:.1f}% placement rate")

# 4. Aptitude Score Analysis
print(f"\n🧠 APTITUDE SCORE INSIGHTS:")
aptitude_threshold = 80
# Guard the divisions the same way as the CGPA rates above: if no student
# falls on one side of the threshold, report 0 instead of raising
# ZeroDivisionError.
high_aptitude_placed = len(df[(df['AptitudeTestScore'] >= aptitude_threshold) & (df['PlacementStatus'] == 'Placed')])
high_aptitude_total = len(df[df['AptitudeTestScore'] >= aptitude_threshold])
high_aptitude_rate = high_aptitude_placed / high_aptitude_total * 100 if high_aptitude_total > 0 else 0

low_aptitude_placed = len(df[(df['AptitudeTestScore'] < aptitude_threshold) & (df['PlacementStatus'] == 'Placed')])
low_aptitude_total = len(df[df['AptitudeTestScore'] < aptitude_threshold])
low_aptitude_rate = low_aptitude_placed / low_aptitude_total * 100 if low_aptitude_total > 0 else 0

print(f"   • High aptitude (≥{aptitude_threshold}): {high_aptitude_rate:.1f}% placement rate")
print(f"   • Low aptitude (<{aptitude_threshold}): {low_aptitude_rate:.1f}% placement rate")

# 5. Soft Skills Analysis
print(f"\n🗣️ SOFT SKILLS INSIGHTS:")
soft_skills_threshold = 4.0
high_soft_skills = df[df['SoftSkillsRating'] >= soft_skills_threshold]
low_soft_skills = df[df['SoftSkillsRating'] < soft_skills_threshold]

high_soft_rate = (high_soft_skills['PlacementStatus'] == 'Placed').mean() * 100
low_soft_rate = (low_soft_skills['PlacementStatus'] == 'Placed').mean() * 100

print(f"   • High soft skills (≥{soft_skills_threshold}): {high_soft_rate:.1f}% placement rate")
print(f"   • Low soft skills (<{soft_skills_threshold}): {low_soft_rate:.1f}% placement rate")

# 6. Combined Factors Analysis: students meeting all three criteria at once
print(f"\n🎯 COMBINED SUCCESS FACTORS:")
success_criteria = (
    (df['CGPA'] >= 8.0) & 
    (df['Internships'] >= 1) & 
    (df['Workshops/Certifications'] >= 2)
)
success_students = df[success_criteria]
success_rate = (success_students['PlacementStatus'] == 'Placed').mean() * 100

print(f"   • Students with CGPA≥8.0 + ≥1 internship + ≥2 certifications:")
print(f"     Placement rate: {success_rate:.1f}%")
print(f"     Total students meeting criteria: {len(success_students)}")

# 7. Key Recommendations
print(f"\n💡 KEY RECOMMENDATIONS:")
print("="*50)
print("   1. 🎓 Maintain CGPA above 8.0 for significantly higher placement chances")
print(f"   2. 💼 Complete at least 1 internship (increases chances by {internship_multiplier:.1f}x)")
print("   3. 🏆 Earn 2+ certifications to boost employability")
print(f"   4. 🧠 Score above {aptitude_threshold} in aptitude tests")
print(f"   5. 🗣️ Develop soft skills (target rating ≥{soft_skills_threshold})")
print("   6. 🎯 Focus on placement training programs")
print("   7. 🏃‍♂️ Participate in extracurricular activities")

## 📊 Results Visualization & Summary

Let's create a comprehensive summary of our findings with final visualizations!

In [None]:
def _placed_pct(frame):
    """Share of rows in `frame` whose PlacementStatus is 'Placed', in percent."""
    return (frame['PlacementStatus'] == 'Placed').mean() * 100


# Four-panel summary figure
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(18, 12))

# --- Panel 1: model accuracy, selected model highlighted -------------------
models_sorted = results_df.sort_values('Accuracy', ascending=True)
colors = ['#4ECDC4' if model == best_model_name_final else '#FF6B6B'
          for model in models_sorted['Model']]

bars1 = ax1.barh(models_sorted['Model'], models_sorted['Accuracy'], color=colors, alpha=0.8)
ax1.set_title('🏆 Model Accuracy Comparison', fontsize=14, fontweight='bold')
ax1.set_xlabel('Accuracy Score')
ax1.set_xlim(0, 1)

# Score written just right of each bar
for bar, acc in zip(bars1, models_sorted['Accuracy']):
    bar_middle = bar.get_y() + bar.get_height()/2
    ax1.text(acc + 0.01, bar_middle, f'{acc:.3f}',
             ha='left', va='center', fontweight='bold')

# --- Panel 2: six most important features ----------------------------------
top_features = importance_df.head(6)
feature_slots = range(len(top_features))
bars2 = ax2.bar(feature_slots, top_features['Importance'],
                color='#96CEB4', alpha=0.8)
ax2.set_title('🌟 Top 6 Feature Importance', fontsize=14, fontweight='bold')
ax2.set_xlabel('Features')
ax2.set_ylabel('Importance Score')
ax2.set_xticks(feature_slots)
ax2.set_xticklabels(top_features['Feature'], rotation=45, ha='right')

# Importance written above each bar
for bar, imp in zip(bars2, top_features['Importance']):
    bar_centre = bar.get_x() + bar.get_width()/2
    ax2.text(bar_centre, bar.get_height() + 0.005, f'{imp:.3f}',
             ha='center', va='bottom', fontweight='bold')

# --- Panel 3: placement rate for selected student groups -------------------
factors = ['Overall', 'CGPA≥8.0', '≥1 Internship', '≥2 Certifications', 'High Aptitude', 'All Combined']
rates = [
    _placed_pct(df),
    _placed_pct(df[df['CGPA'] >= 8.0]),
    _placed_pct(df[df['Internships'] >= 1]),
    _placed_pct(df[df['Workshops/Certifications'] >= 2]),
    _placed_pct(df[df['AptitudeTestScore'] >= 80]),
    success_rate
]

colors3 = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FECA57', '#FF9FF3']
bars3 = ax3.bar(factors, rates, color=colors3, alpha=0.8)
ax3.set_title('📈 Placement Rates by Key Factors', fontsize=14, fontweight='bold')
ax3.set_ylabel('Placement Rate (%)')
ax3.set_ylim(0, 100)
plt.setp(ax3.get_xticklabels(), rotation=45, ha='right')

# Rate written above each bar
for bar, rate in zip(bars3, rates):
    bar_centre = bar.get_x() + bar.get_width()/2
    ax3.text(bar_centre, bar.get_height() + 1, f'{rate:.1f}%',
             ha='center', va='bottom', fontweight='bold')

# --- Panel 4: hand-picked "ideal" profile vs the dataset average -----------
success_profile = {
    'CGPA': 8.5,
    'Internships': 2,
    'Certifications': 3,
    'Aptitude': 90,
    'Soft Skills': 4.5,
    'Projects': 3
}

average_profile = {
    'CGPA': df['CGPA'].mean(),
    'Internships': df['Internships'].mean(),
    'Certifications': df['Workshops/Certifications'].mean(),
    'Aptitude': df['AptitudeTestScore'].mean(),
    'Soft Skills': df['SoftSkillsRating'].mean(),
    'Projects': df['Projects'].mean()
}

# Divide each category by the larger of its two values, so the larger bar of
# every pair has height 1 and differently-scaled features share one axis
categories = list(success_profile.keys())
success_values = []
average_values = []
for cat in categories:
    scale = max(success_profile[cat], average_profile[cat])
    success_values.append(success_profile[cat] / scale)
    average_values.append(average_profile[cat] / scale)

# Grouped bars: the two profiles sit side by side around each tick
x_pos = np.arange(len(categories))
width = 0.35

ax4.bar(x_pos - width/2, success_values, width, label='Ideal Profile', color='#4ECDC4', alpha=0.8)
ax4.bar(x_pos + width/2, average_values, width, label='Average Student', color='#FF6B6B', alpha=0.8)
ax4.set_title('👨‍🎓 Ideal vs Average Student Profile', fontsize=14, fontweight='bold')
ax4.set_ylabel('Normalized Score')
ax4.set_xticks(x_pos)
ax4.set_xticklabels(categories, rotation=45, ha='right')
ax4.legend()
ax4.set_ylim(0, 1.2)

plt.tight_layout()
plt.show()

# Final Project Summary
banner_rule = "="*80
print("\n" + banner_rule)
print("🎯 COLLEGE PLACEMENT PREDICTION PROJECT SUMMARY")
print(banner_rule)

# Metrics of the selected model, looked up once from the comparison table
selected_rows = results_df[results_df['Model'] == best_model_name_final]
selected_accuracy = selected_rows['Accuracy'].iloc[0]
selected_f1 = selected_rows['F1-Score'].iloc[0]

print(f"\n📊 Dataset Statistics:")
print(f"   • Total Students: {len(df):,}")
print(f"   • Features Analyzed: {len(numeric_features)}")
print(f"   • Overall Placement Rate: {(df['PlacementStatus'] == 'Placed').mean():.1%}")

print(f"\n🤖 Machine Learning Results:")
print(f"   • Best Model: {best_model_name_final}")
print(f"   • Model Accuracy: {selected_accuracy:.1%}")
print(f"   • F1-Score: {selected_f1:.3f}")

# First three rows of the importance table, in its current order
print(f"\n🔍 Key Success Factors:")
for position in range(3):
    factor = importance_df.iloc[position]
    print(f"   {position + 1}. {factor['Feature']}: {factor['Importance']:.3f} importance")

print(f"\n💡 Business Impact:")
print(f"   • High-performing students (CGPA≥8.0 + internships + certs): {success_rate:.1f}% placement rate")
print(f"   • Internships increase placement chances by {internship_multiplier:.1f}x")
print(f"   • Model can predict placements with {selected_accuracy:.1%} accuracy")

print(f"\n✅ Project Status: COMPLETED SUCCESSFULLY!")
print(f"🚀 Ready for deployment and real-world application!")
print(banner_rule)

## 💡 Bonus: Streamlit Dashboard Code (Optional)

Here's some bonus Streamlit app code that you can save and run for interactive predictions!

In [None]:
# Create Streamlit dashboard code
#
# `streamlit_code` holds the full source of a standalone Streamlit app as a
# plain (non-f) triple-quoted string, so the `{...}` placeholders inside it are
# kept verbatim and only evaluated when the generated script itself runs.
#
# NOTE: the generated app does not use the model trained in this notebook. Its
# "prediction" is a mock: a fixed weighted sum of CGPA, internships,
# certifications, aptitude and soft-skills inputs, plus small bonuses for
# extracurriculars and placement training, capped at 0.95. The projects, SSC
# and HSC inputs are only displayed, and the pickle / RandomForestClassifier /
# matplotlib / seaborn imports in the generated script are not referenced.
streamlit_code = '''
import streamlit as st
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Page configuration
st.set_page_config(
    page_title="🎓 College Placement Predictor", 
    page_icon="🎯",
    layout="wide"
)

# Title and description
st.title("🎓 College Placement Prediction Dashboard")
st.markdown("### Predict student placement probability using machine learning!")

# Sidebar for input
st.sidebar.header("📝 Student Information")

# Input fields
cgpa = st.sidebar.slider("CGPA", 0.0, 10.0, 7.5, 0.1)
internships = st.sidebar.selectbox("Number of Internships", [0, 1, 2, 3, 4])
projects = st.sidebar.selectbox("Number of Projects", [0, 1, 2, 3, 4, 5])
certifications = st.sidebar.selectbox("Workshops/Certifications", [0, 1, 2, 3, 4, 5])
aptitude = st.sidebar.slider("Aptitude Test Score", 0, 100, 75)
soft_skills = st.sidebar.slider("Soft Skills Rating", 0.0, 5.0, 4.0, 0.1)
ssc_marks = st.sidebar.slider("SSC Marks", 0, 100, 75)
hsc_marks = st.sidebar.slider("HSC Marks", 0, 100, 75)
extracurricular = st.sidebar.selectbox("Extracurricular Activities", ["No", "Yes"])
placement_training = st.sidebar.selectbox("Placement Training", ["No", "Yes"])

# Create prediction button
if st.sidebar.button("🎯 Predict Placement", type="primary"):
    # Here you would load your trained model and make predictions
    # For demonstration, we'll create a mock prediction
    
    # Calculate a simple score based on inputs
    score = (cgpa/10 * 0.3 + 
             internships/4 * 0.2 + 
             certifications/5 * 0.2 + 
             aptitude/100 * 0.15 + 
             soft_skills/5 * 0.15)
    
    # Add bonuses for extracurricular and training
    if extracurricular == "Yes":
        score += 0.05
    if placement_training == "Yes":
        score += 0.05
    
    # Convert to probability
    probability = min(score, 0.95)  # Cap at 95%
    
    # Display results
    col1, col2, col3 = st.columns(3)
    
    with col1:
        if probability > 0.7:
            st.success(f"🎉 HIGH CHANCE OF PLACEMENT")
            st.metric("Placement Probability", f"{probability:.1%}")
        elif probability > 0.4:
            st.warning(f"⚠️ MODERATE CHANCE")
            st.metric("Placement Probability", f"{probability:.1%}")
        else:
            st.error(f"❌ LOW CHANCE")
            st.metric("Placement Probability", f"{probability:.1%}")
    
    with col2:
        st.metric("Overall Score", f"{score:.2f}")
        st.metric("CGPA Impact", f"{cgpa/10*0.3:.2f}")
    
    with col3:
        st.metric("Experience Score", f"{(internships/4*0.2 + certifications/5*0.2):.2f}")
        st.metric("Skills Score", f"{(aptitude/100*0.15 + soft_skills/5*0.15):.2f}")

# Display student profile
st.header("📊 Student Profile Summary")
col1, col2 = st.columns(2)

with col1:
    st.subheader("Academic Performance")
    st.write(f"**CGPA:** {cgpa}/10")
    st.write(f"**SSC Marks:** {ssc_marks}%")
    st.write(f"**HSC Marks:** {hsc_marks}%")
    st.write(f"**Aptitude Score:** {aptitude}/100")

with col2:
    st.subheader("Experience & Skills")
    st.write(f"**Internships:** {internships}")
    st.write(f"**Projects:** {projects}")
    st.write(f"**Certifications:** {certifications}")
    st.write(f"**Soft Skills:** {soft_skills}/5")
    st.write(f"**Extracurricular:** {extracurricular}")
    st.write(f"**Placement Training:** {placement_training}")

# Recommendations
st.header("💡 Improvement Recommendations")
recommendations = []

if cgpa < 7.5:
    recommendations.append("📚 Focus on improving CGPA (target: >7.5)")
if internships == 0:
    recommendations.append("💼 Complete at least 1 internship")
if certifications < 2:
    recommendations.append("🏆 Earn more certifications (target: 2+)")
if aptitude < 75:
    recommendations.append("🧠 Improve aptitude test scores (target: >75)")
if soft_skills < 4.0:
    recommendations.append("🗣️ Develop soft skills (target: >4.0)")
if extracurricular == "No":
    recommendations.append("🏃‍♂️ Participate in extracurricular activities")
if placement_training == "No":
    recommendations.append("🎯 Enroll in placement training programs")

if recommendations:
    for rec in recommendations:
        st.write(f"• {rec}")
else:
    st.success("🌟 Excellent profile! Keep up the great work!")

# Footer
st.markdown("---")
st.markdown("*Built with ❤️ using Streamlit and Machine Learning*")
'''

# Show the generated app source together with instructions for running it.
print("💾 Streamlit Dashboard Code:")
print("="*50)
print("Save the following code as 'placement_dashboard.py' and run with:")
print("streamlit run placement_dashboard.py")
print("\n" + streamlit_code)

# Save the Streamlit code to a file in the current working directory.
# - A relative path keeps the notebook runnable on any machine (the dataset is
#   likewise read from the relative path 'placementdata.csv') and matches the
#   file name announced in the messages below; the previous absolute path only
#   existed on the original author's computer.
# - The encoding is set explicitly because the app source contains emoji,
#   which platform-default encodings such as cp1252 cannot represent and would
#   reject with UnicodeEncodeError.
dashboard_path = 'placement_dashboard.py'
with open(dashboard_path, 'w', encoding='utf-8') as f:
    f.write(streamlit_code)

print("✅ Streamlit dashboard code saved as 'placement_dashboard.py'!")
print("🚀 To run the dashboard: streamlit run placement_dashboard.py")