In [1]:
import numpy as np
import pandas as pd
import sklearn
import pickle
import sys

print("numpy version: ", np.__version__)
print("pandas version: ", pd.__version__)
print("sklearn version: ", sklearn.__version__)
print("Python version: ", sys.version)


numpy version:  1.26.4
pandas version:  2.2.3
sklearn version:  1.4.2
Python version:  3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:17:27) [MSC v.1929 64 bit (AMD64)]


In [2]:
df = pd.read_csv('students_placement.csv')

In [3]:
df.shape

(5000, 6)

In [4]:
df.sample(5)

Unnamed: 0,iq,cgpa,10th_marks,12th_marks,communication_skills,placed
3490,95,8.77,71.74,57.21,6.9,1
4646,79,8.0,90.29,66.36,5.8,1
3491,97,6.53,87.59,74.93,3.5,0
1156,90,7.34,75.97,66.89,6.1,0
1302,97,8.46,72.77,64.19,9.4,1


In [5]:
X = df.drop(columns=['placed'])
y = df['placed']

In [6]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [7]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Create Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_rf = grid_search.best_estimator_

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [9]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Fit the default RandomForest model
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Use the best parameters found by GridSearchCV
y_pred_rf = best_rf.predict(X_test)

# Accuracy and classification reports
print("----"*20,"\nAccuracy RF (Default Params): ", accuracy_score(y_test, y_pred), "\n","----" * 20)
print("Accuracy RF (Best Params): ", accuracy_score(y_test, y_pred_rf), "\n","----" * 20)
print("\nClassification Report (Best Params):\n", classification_report(y_test, y_pred_rf), "----" * 20)
print("\nClassification Report (Default Params):\n", classification_report(y_test, y_pred), "----" * 20)


-------------------------------------------------------------------------------- 
Accuracy RF (Default Params):  0.544 
 --------------------------------------------------------------------------------
Accuracy RF (Best Params):  0.558 
 --------------------------------------------------------------------------------

Classification Report (Best Params):
               precision    recall  f1-score   support

           0       0.49      0.34      0.40       436
           1       0.59      0.73      0.65       564

    accuracy                           0.56      1000
   macro avg       0.54      0.53      0.53      1000
weighted avg       0.54      0.56      0.54      1000
 --------------------------------------------------------------------------------

Classification Report (Default Params):
               precision    recall  f1-score   support

           0       0.47      0.39      0.43       436
           1       0.58      0.66      0.62       564

    accuracy                

In [10]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(best_rf, X, y, cv=5, scoring='accuracy')
print("Cross-validation scores: ", scores)
print("Mean cross-validation score: ", scores.mean())

Cross-validation scores:  [0.561 0.562 0.563 0.576 0.567]
Mean cross-validation score:  0.5658


In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test,y_pred)

0.525

In [12]:
# from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, classification_report
# from sklearn.neighbors import KNeighborsClassifier

# # Split data into training and testing
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Random Forest Classifier with expanded parameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['auto', 'sqrt', 'log2'],  # Added
#     'max_samples': [None, 0.5, 0.8]  # Added
# }

# # Grid Search with Cross-validation for Random Forest
# rf = RandomForestClassifier(random_state=42)
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
# grid_search.fit(X_train, y_train)

# # Get best parameters and fit model
# best_rf = grid_search.best_estimator_
# y_pred_rf = best_rf.predict(X_test)

# # Accuracy and Classification Report for Random Forest
# print("Random Forest - Best Params")
# print("Accuracy: ", accuracy_score(y_test, y_pred_rf))
# print("Classification Report:\n", classification_report(y_test, y_pred_rf))

# # K-Nearest Neighbors (KNN) Model with Hyperparameter Tuning
# knn = KNeighborsClassifier(n_neighbors=5, weights='distance', metric='euclidean')  # Example tuning
# knn.fit(X_train, y_train)
# y_pred_knn = knn.predict(X_test)

# # Accuracy for KNN
# print("\nKNN Accuracy: ", accuracy_score(y_test, y_pred_knn))
# print("KNN Classification Report:\n", classification_report(y_test, y_pred_knn))

# # Cross-validation for Random Forest
# cv_scores = cross_val_score(best_rf, X, y, cv=5, scoring='accuracy')
# print("\nCross-validation scores for Random Forest: ", cv_scores)
# print("Mean Cross-validation score: ", cv_scores.mean())


In [13]:
import pickle 
with open('model.pkl', 'wb') as file:
    pickle.dump(knn, file)