In [4]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import pickle

In [5]:
df = pd.read_csv('E:\CCBST\Projects\StudentPlacement\Dataset\Placement_Data_Full_Class.csv')

In [6]:
df.isnull().sum()

sl_no              0
gender             0
ssc_p              0
ssc_b              0
hsc_p              0
hsc_b              0
hsc_s              0
degree_p           0
degree_t           0
workex             0
etest_p            0
specialisation     0
mba_p              0
status             0
salary            67
dtype: int64

In [7]:
df['salary'].fillna(0,inplace=True)

In [8]:
df.drop(columns=['sl_no'],inplace=True)

In [9]:
df.head()

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,0.0
4,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [10]:
cat_col = ['gender','hsc_s','degree_t','workex','specialisation','status']

In [11]:
lab_enc = LabelEncoder()

In [12]:
for col in cat_col:
    df[col] = lab_enc.fit_transform(df[col])
df.head()

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,67.0,Others,91.0,Others,1,58.0,2,0,55.0,1,58.8,1,270000.0
1,1,79.33,Central,78.33,Others,2,77.48,2,1,86.5,0,66.28,1,200000.0
2,1,65.0,Central,68.0,Central,0,64.0,0,0,75.0,0,57.8,1,250000.0
3,1,56.0,Central,52.0,Central,2,52.0,2,0,66.0,1,59.43,0,0.0
4,1,85.8,Central,73.6,Central,1,73.3,0,0,96.8,0,55.5,1,425000.0


In [13]:
X = df.drop(['status', 'ssc_b', 'hsc_b', 'salary'], axis=1)
y = df['status']
X

Unnamed: 0,gender,ssc_p,hsc_p,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p
0,1,67.00,91.00,1,58.00,2,0,55.0,1,58.80
1,1,79.33,78.33,2,77.48,2,1,86.5,0,66.28
2,1,65.00,68.00,0,64.00,0,0,75.0,0,57.80
3,1,56.00,52.00,2,52.00,2,0,66.0,1,59.43
4,1,85.80,73.60,1,73.30,0,0,96.8,0,55.50
...,...,...,...,...,...,...,...,...,...,...
210,1,80.60,82.00,1,77.60,0,0,91.0,0,74.49
211,1,58.00,60.00,2,72.00,2,0,74.0,0,53.62
212,1,67.00,67.00,1,73.00,0,1,59.0,0,69.72
213,0,74.00,66.00,1,58.00,0,0,70.0,1,60.23


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

**Defining all Models**

In [15]:
log_reg = LogisticRegression(solver = 'liblinear')
knn_clf = KNeighborsClassifier(n_neighbors = 10, weights = 'distance')

**Train all Models**

In [22]:
log_reg.fit(X_train, y_train)
y_pred_test_log = log_reg.predict(X_test)
y_pred_train_log = log_reg.predict(X_train)
knn_clf.fit(X_train, y_train)
y_pred_test_knn = knn_clf.predict(X_test)
y_pred_train_knn = knn_clf.predict(X_train)

**Accuracy Scores**

In [23]:
print("Accuracy Score for Logistic Regression : ", accuracy_score(y_test, y_pred_test_log))

Accuracy Score for Logistic Regression :  0.9259259259259259


**Hyper Tunning with GridSearchCV**

In [24]:
# k-Nearest Neighbors (k-NN) GridSearchCV

parameter = {'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
            'metric': ['euclidean', 'manhattan'],
            'weights': ['uniform', 'distance'],
            'p': [1, 2]}

grid_knn = GridSearchCV(knn_clf, param_grid = parameter, cv=5, verbose=2, n_jobs=-1)
grid_knn.fit(X_train, y_train)
y_pred_knn_grid = grid_knn.predict(X_test)

print("Best Parameters : ", grid_knn.best_params_)
print("Best Score : ", grid_knn.best_score_)
print("Accuracy : ", accuracy_score(y_test, y_pred_knn_grid))

Fitting 5 folds for each of 56 candidates, totalling 280 fits
Best Parameters :  {'metric': 'manhattan', 'n_neighbors': 13, 'p': 1, 'weights': 'uniform'}
Best Score :  0.8636363636363636
Accuracy :  0.7962962962962963


In [25]:
pickle.dump(log_reg,open('logregression_model.pkl', 'wb' ))
pickle.dump(knn_clf,open('knn_model.pkl', 'wb' ))