In [129]:
import numpy as np
import pandas as pd
from typing import Dict, Tuple
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.svm import SVC, NuSVC, LinearSVC, OneClassSVM, SVR, NuSVR, LinearSVR
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
sns.set(style="ticks")

In [3]:
data = pd.read_csv('Placement.csv')
data.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [4]:
data.isnull().sum()

sl_no              0
gender             0
ssc_p              0
ssc_b              0
hsc_p              0
hsc_b              0
hsc_s              0
degree_p           0
degree_t           0
workex             0
etest_p            0
specialisation     0
mba_p              0
status             0
salary            67
dtype: int64

In [14]:
from sklearn.preprocessing import LabelEncoder
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer.fit(features[:,:])
encode = LabelEncoder()
features = data.iloc[:,:].values

features[:,:] = imputer.fit_transform(features[:,:])
features[:,1] = encode.fit_transform(features[:,1])
features[:,3] = encode.fit_transform(features[:,3])
features[:,5] = encode.fit_transform(features[:,5])
features[:,6] = encode.fit_transform(features[:,6])
features[:,8] = encode.fit_transform(features[:,8])
features[:,9] = encode.fit_transform(features[:,9])
features[:,11] = encode.fit_transform(features[:,11])
features[:,13] = encode.fit_transform(features[:,13])
df = pd.DataFrame(features, columns = ['sl_no','gender','ssc_p','ssc_b','hsc_p','hsc_b','hsc_s','degree_p','degree_t','workex','etest_p','specialisation','mba_p','status','salary'])
df.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,1,67.0,1,91.0,1,1,58.0,2,0,55.0,1,58.8,1,270000
1,2,1,79.33,0,78.33,1,2,77.48,2,1,86.5,0,66.28,1,200000
2,3,1,65.0,0,68.0,0,0,64.0,0,0,75.0,0,57.8,1,250000
3,4,1,56.0,0,52.0,0,2,52.0,2,0,66.0,1,59.43,0,300000
4,5,1,85.8,0,73.6,0,1,73.3,0,0,96.8,0,55.5,1,425000


In [110]:
#из object в числовые типы 
df1 = df.apply(pd.to_numeric, errors='coerce')
df1.dtypes

sl_no               int64
gender              int64
ssc_p             float64
ssc_b               int64
hsc_p             float64
hsc_b               int64
hsc_s               int64
degree_p          float64
degree_t            int64
workex              int64
etest_p           float64
specialisation      int64
mba_p             float64
status              int64
salary            float64
dtype: object

In [111]:
#целевой признак salary предназначен для регрессии. построим на его основе признак для классификации.
def regr_to_class(y: float) -> str:
    if y <= df1['salary'].mean():
        result = '0'
    else:
        result = '1'       
    return result 
df1['y_class'] = \
df1.apply(lambda row: regr_to_class(row['salary']),axis=1)
df1 = df1.drop(['salary'], axis=1)
df1.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,y_class
0,1,1,67.0,1,91.0,1,1,58.0,2,0,55.0,1,58.8,1,0
1,2,1,79.33,0,78.33,1,2,77.48,2,1,86.5,0,66.28,1,0
2,3,1,65.0,0,68.0,0,0,64.0,0,0,75.0,0,57.8,1,0
3,4,1,56.0,0,52.0,0,2,52.0,2,0,66.0,1,59.43,0,1
4,5,1,85.8,0,73.6,0,1,73.3,0,0,96.8,0,55.5,1,1


In [112]:
# разделим на обучающую и тестовую
features = df1.drop(['y_class'], axis=1)
target_tmp = df1['y_class']
target = pd.DataFrame({'y_class':target_tmp.index, 'y_class':target_tmp.values})
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3,random_state=1 )

# xgboost

In [115]:
from xgboost.sklearn import XGBClassifier
cl1 = XGBClassifier().fit(X_train, y_train)
target1 = cl1.predict(X_test)
target1

array(['0', '1', '1', '1', '1', '1', '0', '0', '0', '1', '1', '1', '1',
       '1', '1', '0', '0', '1', '0', '0', '1', '1', '0', '1', '1', '0',
       '0', '1', '1', '0', '1', '1', '1', '0', '1', '1', '1', '0', '1',
       '1', '0', '1', '0', '1', '0', '0', '0', '0', '0', '1', '0', '1',
       '1', '1', '1', '0', '1', '0', '1', '0', '0', '0', '1', '0', '1'],
      dtype=object)

In [117]:
accuracy = accuracy_score(y_test, target1)
accuracy

0.7076923076923077

In [119]:
precision = precision_score(y_test, target1, average='micro')
precision

0.7076923076923077

# Bagging

In [121]:
from sklearn.ensemble import BaggingClassifier
y = np.ravel(y_train)
cl2 = BaggingClassifier().fit(X_train, y)
target2 = cl2.predict(X_test)
target2

array(['0', '1', '1', '1', '1', '1', '0', '0', '1', '1', '1', '1', '1',
       '0', '1', '0', '0', '1', '0', '0', '1', '0', '1', '1', '1', '0',
       '0', '1', '1', '0', '1', '1', '1', '0', '1', '1', '1', '0', '1',
       '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '1', '0', '1',
       '1', '0', '1', '0', '1', '0', '1', '0', '0', '0', '1', '0', '1'],
      dtype=object)

In [122]:
accuracy = accuracy_score(y_test, target2)
accuracy

0.7538461538461538

In [124]:
precision = precision_score(y_test, target2, average='micro')
precision
#метод Bagging показал более точные результаты

0.7538461538461538