<a href="https://colab.research.google.com/github/mayarachew/data_introduction/blob/main/fundamento_de_dados.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [308]:
import pandas as pd
import numpy as np

# Normalization
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Classification
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.naive_bayes import ComplementNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Functions

In [309]:
def clf_best_parameters(search_type, classifier, all_parameters, X_train, y_train, X_test, y_test):
  tuned_parameters = all_parameters

  scores = ['f1']
  clf = []

  for score in scores:
      print("Tuning hyper-parameters for %s" % score)
      print()

      print("Best parameters set found on development set:")
      print()

      clf = search_type(classifier(), all_parameters, scoring='%s_macro' % score)
      clf.fit(X_train, y_train)
      print(clf.best_params_)
      print()
      
      print("Detailed classification report:")
      y_true, y_pred = y_test, clf.predict(X_test)
      print(classification_report(y_true, y_pred, zero_division=1))
      
      return clf.best_params_
      

# Read data

In [310]:
rawData = pd.read_csv('students2.csv')

In [311]:
print(rawData.head())
print(rawData.shape)

  Performance Gender  ... Father_occupation Mother_occupation
0   Excellent   male  ...            DOCTOR            OTHERS
1   Excellent   male  ...    SCHOOL_TEACHER        HOUSE_WIFE
2   Excellent   male  ...          BUSINESS        HOUSE_WIFE
3   Excellent   male  ...    SCHOOL_TEACHER    SCHOOL_TEACHER
4   Excellent   male  ...   COLLEGE_TEACHER        HOUSE_WIFE

[5 rows x 12 columns]
(666, 12)


Remove NaN values

In [312]:
rawData = rawData.replace('?', np.NaN)
rawData = rawData.dropna(axis=0)

View data shape

In [313]:
print(rawData.shape)

(666, 12)


View data attributes

In [314]:
print(rawData.columns)

Index(['Performance', 'Gender', 'Caste', 'coaching', 'time',
       'Class_ten_education', 'twelve_education', 'medium',
       'Class_ X_Percentage', 'Class_ XII_Percentage', 'Father_occupation',
       'Mother_occupation'],
      dtype='object')


In [315]:
def column_standardization(column):
  return column.lower().replace(' ','')

rawData.columns = map(column_standardization, rawData.columns)

attributes = rawData.columns
print(attributes)

Index(['performance', 'gender', 'caste', 'coaching', 'time',
       'class_ten_education', 'twelve_education', 'medium',
       'class_x_percentage', 'class_xii_percentage', 'father_occupation',
       'mother_occupation'],
      dtype='object')


In [316]:
print(rawData.value_counts())

performance  gender  caste    coaching  time  class_ten_education  twelve_education  medium   class_x_percentage  class_xii_percentage  father_occupation  mother_occupation
Vg           male    General  WA        TWO   CBSE                 CBSE              ENGLISH  Excellent           Excellent             OTHERS             HOUSE_WIFE           4
             female  General  WA        TWO   SEBA                 AHSEC             ENGLISH  Excellent           Excellent             OTHERS             HOUSE_WIFE           4
Average      male    ST       WA        TWO   CBSE                 CBSE              ENGLISH  Excellent           Vg                    OTHERS             HOUSE_WIFE           3
Good         male    General  NO        ONE   CBSE                 CBSE              ENGLISH  Excellent           Excellent             OTHERS             HOUSE_WIFE           3
Vg           female  General  WA        ONE   CBSE                 CBSE              ENGLISH  Excellent           E

In [317]:
perf_label = rawData['performance']
print(perf_label)

0      Excellent
1      Excellent
2      Excellent
3      Excellent
4      Excellent
         ...    
661      Average
662      Average
663      Average
664      Average
665      Average
Name: performance, Length: 666, dtype: object


View values in attribute time

In [318]:
print(rawData['performance'].value_counts())

Good         210
Vg           198
Average      157
Excellent    101
Name: performance, dtype: int64


In [319]:
print(rawData['gender'].value_counts())

male      355
female    311
Name: gender, dtype: int64


In [320]:
print(rawData['caste'].value_counts())

General    329
OBC        162
ST         108
SC          67
Name: caste, dtype: int64


In [321]:
print(rawData['coaching'].value_counts())

WA    449
NO    150
OA     67
Name: coaching, dtype: int64


In [322]:
print(rawData['time'].value_counts())

TWO      368
ONE      199
THREE     86
FOUR      11
SEVEN      1
FIVE       1
Name: time, dtype: int64


In [323]:
print(rawData['class_ten_education'].value_counts())

SEBA      396
CBSE      249
OTHERS     21
Name: class_ten_education, dtype: int64


In [324]:
print(rawData['twelve_education'].value_counts())

AHSEC     368
CBSE      290
OTHERS      8
Name: twelve_education, dtype: int64


In [325]:
print(rawData['medium'].value_counts())

ENGLISH     536
OTHERS       74
ASSAMESE     56
Name: medium, dtype: int64


In [326]:
print(rawData['class_x_percentage'].value_counts())

Excellent    511
Vg           101
Good          41
Average       13
Name: class_x_percentage, dtype: int64


In [327]:
print(rawData['class_xii_percentage'].value_counts())

Excellent    398
Vg           181
Good          75
Average       12
Name: class_xii_percentage, dtype: int64


In [328]:
print(rawData['father_occupation'].value_counts())

OTHERS             277
SCHOOL_TEACHER     109
BUSINESS           103
DOCTOR              55
ENGINEER            45
CULTIVATOR          27
COLLEGE_TEACHER     27
BANK_OFFICIAL       23
Name: father_occupation, dtype: int64


In [329]:
print(rawData['mother_occupation'].value_counts())

HOUSE_WIFE         442
SCHOOL_TEACHER     108
OTHERS              72
COLLEGE_TEACHER     20
DOCTOR              13
BANK_OFFICIAL        4
BUSINESS             3
ENGINEER             3
CULTIVATOR           1
Name: mother_occupation, dtype: int64


# Data processing

## Fix time column

Convert attributte values to integer

In [330]:
# Convert object attribute to categorial attribute
rawData['time'] = pd.Categorical(rawData['time'])

# Change categorical attributes to numbers
rawData['time'] = rawData['time'].cat.codes

print(rawData.head())

  performance gender  ... father_occupation mother_occupation
0   Excellent   male  ...            DOCTOR            OTHERS
1   Excellent   male  ...    SCHOOL_TEACHER        HOUSE_WIFE
2   Excellent   male  ...          BUSINESS        HOUSE_WIFE
3   Excellent   male  ...    SCHOOL_TEACHER    SCHOOL_TEACHER
4   Excellent   male  ...   COLLEGE_TEACHER        HOUSE_WIFE

[5 rows x 12 columns]


View table

In [331]:
rawData

Unnamed: 0,performance,gender,caste,coaching,time,class_ten_education,twelve_education,medium,class_x_percentage,class_xii_percentage,father_occupation,mother_occupation
0,Excellent,male,General,NO,2,SEBA,AHSEC,ENGLISH,Excellent,Excellent,DOCTOR,OTHERS
1,Excellent,male,OBC,WA,5,SEBA,AHSEC,OTHERS,Excellent,Excellent,SCHOOL_TEACHER,HOUSE_WIFE
2,Excellent,male,OBC,OA,5,OTHERS,CBSE,ENGLISH,Excellent,Excellent,BUSINESS,HOUSE_WIFE
3,Excellent,male,General,WA,2,SEBA,AHSEC,OTHERS,Excellent,Excellent,SCHOOL_TEACHER,SCHOOL_TEACHER
4,Excellent,male,General,OA,5,SEBA,CBSE,ENGLISH,Excellent,Excellent,COLLEGE_TEACHER,HOUSE_WIFE
...,...,...,...,...,...,...,...,...,...,...,...,...
661,Average,female,ST,WA,2,SEBA,AHSEC,ENGLISH,Good,Vg,OTHERS,HOUSE_WIFE
662,Average,male,ST,WA,4,SEBA,AHSEC,ENGLISH,Vg,Good,CULTIVATOR,HOUSE_WIFE
663,Average,male,ST,WA,5,SEBA,CBSE,ENGLISH,Good,Vg,OTHERS,SCHOOL_TEACHER
664,Average,male,ST,WA,4,SEBA,AHSEC,ENGLISH,Good,Good,SCHOOL_TEACHER,HOUSE_WIFE


## Fix performance column

In [332]:
rawData['performance'] = rawData['performance'].replace(['Good', 'Vg', 'Average', 'Excellent'], [1,2,3,4])

print(rawData['performance'].value_counts())

1    210
2    198
3    157
4    101
Name: performance, dtype: int64


In [333]:
rawData

Unnamed: 0,performance,gender,caste,coaching,time,class_ten_education,twelve_education,medium,class_x_percentage,class_xii_percentage,father_occupation,mother_occupation
0,4,male,General,NO,2,SEBA,AHSEC,ENGLISH,Excellent,Excellent,DOCTOR,OTHERS
1,4,male,OBC,WA,5,SEBA,AHSEC,OTHERS,Excellent,Excellent,SCHOOL_TEACHER,HOUSE_WIFE
2,4,male,OBC,OA,5,OTHERS,CBSE,ENGLISH,Excellent,Excellent,BUSINESS,HOUSE_WIFE
3,4,male,General,WA,2,SEBA,AHSEC,OTHERS,Excellent,Excellent,SCHOOL_TEACHER,SCHOOL_TEACHER
4,4,male,General,OA,5,SEBA,CBSE,ENGLISH,Excellent,Excellent,COLLEGE_TEACHER,HOUSE_WIFE
...,...,...,...,...,...,...,...,...,...,...,...,...
661,3,female,ST,WA,2,SEBA,AHSEC,ENGLISH,Good,Vg,OTHERS,HOUSE_WIFE
662,3,male,ST,WA,4,SEBA,AHSEC,ENGLISH,Vg,Good,CULTIVATOR,HOUSE_WIFE
663,3,male,ST,WA,5,SEBA,CBSE,ENGLISH,Good,Vg,OTHERS,SCHOOL_TEACHER
664,3,male,ST,WA,4,SEBA,AHSEC,ENGLISH,Good,Good,SCHOOL_TEACHER,HOUSE_WIFE


## Fix class_x_percentage column

In [334]:
rawData['class_x_percentage'] = rawData['class_x_percentage'].replace(['Good', 'Vg', 'Average', 'Excellent'], [1,2,3,4])

print(rawData['class_x_percentage'].value_counts())

4    511
2    101
1     41
3     13
Name: class_x_percentage, dtype: int64


In [335]:
rawData

Unnamed: 0,performance,gender,caste,coaching,time,class_ten_education,twelve_education,medium,class_x_percentage,class_xii_percentage,father_occupation,mother_occupation
0,4,male,General,NO,2,SEBA,AHSEC,ENGLISH,4,Excellent,DOCTOR,OTHERS
1,4,male,OBC,WA,5,SEBA,AHSEC,OTHERS,4,Excellent,SCHOOL_TEACHER,HOUSE_WIFE
2,4,male,OBC,OA,5,OTHERS,CBSE,ENGLISH,4,Excellent,BUSINESS,HOUSE_WIFE
3,4,male,General,WA,2,SEBA,AHSEC,OTHERS,4,Excellent,SCHOOL_TEACHER,SCHOOL_TEACHER
4,4,male,General,OA,5,SEBA,CBSE,ENGLISH,4,Excellent,COLLEGE_TEACHER,HOUSE_WIFE
...,...,...,...,...,...,...,...,...,...,...,...,...
661,3,female,ST,WA,2,SEBA,AHSEC,ENGLISH,1,Vg,OTHERS,HOUSE_WIFE
662,3,male,ST,WA,4,SEBA,AHSEC,ENGLISH,2,Good,CULTIVATOR,HOUSE_WIFE
663,3,male,ST,WA,5,SEBA,CBSE,ENGLISH,1,Vg,OTHERS,SCHOOL_TEACHER
664,3,male,ST,WA,4,SEBA,AHSEC,ENGLISH,1,Good,SCHOOL_TEACHER,HOUSE_WIFE


## Fix class_xii_percentage

In [336]:
rawData['class_xii_percentage'] = rawData['class_xii_percentage'].replace(['Good', 'Vg', 'Average', 'Excellent'], [1,2,3,4])

print(rawData['class_xii_percentage'].value_counts())

4    398
2    181
1     75
3     12
Name: class_xii_percentage, dtype: int64


In [337]:
rawData

Unnamed: 0,performance,gender,caste,coaching,time,class_ten_education,twelve_education,medium,class_x_percentage,class_xii_percentage,father_occupation,mother_occupation
0,4,male,General,NO,2,SEBA,AHSEC,ENGLISH,4,4,DOCTOR,OTHERS
1,4,male,OBC,WA,5,SEBA,AHSEC,OTHERS,4,4,SCHOOL_TEACHER,HOUSE_WIFE
2,4,male,OBC,OA,5,OTHERS,CBSE,ENGLISH,4,4,BUSINESS,HOUSE_WIFE
3,4,male,General,WA,2,SEBA,AHSEC,OTHERS,4,4,SCHOOL_TEACHER,SCHOOL_TEACHER
4,4,male,General,OA,5,SEBA,CBSE,ENGLISH,4,4,COLLEGE_TEACHER,HOUSE_WIFE
...,...,...,...,...,...,...,...,...,...,...,...,...
661,3,female,ST,WA,2,SEBA,AHSEC,ENGLISH,1,2,OTHERS,HOUSE_WIFE
662,3,male,ST,WA,4,SEBA,AHSEC,ENGLISH,2,1,CULTIVATOR,HOUSE_WIFE
663,3,male,ST,WA,5,SEBA,CBSE,ENGLISH,1,2,OTHERS,SCHOOL_TEACHER
664,3,male,ST,WA,4,SEBA,AHSEC,ENGLISH,1,1,SCHOOL_TEACHER,HOUSE_WIFE


## Fix other columns

Convert nominal attribute to binary attribute

In [338]:
rawData = pd.get_dummies(rawData)

print(rawData.head())

   performance  ...  mother_occupation_SCHOOL_TEACHER
0            4  ...                                 0
1            4  ...                                 0
2            4  ...                                 0
3            4  ...                                 1
4            4  ...                                 0

[5 rows x 39 columns]


View table

In [339]:
rawData

Unnamed: 0,performance,time,class_x_percentage,class_xii_percentage,gender_female,gender_male,caste_General,caste_OBC,caste_SC,caste_ST,coaching_NO,coaching_OA,coaching_WA,class_ten_education_CBSE,class_ten_education_OTHERS,class_ten_education_SEBA,twelve_education_AHSEC,twelve_education_CBSE,twelve_education_OTHERS,medium_ASSAMESE,medium_ENGLISH,medium_OTHERS,father_occupation_BANK_OFFICIAL,father_occupation_BUSINESS,father_occupation_COLLEGE_TEACHER,father_occupation_CULTIVATOR,father_occupation_DOCTOR,father_occupation_ENGINEER,father_occupation_OTHERS,father_occupation_SCHOOL_TEACHER,mother_occupation_BANK_OFFICIAL,mother_occupation_BUSINESS,mother_occupation_COLLEGE_TEACHER,mother_occupation_CULTIVATOR,mother_occupation_DOCTOR,mother_occupation_ENGINEER,mother_occupation_HOUSE_WIFE,mother_occupation_OTHERS,mother_occupation_SCHOOL_TEACHER
0,4,2,4,4,0,1,1,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
1,4,5,4,4,0,1,0,1,0,0,0,0,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
2,4,5,4,4,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,2,4,4,0,1,1,0,0,0,0,0,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
4,4,5,4,4,0,1,1,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
661,3,2,1,2,1,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
662,3,4,2,1,0,1,0,0,0,1,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
663,3,5,1,2,0,1,0,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
664,3,4,1,1,0,1,0,0,0,1,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0


In [340]:
attributes = rawData.columns

print(attributes)

Index(['performance', 'time', 'class_x_percentage', 'class_xii_percentage',
       'gender_female', 'gender_male', 'caste_General', 'caste_OBC',
       'caste_SC', 'caste_ST', 'coaching_NO', 'coaching_OA', 'coaching_WA',
       'class_ten_education_CBSE', 'class_ten_education_OTHERS',
       'class_ten_education_SEBA', 'twelve_education_AHSEC',
       'twelve_education_CBSE', 'twelve_education_OTHERS', 'medium_ASSAMESE',
       'medium_ENGLISH', 'medium_OTHERS', 'father_occupation_BANK_OFFICIAL',
       'father_occupation_BUSINESS', 'father_occupation_COLLEGE_TEACHER',
       'father_occupation_CULTIVATOR', 'father_occupation_DOCTOR',
       'father_occupation_ENGINEER', 'father_occupation_OTHERS',
       'father_occupation_SCHOOL_TEACHER', 'mother_occupation_BANK_OFFICIAL',
       'mother_occupation_BUSINESS', 'mother_occupation_COLLEGE_TEACHER',
       'mother_occupation_CULTIVATOR', 'mother_occupation_DOCTOR',
       'mother_occupation_ENGINEER', 'mother_occupation_HOUSE_WIFE',
    

# Data normalization


## Normalizer

In [341]:
attributes = rawData.columns

In [342]:
print(rawData.values)

[[4 2 4 ... 0 1 0]
 [4 5 4 ... 1 0 0]
 [4 5 4 ... 1 0 0]
 ...
 [3 5 1 ... 0 0 1]
 [3 4 1 ... 1 0 0]
 [3 2 2 ... 0 1 0]]


In [343]:
normalizer = Normalizer()
procData_n = normalizer.transform(rawData.values)

procData_n = pd.DataFrame(procData_n, columns=attributes)

procData_n = round(procData_n, 2)

procData_n

Unnamed: 0,performance,time,class_x_percentage,class_xii_percentage,gender_female,gender_male,caste_General,caste_OBC,caste_SC,caste_ST,coaching_NO,coaching_OA,coaching_WA,class_ten_education_CBSE,class_ten_education_OTHERS,class_ten_education_SEBA,twelve_education_AHSEC,twelve_education_CBSE,twelve_education_OTHERS,medium_ASSAMESE,medium_ENGLISH,medium_OTHERS,father_occupation_BANK_OFFICIAL,father_occupation_BUSINESS,father_occupation_COLLEGE_TEACHER,father_occupation_CULTIVATOR,father_occupation_DOCTOR,father_occupation_ENGINEER,father_occupation_OTHERS,father_occupation_SCHOOL_TEACHER,mother_occupation_BANK_OFFICIAL,mother_occupation_BUSINESS,mother_occupation_COLLEGE_TEACHER,mother_occupation_CULTIVATOR,mother_occupation_DOCTOR,mother_occupation_ENGINEER,mother_occupation_HOUSE_WIFE,mother_occupation_OTHERS,mother_occupation_SCHOOL_TEACHER
0,0.52,0.26,0.52,0.52,0.0,0.13,0.13,0.00,0.0,0.00,0.13,0.00,0.00,0.0,0.00,0.13,0.13,0.00,0.0,0.0,0.13,0.00,0.0,0.00,0.00,0.00,0.13,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.13,0.00
1,0.44,0.56,0.44,0.44,0.0,0.11,0.00,0.11,0.0,0.00,0.00,0.00,0.11,0.0,0.00,0.11,0.11,0.00,0.0,0.0,0.00,0.11,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.11,0.0,0.0,0.0,0.0,0.0,0.0,0.11,0.00,0.00
2,0.44,0.56,0.44,0.44,0.0,0.11,0.00,0.11,0.0,0.00,0.00,0.11,0.00,0.0,0.11,0.00,0.00,0.11,0.0,0.0,0.11,0.00,0.0,0.11,0.00,0.00,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.11,0.00,0.00
3,0.52,0.26,0.52,0.52,0.0,0.13,0.13,0.00,0.0,0.00,0.00,0.00,0.13,0.0,0.00,0.13,0.13,0.00,0.0,0.0,0.00,0.13,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.13,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.13
4,0.44,0.56,0.44,0.44,0.0,0.11,0.11,0.00,0.0,0.00,0.00,0.11,0.00,0.0,0.00,0.11,0.00,0.11,0.0,0.0,0.11,0.00,0.0,0.00,0.11,0.00,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.11,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
661,0.59,0.39,0.20,0.39,0.2,0.00,0.00,0.00,0.0,0.20,0.00,0.00,0.20,0.0,0.00,0.20,0.20,0.00,0.0,0.0,0.20,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.20,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.20,0.00,0.00
662,0.49,0.65,0.32,0.16,0.0,0.16,0.00,0.00,0.0,0.16,0.00,0.00,0.16,0.0,0.00,0.16,0.16,0.00,0.0,0.0,0.16,0.00,0.0,0.00,0.00,0.16,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.16,0.00,0.00
663,0.44,0.73,0.15,0.29,0.0,0.15,0.00,0.00,0.0,0.15,0.00,0.00,0.15,0.0,0.00,0.15,0.00,0.15,0.0,0.0,0.15,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.15,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.15
664,0.51,0.68,0.17,0.17,0.0,0.17,0.00,0.00,0.0,0.17,0.00,0.00,0.17,0.0,0.00,0.17,0.17,0.00,0.0,0.0,0.17,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.00,0.00


# Data standardization

## MinMaxScaler

In [344]:
attributes = rawData.columns

In [345]:
mmscaler = MinMaxScaler()
procData_mm = mmscaler.fit_transform(rawData.values)

procData_mm = pd.DataFrame(procData_mm, columns=attributes)

procData_mm = round(procData_mm, 2)

procData_mm

Unnamed: 0,performance,time,class_x_percentage,class_xii_percentage,gender_female,gender_male,caste_General,caste_OBC,caste_SC,caste_ST,coaching_NO,coaching_OA,coaching_WA,class_ten_education_CBSE,class_ten_education_OTHERS,class_ten_education_SEBA,twelve_education_AHSEC,twelve_education_CBSE,twelve_education_OTHERS,medium_ASSAMESE,medium_ENGLISH,medium_OTHERS,father_occupation_BANK_OFFICIAL,father_occupation_BUSINESS,father_occupation_COLLEGE_TEACHER,father_occupation_CULTIVATOR,father_occupation_DOCTOR,father_occupation_ENGINEER,father_occupation_OTHERS,father_occupation_SCHOOL_TEACHER,mother_occupation_BANK_OFFICIAL,mother_occupation_BUSINESS,mother_occupation_COLLEGE_TEACHER,mother_occupation_CULTIVATOR,mother_occupation_DOCTOR,mother_occupation_ENGINEER,mother_occupation_HOUSE_WIFE,mother_occupation_OTHERS,mother_occupation_SCHOOL_TEACHER
0,1.00,0.4,1.00,1.00,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.00,1.0,1.00,1.00,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.00,1.0,1.00,1.00,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.00,0.4,1.00,1.00,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.00,1.0,1.00,1.00,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
661,0.67,0.4,0.00,0.33,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
662,0.67,0.8,0.33,0.00,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
663,0.67,1.0,0.00,0.33,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
664,0.67,0.8,0.00,0.00,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## StandardScaler

In [346]:
scaler = StandardScaler()
procData_s = scaler.fit_transform(rawData.values)

procData_s = pd.DataFrame(procData_s, columns=attributes)

procData_s = round(procData_s, 2)

procData_s

Unnamed: 0,performance,time,class_x_percentage,class_xii_percentage,gender_female,gender_male,caste_General,caste_OBC,caste_SC,caste_ST,coaching_NO,coaching_OA,coaching_WA,class_ten_education_CBSE,class_ten_education_OTHERS,class_ten_education_SEBA,twelve_education_AHSEC,twelve_education_CBSE,twelve_education_OTHERS,medium_ASSAMESE,medium_ENGLISH,medium_OTHERS,father_occupation_BANK_OFFICIAL,father_occupation_BUSINESS,father_occupation_COLLEGE_TEACHER,father_occupation_CULTIVATOR,father_occupation_DOCTOR,father_occupation_ENGINEER,father_occupation_OTHERS,father_occupation_SCHOOL_TEACHER,mother_occupation_BANK_OFFICIAL,mother_occupation_BUSINESS,mother_occupation_COLLEGE_TEACHER,mother_occupation_CULTIVATOR,mother_occupation_DOCTOR,mother_occupation_ENGINEER,mother_occupation_HOUSE_WIFE,mother_occupation_OTHERS,mother_occupation_SCHOOL_TEACHER
0,1.69,-1.37,0.53,0.79,-0.94,0.94,1.01,-0.57,-0.33,-0.44,1.85,-0.33,-1.44,-0.77,-0.18,0.83,0.90,-0.88,-0.11,-0.3,0.49,-0.35,-0.19,-0.43,-0.21,-0.21,3.33,-0.27,-0.84,-0.44,-0.08,-0.07,-0.18,-0.04,-0.14,-0.07,-1.40,2.87,-0.44
1,1.69,0.80,0.53,0.79,-0.94,0.94,-0.99,1.76,-0.33,-0.44,-0.54,-0.33,0.70,-0.77,-0.18,0.83,0.90,-0.88,-0.11,-0.3,-2.03,2.83,-0.19,-0.43,-0.21,-0.21,-0.30,-0.27,-0.84,2.26,-0.08,-0.07,-0.18,-0.04,-0.14,-0.07,0.71,-0.35,-0.44
2,1.69,0.80,0.53,0.79,-0.94,0.94,-0.99,1.76,-0.33,-0.44,-0.54,2.99,-1.44,-0.77,5.54,-1.21,-1.11,1.14,-0.11,-0.3,0.49,-0.35,-0.19,2.34,-0.21,-0.21,-0.30,-0.27,-0.84,-0.44,-0.08,-0.07,-0.18,-0.04,-0.14,-0.07,0.71,-0.35,-0.44
3,1.69,-1.37,0.53,0.79,-0.94,0.94,1.01,-0.57,-0.33,-0.44,-0.54,-0.33,0.70,-0.77,-0.18,0.83,0.90,-0.88,-0.11,-0.3,-2.03,2.83,-0.19,-0.43,-0.21,-0.21,-0.30,-0.27,-0.84,2.26,-0.08,-0.07,-0.18,-0.04,-0.14,-0.07,-1.40,-0.35,2.27
4,1.69,0.80,0.53,0.79,-0.94,0.94,1.01,-0.57,-0.33,-0.44,-0.54,2.99,-1.44,-0.77,-0.18,0.83,-1.11,1.14,-0.11,-0.3,0.49,-0.35,-0.19,-0.43,4.86,-0.21,-0.30,-0.27,-0.84,-0.44,-0.08,-0.07,-0.18,-0.04,-0.14,-0.07,0.71,-0.35,-0.44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
661,0.74,-1.37,-2.59,-0.96,1.07,-1.07,-0.99,-0.57,-0.33,2.27,-0.54,-0.33,0.70,-0.77,-0.18,0.83,0.90,-0.88,-0.11,-0.3,0.49,-0.35,-0.19,-0.43,-0.21,-0.21,-0.30,-0.27,1.19,-0.44,-0.08,-0.07,-0.18,-0.04,-0.14,-0.07,0.71,-0.35,-0.44
662,0.74,0.07,-1.55,-1.84,-0.94,0.94,-0.99,-0.57,-0.33,2.27,-0.54,-0.33,0.70,-0.77,-0.18,0.83,0.90,-0.88,-0.11,-0.3,0.49,-0.35,-0.19,-0.43,-0.21,4.86,-0.30,-0.27,-0.84,-0.44,-0.08,-0.07,-0.18,-0.04,-0.14,-0.07,0.71,-0.35,-0.44
663,0.74,0.80,-2.59,-0.96,-0.94,0.94,-0.99,-0.57,-0.33,2.27,-0.54,-0.33,0.70,-0.77,-0.18,0.83,-1.11,1.14,-0.11,-0.3,0.49,-0.35,-0.19,-0.43,-0.21,-0.21,-0.30,-0.27,1.19,-0.44,-0.08,-0.07,-0.18,-0.04,-0.14,-0.07,-1.40,-0.35,2.27
664,0.74,0.07,-2.59,-1.84,-0.94,0.94,-0.99,-0.57,-0.33,2.27,-0.54,-0.33,0.70,-0.77,-0.18,0.83,0.90,-0.88,-0.11,-0.3,0.49,-0.35,-0.19,-0.43,-0.21,-0.21,-0.30,-0.27,-0.84,2.26,-0.08,-0.07,-0.18,-0.04,-0.14,-0.07,0.71,-0.35,-0.44


# Classification

## MinMaxScaler data

### Prepare data

In [347]:
procData_mm = procData_mm.drop(['performance'], axis=1)

procData_mm

Unnamed: 0,time,class_x_percentage,class_xii_percentage,gender_female,gender_male,caste_General,caste_OBC,caste_SC,caste_ST,coaching_NO,coaching_OA,coaching_WA,class_ten_education_CBSE,class_ten_education_OTHERS,class_ten_education_SEBA,twelve_education_AHSEC,twelve_education_CBSE,twelve_education_OTHERS,medium_ASSAMESE,medium_ENGLISH,medium_OTHERS,father_occupation_BANK_OFFICIAL,father_occupation_BUSINESS,father_occupation_COLLEGE_TEACHER,father_occupation_CULTIVATOR,father_occupation_DOCTOR,father_occupation_ENGINEER,father_occupation_OTHERS,father_occupation_SCHOOL_TEACHER,mother_occupation_BANK_OFFICIAL,mother_occupation_BUSINESS,mother_occupation_COLLEGE_TEACHER,mother_occupation_CULTIVATOR,mother_occupation_DOCTOR,mother_occupation_ENGINEER,mother_occupation_HOUSE_WIFE,mother_occupation_OTHERS,mother_occupation_SCHOOL_TEACHER
0,0.4,1.00,1.00,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,1.00,1.00,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,1.00,1.00,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.4,1.00,1.00,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,1.00,1.00,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
661,0.4,0.00,0.33,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
662,0.8,0.33,0.00,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
663,1.0,0.00,0.33,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
664,0.8,0.00,0.00,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
print(perf_label)

0      Excellent
1      Excellent
2      Excellent
3      Excellent
4      Excellent
         ...    
661      Average
662      Average
663      Average
664      Average
665      Average
Name: performance, Length: 666, dtype: object


### Holdout Cross Validation

In [None]:
X = procData_mm.values
y = perf_label.values

(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.4, random_state=42)

### SVM

In [None]:
all_parameters = [{'kernel': ['rbf'], 
                   'gamma': [1, 2, 0.5, 0.25, 0.1, 1e-2, 1e-3, 1e-4],
                  'C': [0.5, 1, 1.5, 2, 2.5, 5, 7.5, 10, 12.5, 100, 1000]},
                  {'kernel': ['linear'], 'C': [0.5,1, 1.5, 2, 2.5, 5, 7.5, 10, 12.5, 100, 1000]}]

# Find best params using GridSearch
grid_params = clf_best_parameters(GridSearchCV, SVC, all_parameters, X_train, y_train, X_test, y_test)
print(grid_params)

Tuning hyper-parameters for f1

Best parameters set found on development set:

{'C': 2.5, 'gamma': 0.1, 'kernel': 'rbf'}

Detailed classification report:
              precision    recall  f1-score   support

     Average       0.75      0.77      0.76        61
   Excellent       0.50      0.04      0.07        51
        Good       0.38      0.57      0.46        75
          Vg       0.43      0.47      0.45        80

    accuracy                           0.49       267
   macro avg       0.52      0.46      0.44       267
weighted avg       0.50      0.49      0.45       267

{'C': 2.5, 'gamma': 0.1, 'kernel': 'rbf'}


### Complement Naive Bayes

In [None]:
all_parameters = [{'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.8, 1.0], 
                   'fit_prior': [True, False], 
                   'norm': [True, False]}]

# Find best params using GridSearch
rand_params = clf_best_parameters(GridSearchCV, ComplementNB, all_parameters, X_train, y_train, X_test, y_test)

Tuning hyper-parameters for f1

Best parameters set found on development set:

{'alpha': 0.8, 'fit_prior': True, 'norm': False}

Detailed classification report:
              precision    recall  f1-score   support

     Average       0.68      0.89      0.77        61
   Excellent       0.53      0.20      0.29        51
        Good       0.41      0.52      0.46        75
          Vg       0.47      0.42      0.44        80

    accuracy                           0.51       267
   macro avg       0.52      0.51      0.49       267
weighted avg       0.51      0.51      0.49       267



### Random Forest

In [None]:
all_parameters = [{'n_estimators': [100, 140, 180], 
                   'criterion': ['gini', 'entropy'], 
                   'min_samples_split': [0.5, 2, 5],
                   'min_samples_leaf': [0.1, 0.5, 1],
                   'min_weight_fraction_leaf': [0.0, 0.2, 0.5],
                   'max_features': ['auto', 'sqrt', 'log2']}]

# Find best params using GridSearch
rand_params = clf_best_parameters(GridSearchCV, RandomForestClassifier, all_parameters, X_train, y_train, X_test, y_test)
print(rand_params)

Tuning hyper-parameters for f1

Best parameters set found on development set:

{'criterion': 'entropy', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 180}

Detailed classification report:
              precision    recall  f1-score   support

     Average       0.76      0.74      0.75        61
   Excellent       0.47      0.14      0.21        51
        Good       0.31      0.52      0.39        75
          Vg       0.41      0.35      0.38        80

    accuracy                           0.45       267
   macro avg       0.49      0.44      0.43       267
weighted avg       0.47      0.45      0.43       267

{'criterion': 'entropy', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 180}


### K-Nearest Neighbors

In [None]:
all_parameters = [{'n_neighbors': [3, 5, 8, 10, 20], 
                   'weights': ['uniform', 'distance'],
                   'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                   'leaf_size': [10, 30, 60, 80],
                   'p': [1, 2]}]

# Find best params using GridSearch
rand_params = clf_best_parameters(GridSearchCV, KNeighborsClassifier, all_parameters, X_train, y_train, X_test, y_test)
print(rand_params)

Tuning hyper-parameters for f1

Best parameters set found on development set:

{'algorithm': 'auto', 'leaf_size': 30, 'n_neighbors': 8, 'p': 1, 'weights': 'uniform'}

Detailed classification report:
              precision    recall  f1-score   support

     Average       0.72      0.79      0.75        61
   Excellent       0.35      0.14      0.20        51
        Good       0.39      0.55      0.46        75
          Vg       0.43      0.41      0.42        80

    accuracy                           0.48       267
   macro avg       0.47      0.47      0.46       267
weighted avg       0.47      0.48      0.46       267

{'algorithm': 'auto', 'leaf_size': 30, 'n_neighbors': 8, 'p': 1, 'weights': 'uniform'}


### Decision Tree

In [None]:
all_parameters = [{'criterion': ['gini', 'entropy'], 
                   'splitter': ['best', 'random'],
                   'min_samples_split': [0.5, 2, 5, 10, 20],
                   'min_samples_leaf': [0.1, 0.3, 0.5, 1],
                   'min_weight_fraction_leaf': [0.0, 0.2, 0.4, 0.5],
                   'max_features': ['auto', 'sqrt', 'log2']}]

# Find best params using GridSearch
rand_params = clf_best_parameters(GridSearchCV, DecisionTreeClassifier, all_parameters, X_train, y_train, X_test, y_test)
print(rand_params)

Tuning hyper-parameters for f1

Best parameters set found on development set:

{'criterion': 'entropy', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 20, 'min_weight_fraction_leaf': 0.0, 'splitter': 'best'}

Detailed classification report:
              precision    recall  f1-score   support

     Average       0.70      0.82      0.76        61
   Excellent       0.57      0.31      0.41        51
        Good       0.36      0.55      0.44        75
          Vg       0.38      0.26      0.31        80

    accuracy                           0.48       267
   macro avg       0.51      0.49      0.48       267
weighted avg       0.49      0.48      0.47       267

{'criterion': 'entropy', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 20, 'min_weight_fraction_leaf': 0.0, 'splitter': 'best'}


## StandardScaler data

### Prepare data

In [348]:
procData_s = procData_s.drop(['performance'], axis=1)

procData_s

Unnamed: 0,time,class_x_percentage,class_xii_percentage,gender_female,gender_male,caste_General,caste_OBC,caste_SC,caste_ST,coaching_NO,coaching_OA,coaching_WA,class_ten_education_CBSE,class_ten_education_OTHERS,class_ten_education_SEBA,twelve_education_AHSEC,twelve_education_CBSE,twelve_education_OTHERS,medium_ASSAMESE,medium_ENGLISH,medium_OTHERS,father_occupation_BANK_OFFICIAL,father_occupation_BUSINESS,father_occupation_COLLEGE_TEACHER,father_occupation_CULTIVATOR,father_occupation_DOCTOR,father_occupation_ENGINEER,father_occupation_OTHERS,father_occupation_SCHOOL_TEACHER,mother_occupation_BANK_OFFICIAL,mother_occupation_BUSINESS,mother_occupation_COLLEGE_TEACHER,mother_occupation_CULTIVATOR,mother_occupation_DOCTOR,mother_occupation_ENGINEER,mother_occupation_HOUSE_WIFE,mother_occupation_OTHERS,mother_occupation_SCHOOL_TEACHER
0,-1.37,0.53,0.79,-0.94,0.94,1.01,-0.57,-0.33,-0.44,1.85,-0.33,-1.44,-0.77,-0.18,0.83,0.90,-0.88,-0.11,-0.3,0.49,-0.35,-0.19,-0.43,-0.21,-0.21,3.33,-0.27,-0.84,-0.44,-0.08,-0.07,-0.18,-0.04,-0.14,-0.07,-1.40,2.87,-0.44
1,0.80,0.53,0.79,-0.94,0.94,-0.99,1.76,-0.33,-0.44,-0.54,-0.33,0.70,-0.77,-0.18,0.83,0.90,-0.88,-0.11,-0.3,-2.03,2.83,-0.19,-0.43,-0.21,-0.21,-0.30,-0.27,-0.84,2.26,-0.08,-0.07,-0.18,-0.04,-0.14,-0.07,0.71,-0.35,-0.44
2,0.80,0.53,0.79,-0.94,0.94,-0.99,1.76,-0.33,-0.44,-0.54,2.99,-1.44,-0.77,5.54,-1.21,-1.11,1.14,-0.11,-0.3,0.49,-0.35,-0.19,2.34,-0.21,-0.21,-0.30,-0.27,-0.84,-0.44,-0.08,-0.07,-0.18,-0.04,-0.14,-0.07,0.71,-0.35,-0.44
3,-1.37,0.53,0.79,-0.94,0.94,1.01,-0.57,-0.33,-0.44,-0.54,-0.33,0.70,-0.77,-0.18,0.83,0.90,-0.88,-0.11,-0.3,-2.03,2.83,-0.19,-0.43,-0.21,-0.21,-0.30,-0.27,-0.84,2.26,-0.08,-0.07,-0.18,-0.04,-0.14,-0.07,-1.40,-0.35,2.27
4,0.80,0.53,0.79,-0.94,0.94,1.01,-0.57,-0.33,-0.44,-0.54,2.99,-1.44,-0.77,-0.18,0.83,-1.11,1.14,-0.11,-0.3,0.49,-0.35,-0.19,-0.43,4.86,-0.21,-0.30,-0.27,-0.84,-0.44,-0.08,-0.07,-0.18,-0.04,-0.14,-0.07,0.71,-0.35,-0.44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
661,-1.37,-2.59,-0.96,1.07,-1.07,-0.99,-0.57,-0.33,2.27,-0.54,-0.33,0.70,-0.77,-0.18,0.83,0.90,-0.88,-0.11,-0.3,0.49,-0.35,-0.19,-0.43,-0.21,-0.21,-0.30,-0.27,1.19,-0.44,-0.08,-0.07,-0.18,-0.04,-0.14,-0.07,0.71,-0.35,-0.44
662,0.07,-1.55,-1.84,-0.94,0.94,-0.99,-0.57,-0.33,2.27,-0.54,-0.33,0.70,-0.77,-0.18,0.83,0.90,-0.88,-0.11,-0.3,0.49,-0.35,-0.19,-0.43,-0.21,4.86,-0.30,-0.27,-0.84,-0.44,-0.08,-0.07,-0.18,-0.04,-0.14,-0.07,0.71,-0.35,-0.44
663,0.80,-2.59,-0.96,-0.94,0.94,-0.99,-0.57,-0.33,2.27,-0.54,-0.33,0.70,-0.77,-0.18,0.83,-1.11,1.14,-0.11,-0.3,0.49,-0.35,-0.19,-0.43,-0.21,-0.21,-0.30,-0.27,1.19,-0.44,-0.08,-0.07,-0.18,-0.04,-0.14,-0.07,-1.40,-0.35,2.27
664,0.07,-2.59,-1.84,-0.94,0.94,-0.99,-0.57,-0.33,2.27,-0.54,-0.33,0.70,-0.77,-0.18,0.83,0.90,-0.88,-0.11,-0.3,0.49,-0.35,-0.19,-0.43,-0.21,-0.21,-0.30,-0.27,-0.84,2.26,-0.08,-0.07,-0.18,-0.04,-0.14,-0.07,0.71,-0.35,-0.44


In [None]:
print(perf_label)

0      Excellent
1      Excellent
2      Excellent
3      Excellent
4      Excellent
         ...    
661      Average
662      Average
663      Average
664      Average
665      Average
Name: performance, Length: 666, dtype: object


### Holdout Cross Validation

In [349]:
X = procData_s.values
y = perf_label.values

(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.4, random_state=42)

### SVM

In [None]:
all_parameters = [{'kernel': ['rbf'], 
                   'gamma': [1, 2, 0.5, 0.25, 0.1, 1e-2, 1e-3, 1e-4],
                  'C': [0.5, 1, 1.5, 2, 2.5, 5, 7.5, 10, 12.5, 100, 1000]},
                  {'kernel': ['linear'], 'C': [0.5,1, 1.5, 2, 2.5, 5, 7.5, 10, 12.5, 100, 1000]}]

# Find best params using GridSearch
grid_params = clf_best_parameters(GridSearchCV, SVC, all_parameters, X_train, y_train, X_test, y_test)
print(grid_params)

Tuning hyper-parameters for f1

Best parameters set found on development set:

{'C': 7.5, 'gamma': 0.01, 'kernel': 'rbf'}

Detailed classification report:
              precision    recall  f1-score   support

     Average       0.74      0.70      0.72        61
   Excellent       0.37      0.14      0.20        51
        Good       0.34      0.47      0.39        75
          Vg       0.41      0.45      0.43        80

    accuracy                           0.45       267
   macro avg       0.47      0.44      0.44       267
weighted avg       0.46      0.45      0.44       267

{'C': 7.5, 'gamma': 0.01, 'kernel': 'rbf'}


### Complement Naive Bayes (don't accept negative values)

In [None]:
# all_parameters = [{'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.8, 1.0], 
#                    'fit_prior': [True, False], 
#                    'norm': [True, False]}]

# # Find best params using GridSearch
# rand_params = clf_best_parameters(GridSearchCV, ComplementNB, all_parameters)

# valores neg nao

### Random Forest

In [None]:
all_parameters = [{'n_estimators': [100, 140, 180], 
                   'criterion': ['gini', 'entropy'], 
                   'min_samples_split': [0.5, 2, 5],
                   'min_samples_leaf': [0.1, 0.5, 1],
                   'min_weight_fraction_leaf': [0.0, 0.2, 0.5],
                   'max_features': ['auto', 'sqrt', 'log2']}]

# Find best params using GridSearch
rand_params = clf_best_parameters(GridSearchCV, RandomForestClassifier, all_parameters, X_train, y_train, X_test, y_test)
print(rand_params)

Tuning hyper-parameters for f1

Best parameters set found on development set:

{'criterion': 'entropy', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 180}

Detailed classification report:
              precision    recall  f1-score   support

     Average       0.71      0.77      0.74        61
   Excellent       0.44      0.14      0.21        51
        Good       0.32      0.49      0.39        75
          Vg       0.46      0.40      0.43        80

    accuracy                           0.46       267
   macro avg       0.48      0.45      0.44       267
weighted avg       0.47      0.46      0.45       267

{'criterion': 'entropy', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 180}


### K-Nearest Neighbors

In [None]:
all_parameters = [{'n_neighbors': [3, 5, 8, 10, 20], 
                   'weights': ['uniform', 'distance'],
                   'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                   'leaf_size': [10, 30, 60, 80],
                   'p': [1, 2]}]

# Find best params using GridSearch
rand_params = clf_best_parameters(GridSearchCV, KNeighborsClassifier, all_parameters, X_train, y_train, X_test, y_test)
print(rand_params)

Tuning hyper-parameters for f1

Best parameters set found on development set:

{'algorithm': 'auto', 'leaf_size': 80, 'n_neighbors': 8, 'p': 1, 'weights': 'uniform'}

Detailed classification report:
              precision    recall  f1-score   support

     Average       0.73      0.70      0.72        61
   Excellent       0.36      0.16      0.22        51
        Good       0.32      0.47      0.38        75
          Vg       0.35      0.33      0.34        80

    accuracy                           0.42       267
   macro avg       0.44      0.41      0.41       267
weighted avg       0.43      0.42      0.41       267

{'algorithm': 'auto', 'leaf_size': 80, 'n_neighbors': 8, 'p': 1, 'weights': 'uniform'}


### Decision Tree

In [None]:
all_parameters = [{'criterion': ['gini', 'entropy'], 
                   'splitter': ['best', 'random'],
                   'min_samples_split': [0.5, 2, 5, 10, 20],
                   'min_samples_leaf': [0.1, 0.3, 0.5, 1],
                   'min_weight_fraction_leaf': [0.0, 0.2, 0.4, 0.5],
                   'max_features': ['auto', 'sqrt', 'log2']}]

# Find best params using GridSearch
rand_params = clf_best_parameters(GridSearchCV, DecisionTreeClassifier, all_parameters, X_train, y_train, X_test, y_test)
print(rand_params)

Tuning hyper-parameters for f1

Best parameters set found on development set:

{'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'splitter': 'random'}

Detailed classification report:
              precision    recall  f1-score   support

     Average       0.65      0.74      0.69        61
   Excellent       0.40      0.27      0.33        51
        Good       0.33      0.39      0.36        75
          Vg       0.36      0.34      0.35        80

    accuracy                           0.43       267
   macro avg       0.44      0.43      0.43       267
weighted avg       0.43      0.43      0.42       267

{'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'splitter': 'random'}


## Normalizer data

### Prepare data

In [351]:
procData_n = procData_n.drop(['performance'], axis=1)

procData_n

Unnamed: 0,time,class_x_percentage,class_xii_percentage,gender_female,gender_male,caste_General,caste_OBC,caste_SC,caste_ST,coaching_NO,coaching_OA,coaching_WA,class_ten_education_CBSE,class_ten_education_OTHERS,class_ten_education_SEBA,twelve_education_AHSEC,twelve_education_CBSE,twelve_education_OTHERS,medium_ASSAMESE,medium_ENGLISH,medium_OTHERS,father_occupation_BANK_OFFICIAL,father_occupation_BUSINESS,father_occupation_COLLEGE_TEACHER,father_occupation_CULTIVATOR,father_occupation_DOCTOR,father_occupation_ENGINEER,father_occupation_OTHERS,father_occupation_SCHOOL_TEACHER,mother_occupation_BANK_OFFICIAL,mother_occupation_BUSINESS,mother_occupation_COLLEGE_TEACHER,mother_occupation_CULTIVATOR,mother_occupation_DOCTOR,mother_occupation_ENGINEER,mother_occupation_HOUSE_WIFE,mother_occupation_OTHERS,mother_occupation_SCHOOL_TEACHER
0,0.26,0.52,0.52,0.0,0.13,0.13,0.00,0.0,0.00,0.13,0.00,0.00,0.0,0.00,0.13,0.13,0.00,0.0,0.0,0.13,0.00,0.0,0.00,0.00,0.00,0.13,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.13,0.00
1,0.56,0.44,0.44,0.0,0.11,0.00,0.11,0.0,0.00,0.00,0.00,0.11,0.0,0.00,0.11,0.11,0.00,0.0,0.0,0.00,0.11,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.11,0.0,0.0,0.0,0.0,0.0,0.0,0.11,0.00,0.00
2,0.56,0.44,0.44,0.0,0.11,0.00,0.11,0.0,0.00,0.00,0.11,0.00,0.0,0.11,0.00,0.00,0.11,0.0,0.0,0.11,0.00,0.0,0.11,0.00,0.00,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.11,0.00,0.00
3,0.26,0.52,0.52,0.0,0.13,0.13,0.00,0.0,0.00,0.00,0.00,0.13,0.0,0.00,0.13,0.13,0.00,0.0,0.0,0.00,0.13,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.13,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.13
4,0.56,0.44,0.44,0.0,0.11,0.11,0.00,0.0,0.00,0.00,0.11,0.00,0.0,0.00,0.11,0.00,0.11,0.0,0.0,0.11,0.00,0.0,0.00,0.11,0.00,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.11,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
661,0.39,0.20,0.39,0.2,0.00,0.00,0.00,0.0,0.20,0.00,0.00,0.20,0.0,0.00,0.20,0.20,0.00,0.0,0.0,0.20,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.20,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.20,0.00,0.00
662,0.65,0.32,0.16,0.0,0.16,0.00,0.00,0.0,0.16,0.00,0.00,0.16,0.0,0.00,0.16,0.16,0.00,0.0,0.0,0.16,0.00,0.0,0.00,0.00,0.16,0.00,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.16,0.00,0.00
663,0.73,0.15,0.29,0.0,0.15,0.00,0.00,0.0,0.15,0.00,0.00,0.15,0.0,0.00,0.15,0.00,0.15,0.0,0.0,0.15,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.15,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.15
664,0.68,0.17,0.17,0.0,0.17,0.00,0.00,0.0,0.17,0.00,0.00,0.17,0.0,0.00,0.17,0.17,0.00,0.0,0.0,0.17,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.00,0.00


### Holdout Cross Validation

In [352]:
X = procData_n.values
y = perf_label.values

(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.4, random_state=42)

### SVM

In [None]:
all_parameters = [{'kernel': ['rbf'], 
                   'gamma': [1, 2, 0.5, 0.25, 0.1, 1e-2, 1e-3, 1e-4],
                  'C': [0.5, 1, 1.5, 2, 2.5, 5, 7.5, 10, 12.5, 100, 1000]},
                  {'kernel': ['linear'], 'C': [0.5,1, 1.5, 2, 2.5, 5, 7.5, 10, 12.5, 100, 1000]}]

# Find best params using GridSearch
grid_params = clf_best_parameters(GridSearchCV, SVC, all_parameters, X_train, y_train, X_test, y_test)
print(grid_params)

Tuning hyper-parameters for f1

Best parameters set found on development set:

{'C': 1000, 'gamma': 2, 'kernel': 'rbf'}

Detailed classification report:
              precision    recall  f1-score   support

     Average       0.90      1.00      0.95        61
   Excellent       1.00      0.90      0.95        51
        Good       0.99      0.99      0.99        75
          Vg       0.99      0.96      0.97        80

    accuracy                           0.97       267
   macro avg       0.97      0.96      0.96       267
weighted avg       0.97      0.97      0.97       267

{'C': 1000, 'gamma': 2, 'kernel': 'rbf'}


### Complement Naive Bayes


In [None]:
all_parameters = [{'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.8, 1.0], 
                   'fit_prior': [True, False], 
                   'norm': [True, False]}]

# Find best params using GridSearch
rand_params = clf_best_parameters(GridSearchCV, ComplementNB, all_parameters, X_train, y_train, X_test, y_test)

Tuning hyper-parameters for f1

Best parameters set found on development set:

{'alpha': 0.7, 'fit_prior': True, 'norm': False}

Detailed classification report:
              precision    recall  f1-score   support

     Average       0.70      0.87      0.77        61
   Excellent       0.67      0.12      0.20        51
        Good       0.40      0.55      0.46        75
          Vg       0.43      0.42      0.43        80

    accuracy                           0.50       267
   macro avg       0.55      0.49      0.47       267
weighted avg       0.53      0.50      0.47       267



### Random Forest

In [None]:
all_parameters = [{'n_estimators': [100, 140, 180], 
                   'criterion': ['gini', 'entropy'], 
                   'min_samples_split': [0.5, 2, 5],
                   'min_samples_leaf': [0.1, 0.5, 1],
                   'min_weight_fraction_leaf': [0.0, 0.2, 0.5],
                   'max_features': ['auto', 'sqrt', 'log2']}]

# Find best params using GridSearch
rand_params = clf_best_parameters(GridSearchCV, RandomForestClassifier, all_parameters, X_train, y_train, X_test, y_test)
print(rand_params)

Tuning hyper-parameters for f1

Best parameters set found on development set:

{'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100}

Detailed classification report:
              precision    recall  f1-score   support

     Average       0.89      0.97      0.93        61
   Excellent       0.97      0.71      0.82        51
        Good       0.81      0.95      0.87        75
          Vg       0.88      0.84      0.86        80

    accuracy                           0.87       267
   macro avg       0.89      0.86      0.87       267
weighted avg       0.88      0.87      0.87       267

{'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100}


### K-Nearest Neighbors

In [None]:
all_parameters = [{'n_neighbors': [3, 5, 8, 10, 20], 
                   'weights': ['uniform', 'distance'],
                   'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                   'leaf_size': [10, 30, 60, 80],
                   'p': [1, 2]}]

# Find best params using GridSearch
rand_params = clf_best_parameters(GridSearchCV, KNeighborsClassifier, all_parameters, X_train, y_train, X_test, y_test)
print(rand_params)

Tuning hyper-parameters for f1

Best parameters set found on development set:

{'algorithm': 'brute', 'leaf_size': 10, 'n_neighbors': 20, 'p': 1, 'weights': 'distance'}

Detailed classification report:
              precision    recall  f1-score   support

     Average       0.73      0.72      0.73        61
   Excellent       1.00      0.41      0.58        51
        Good       0.63      0.76      0.69        75
          Vg       0.51      0.61      0.56        80

    accuracy                           0.64       267
   macro avg       0.72      0.63      0.64       267
weighted avg       0.69      0.64      0.64       267

{'algorithm': 'brute', 'leaf_size': 10, 'n_neighbors': 20, 'p': 1, 'weights': 'distance'}


### Decision Tree

In [None]:
all_parameters = [{'criterion': ['gini', 'entropy'], 
                   'splitter': ['best', 'random'],
                   'min_samples_split': [0.5, 2, 5, 10, 20],
                   'min_samples_leaf': [0.1, 0.3, 0.5, 1],
                   'min_weight_fraction_leaf': [0.0, 0.2, 0.4, 0.5],
                   'max_features': ['auto', 'sqrt', 'log2']}]

# Find best params using GridSearch
rand_params = clf_best_parameters(GridSearchCV, DecisionTreeClassifier, all_parameters, X_train, y_train, X_test, y_test)
print(rand_params)

Tuning hyper-parameters for f1

Best parameters set found on development set:

{'criterion': 'gini', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'splitter': 'best'}

Detailed classification report:
              precision    recall  f1-score   support

     Average       0.80      0.92      0.85        61
   Excellent       0.93      0.75      0.83        51
        Good       0.74      0.81      0.78        75
          Vg       0.85      0.79      0.82        80

    accuracy                           0.82       267
   macro avg       0.83      0.82      0.82       267
weighted avg       0.82      0.82      0.82       267

{'criterion': 'gini', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'splitter': 'best'}
