# Support Vector Machine Classification Using Education Data

In [1]:
import pandas as pd


import numpy as np

In [2]:
encoded = pd.read_csv('/Users/luisr/Desktop/Repository/IBM_courses/Personal Work/Data Frames/Work Copies/Work/encoded_quest_alunos.csv').drop('Unnamed: 0', axis=1)

data = encoded.copy()

In [3]:
data.shape

(1048575, 75)

### Adding 1 to columns where min equals 0 (Including binary columns)

In [5]:
cols_0on = ['ID_MUNICIPIO', 'ID_ESCOLA', 'ID_TURMA', 'ID_ALUNO', 'ID_DEPENDENCIA_ADM', 'ID_TURNO', 'IN_SITUACAO_CENSO', 'IN_PREENCHIMENTO']   # ID columns where minimun is 0.

[i in encoded.columns for i in cols_0on]

[True, True, True, True, True, True, True, True]

In [6]:
for col in cols_0on+data.columns.tolist()[13:]:
    
    data[col] = data[col]+1

### Filtering by applied test and third year student

In [7]:
data = data[data['IN_PREENCHIMENTO']==2]
data = data[data['ID_SERIE']==12]

data.reset_index(drop=True, inplace=True)

In [27]:
data.shape

(20568, 75)

#### Optional: Try binary variables as 0 and 1

### Feature Selection

In [32]:
y = data['Q043'].astype(int)                # Lab example used 2 and for as labels for binary target.

In [39]:
x = data.drop(['Q043', 'IN_SITUACAO_CENSO', 'IN_PREENCHIMENTO'], axis=1).astype(int).values

In [42]:
type(y)

pandas.core.series.Series

### Sample Split into Train and Test Sets

In [61]:
msk = np.random.rand(len(y))<0.8
x_train, x_test, y_train, y_test = x[msk], x[~msk], y[msk], y[~msk]

### Fitting Model

In [33]:
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

In [64]:
svc = svm.SVC(kernel='rbf')
svc.fit(x_train, y_train)
y_pred_test = svc.predict(x_test)

acc_scr = accuracy_score(y_test, y_pred_test)
f1_scr = f1_score(y_test, y_pred_test, average='weighted')
cfm_scr = confusion_matrix(y_test, y_pred_test, labels = y.unique())
cls_rep_scr = classification_report(y_test, y_pred_test)

print(acc_scr)
print(f1_scr)
print(cfm_scr)
print(cls_rep_scr)

### Smaller Feature Selection

##### Questions that might be related to reproval

In [28]:
qst_text = '03,06,07,09,10,14,16,34,35,36,38,40,41,42'
txt_splt = qst_text.split(',')
qst_text = ['Q0'+txt_splt[i] for i in range(len(txt_splt))]

In [29]:
qst_text[5:]

['Q014', 'Q016', 'Q034', 'Q035', 'Q036', 'Q038', 'Q040', 'Q041', 'Q042']

In [30]:
x1 = data.drop(qst_text, axis=1).values

In [34]:
y.head()

0    4
1    2
2    2
3    2
4    2
Name: Q043, dtype: int32

### Modeling Again

In [43]:
msk = np.random.rand(len(y))<0.8
x_train, y_train, x_test, y_test = x1[msk], y[msk], x1[~msk], y[~msk]

y_pred_test = svm.SVC(kernel='rbf').fit(x_train, y_train).predict(x_test)

acc_scr = accuracy_score(y_test, y_pred_test)

In [44]:
acc_scr

0.6769080712021458

In [None]:
f1 = f1_score(y_test, y_pred_test, average='weighted')
cfm = confusion_matrix(y_test, y_pred_test, labels=[y.unique()])
class_reprt = classification_report(y_test, y_pred_test)

### Now, without those IDs

In [10]:
ids_keep = ['ID_REGIAO', 'ID_MUNICIPIO', 'ID_ESCOLA', 'ID_DEPENDENCIA_ADM', 'ID_LOCALIZACAO', 'ID_CAPITAL', 'ID_TURNO']

In [11]:
[i in data.columns for i in ids_keep]

[True, True, True, True, True, True, True]

In [42]:
keep_cols = [i for i in data.columns.values if (i in ids_keep)]+ [i for i in data.columns[13:] if (i not in qst_text)]

In [45]:
x2 = data[keep_cols].drop('Q043', axis=1)

In [46]:
y.head()

0    4
1    2
2    2
3    2
4    2
Name: Q043, dtype: int32

### Modeling

In [50]:
msk = np.random.rand(len(y))<0.8
x_train, y_train, x_test, y_test = x2[msk], y[msk], x2[~msk], y[~msk]

y_pred_test = svm.SVC(kernel='rbf').fit(x_train, y_train).predict(x_test)

acc_scr = accuracy_score(y_test, y_pred_test)

In [51]:
acc_scr

0.6646677860398177

In [57]:
f1 = f1_score(y_test, y_pred_test, average='weighted')
cfm = confusion_matrix(y_test, y_pred_test, labels=[y.unique().astype(str)])
class_reprt = classification_report(y_test, y_pred_test)

  elif np.all([l not in y_true for l in labels]):


ValueError: At least one label specified must be in y_true

In [None]:
print(f1)
print(cfm)
print(class_reprt)

### One more time with standard Scaler

In [62]:
x2.head(2)

Unnamed: 0,ID_REGIAO,ID_MUNICIPIO,ID_ESCOLA,ID_DEPENDENCIA_ADM,ID_LOCALIZACAO,ID_CAPITAL,ID_TURNO,Q001,Q002,Q004,...,Q053,Q054,Q055,Q056,Q057,Q058,Q059,Q060,Q061,Q062
0,0,1,5525,2,0,1,2,2,3,7,...,2,3,3,3,3,3,2,2,3,3
1,0,1,5525,2,0,1,2,3,3,4,...,2,2,3,3,3,4,4,4,3,5


In [63]:
from sklearn.preprocessing import StandardScaler

In [74]:
x1 = data.iloc[:,13:].drop(qst_text+['Q043'], axis=1)

In [75]:
x3 = StandardScaler().fit_transform(x1)

In [76]:
msk = np.random.rand(len(y))<0.8
x_train, y_train, x_test, y_test = x3[msk], y[msk], x3[~msk], y[~msk]

y_pred_test = svm.SVC(kernel='rbf').fit(x_train, y_train).predict(x_test)

acc_scr = accuracy_score(y_test, y_pred_test)

In [77]:
acc_scr

0.7240879439478135

### Conclusion

##### Preliminary classification models did not offer good accuracies. Higher until 26/11 was 0.77 for k=8 using KNN. Next step should be to rethink the feature selection process (to come up with one). To study the models theory deeper could also work.