In [52]:
import pandas as pd
import numpy as np
import sympy as sp
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [17]:
data = pd.read_csv('data2020.student.csv')
data = data.loc[0:999,:]

In [18]:
to_rem = []
to_cat = []
num_cols = []
cat_cols = []

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 34 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      1000 non-null   int64  
 1   Class   1000 non-null   float64
 2   C1      995 non-null    float64
 3   C2      996 non-null    float64
 4   C3      1000 non-null   object 
 5   C4      1000 non-null   int64  
 6   C5      1000 non-null   object 
 7   C6      1000 non-null   int64  
 8   C7      1000 non-null   object 
 9   C8      1000 non-null   int64  
 10  C9      1000 non-null   int64  
 11  C10     1000 non-null   object 
 12  C11     1000 non-null   object 
 13  C12     1000 non-null   object 
 14  C13     1000 non-null   int64  
 15  C14     995 non-null    object 
 16  C15     1000 non-null   int64  
 17  C16     1000 non-null   object 
 18  C17     1000 non-null   object 
 19  C18     1000 non-null   object 
 20  C19     996 non-null    object 
 21  C20     1000 non-null   int64  
 22  C

## Removing attributes with no information.

In [20]:
for col in data:
    if len(data[col].value_counts()) == 1:
        print(col)
        to_rem.append(col)
        data = data.drop(col,axis=1)

C15
C17
C21
C22


## Determining object attributes that should be categorical

In [21]:
for col in data.select_dtypes(include=object):
    data[col] = data[col].astype('category')
    to_cat.append(col)

## Determining numeric attributes that should be categrocial

In [22]:
for col in data.select_dtypes(include=np.number):
    if data[col].nunique() < 5:
        data[col] = data[col].astype('category')
        to_cat.append(col)

In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   ID      1000 non-null   int64   
 1   Class   1000 non-null   category
 2   C1      995 non-null    float64 
 3   C2      996 non-null    category
 4   C3      1000 non-null   category
 5   C4      1000 non-null   category
 6   C5      1000 non-null   category
 7   C6      1000 non-null   int64   
 8   C7      1000 non-null   category
 9   C8      1000 non-null   int64   
 10  C9      1000 non-null   category
 11  C10     1000 non-null   category
 12  C11     1000 non-null   category
 13  C12     1000 non-null   category
 14  C13     1000 non-null   int64   
 15  C14     995 non-null    category
 16  C16     1000 non-null   category
 17  C18     1000 non-null   category
 18  C19     996 non-null    category
 19  C20     1000 non-null   int64   
 20  C23     1000 non-null   category
 21  C24     1000 no

## Setting major missing entries to dummy variables

In [24]:
for col in data:
    if (data[col].isnull().sum() / data.shape[0]) > 0.05:
        if pd.api.types.is_categorical(data[col]):
            data[col] = data[col].cat.add_categories('Unknown')
            data[col] = data[col].fillna('Unknown')
        elif pd.api.types.is_number(data[col]):
            data[col].fillna(0)

## Dropping small numbers of missing entries

In [25]:
data = data.dropna()

In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 982 entries, 0 to 999
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   ID      982 non-null    int64   
 1   Class   982 non-null    category
 2   C1      982 non-null    float64 
 3   C2      982 non-null    category
 4   C3      982 non-null    category
 5   C4      982 non-null    category
 6   C5      982 non-null    category
 7   C6      982 non-null    int64   
 8   C7      982 non-null    category
 9   C8      982 non-null    int64   
 10  C9      982 non-null    category
 11  C10     982 non-null    category
 12  C11     982 non-null    category
 13  C12     982 non-null    category
 14  C13     982 non-null    int64   
 15  C14     982 non-null    category
 16  C16     982 non-null    category
 17  C18     982 non-null    category
 18  C19     982 non-null    category
 19  C20     982 non-null    int64   
 20  C23     982 non-null    category
 21  C24     982 non-

## Removing ID column 

In [27]:
data = data.drop('ID',1)

## Removing duplicate data

In [28]:
data = data.drop_duplicates()

## Determining linearly independent numeric columns

In [29]:
reduced_form, inds = sp.Matrix(data.select_dtypes(include=np.number)).rref()
print('Columns: ' + str(data.select_dtypes(include=np.number).columns))
print('Independent Columns: ' + str(inds))
cols = data.select_dtypes(include=np.number).columns
for i in range(0,len(cols)):
    if i not in inds:
        data = data.drop(cols[i],axis=1)


Columns: Index(['C1', 'C6', 'C8', 'C13', 'C20', 'C26', 'C30'], dtype='object')
Independent Columns: (0, 2, 3, 4, 5)


## Assigning columns to categorical or numeric

In [30]:
for col in data:
    if pd.api.types.is_numeric_dtype(data[col]):
        num_cols.append(col)
    elif col != 'Class':
        cat_cols.append(col)

## Performing hot encoding

In [34]:
dataEnc = OneHotEncoder().fit(data[cat_cols].astype(str))
data = data.reset_index()

In [35]:
data = data.drop(cat_cols,axis=1).join(pd.DataFrame(dataEnc.transform(data[cat_cols].astype(str)).toarray()))

## Splitting test and training 


In [36]:
train_data, test_data = train_test_split(data.loc[0:999,:],test_size = 0.15,random_state = 21)

## Standardising numeric training and test data

In [37]:
dataScaler = StandardScaler().fit(train_data[num_cols])

In [38]:
train_data[num_cols] = dataScaler.transform(train_data[num_cols])
test_data[num_cols] = dataScaler.transform(test_data[num_cols])

## Performing PCA on data

In [39]:
dataPCA = PCA().fit(train_data[num_cols])

In [40]:
train_data[num_cols] = dataPCA.transform(train_data[num_cols])
test_data[num_cols] = dataPCA.transform(test_data[num_cols])

In [41]:
train_data[num_cols].cov()

Unnamed: 0,C1,C8,C13,C20,C26
C1,1.646307,1.97777e-16,3.7996940000000004e-17,-1.306145e-16,1.923595e-16
C8,1.97777e-16,1.062239,6.768205000000001e-17,2.190761e-16,1.270523e-16
C13,3.7996940000000004e-17,6.768205000000001e-17,1.006882,1.8998470000000002e-17,-9.380494000000001e-17
C20,-1.306145e-16,2.190761e-16,1.8998470000000002e-17,0.9270424,-1.306145e-17
C26,1.923595e-16,1.270523e-16,-9.380494000000001e-17,-1.306145e-17,0.3642139


## Test analysis

In [42]:
x_train = train_data.drop('Class',axis=1)
y_train = train_data['Class']
x_test = test_data.drop('Class',axis=1)
y_test = test_data['Class']

In [48]:
knn = KNeighborsClassifier().fit(x_train,y_train)

In [49]:
sum(knn.predict(x_test) == y_test)/len(y_test)

0.6766917293233082

In [50]:
clf = LogisticRegression().fit(x_train,y_train)

In [51]:
sum(clf.predict(x_test) == y_test)/len(y_test)

0.7593984962406015

In [65]:
cnn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(30, 3)).fit(x_train,y_train)

In [66]:
sum(cnn.predict(x_test) == y_test)/len(y_test)

0.7669172932330827

## TODO 
- Use LOF to isolate outliers in training and test
- Change sampling technique to improve data representation
- Cluster training data and append cluster to data