## Use PPCA / SVD to eliminate co-variant / orthogonal features 

In [146]:
#imports
import pandas as pd
import sklearn
from sklearn.decomposition import PCA
from collections import Counter
import numpy as np
from numpy import shape, isnan, nanmean, average, log, cov
from numpy.matlib import repmat
from numpy.random import normal
from numpy.linalg import inv, det, eig
from numpy import identity as eye
from numpy import trace as tr
from scipy.linalg import orth
name = "pyppca" #import for PPCA
from pyppca import ppca

In [147]:
# read in a given CSV file and prepare for TrainTestSplit
df = pd.read_csv("./preprocessed_data/2016-19ChronAbsenteeism.csv", sep=",")
X = df.drop(["Unnamed: 0"], axis=1)
#Y = df['dropout_rate']?
X.shape

(6471, 29)

In [148]:
# date columns need to be converted to int for each dataset
X['AcademicYear'] = X['AcademicYear'].replace(['2016-17', '2017-18','2018-19'], ['1','2','3'])

X

Unnamed: 0,AcademicYear,AggregateLevel,CountryCode,DistrictCode,SchoolCode,CountyName,DistrictName,SchoolName,CharterYN,CAR_RB,...,CAR_GF,CAR_GX,CAR_GZ,CAR_SE,CAR_SD,CAR_SS,CAR_SM,CAR_SF,CAR_SH,CAR_TA
0,1,S,32,10322.0,100057.0,Plumas,Plumas County Office of Education,Plumas County Community,All,,...,,,,,,,,,,
1,2,S,32,10322.0,100057.0,Plumas,Plumas County Office of Education,Plumas County Community,All,,...,,,,,,,,,,
2,3,S,32,10322.0,100057.0,Plumas,Plumas County Office of Education,Plumas County Community,All,,...,,,,,,,,,,90.0
3,1,S,1,61259.0,100065.0,Alameda,Oakland Unified,Oakland Unity High,All,19.0,...,16.9,,,24.1,7.4,15.0,,,,14.7
4,2,S,1,61259.0,100065.0,Alameda,Oakland Unified,Oakland Unity High,All,31.3,...,16.6,,,16.7,8.3,15.0,,,,14.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6466,2,S,30,66464.0,6120356.0,Orange,Capistrano Unified,Opportunities for Learning - Capistrano,All,,...,10.1,,,,18.5,11.7,,,,7.3
6467,3,S,30,66464.0,6120356.0,Orange,Capistrano Unified,Opportunities for Learning - Capistrano,All,0.0,...,0.8,,,0.0,2.6,5.0,,,,3.3
6468,1,S,5,10058.0,9010745.0,Calaveras,Calaveras County Office of Education,Oakendell Community,All,,...,,,,,0.0,4.5,,5.0,,4.5
6469,2,S,5,10058.0,9010745.0,Calaveras,Calaveras County Office of Education,Oakendell Community,All,,...,,,,,7.7,0.0,,0.0,,5.0


In [149]:
#we need to store SchoolName, CountyName before we drop
#we need to initialize this dictionary before running the function
keyz = []
SchoolName_valz = []
DistrictName_valz = []
stringInfo = {}

def get_string_info(df):
    ''' Creates a dictionary with a given schoolcode as the key and an array as the values.
        The contained values are the associated school name and dictrict name, respectively.
        We need to keep this ledger since PCA / SVD require integer-only values. Lastly, we need
        to drop the created string rows.
        Note: should we add County names to this dict?'''
    for col in df['SchoolCode']:
        keyz.append(col)
    for column in df['SchoolName']:
        SchoolName_valz.append(column)
    for col in df['DistrictName']:
        DistrictName_valz.append(column)
    for i, name in enumerate(keyz):
        stringInfo[name] = [SchoolName_valz[i], DistrictName_valz[i]]
    return stringInfo

In [150]:
names_dict = get_string_info(X)
names_dict

{100057.0: ['Plumas County Community', 'Oakendell Community'],
 100065.0: ['Oakland Unity High', 'Oakendell Community'],
 100081.0: ['William J. (Pete) Knight High', 'Oakendell Community'],
 100180.0: ['Mountain Oaks High', 'Oakendell Community'],
 100198.0: ['Golden Valley High', 'Oakendell Community'],
 100255.0: ['Desert Mirage High', 'Oakendell Community'],
 100305.0: ['Cypress Charter High', 'Oakendell Community'],
 100354.0: ['Hawthorne Math and Science Academy', 'Oakendell Community'],
 100420.0: ['Vista Murrieta High', 'Oakendell Community'],
 100529.0: ['Cesar E. Chavez High', 'Oakendell Community'],
 100602.0: ['Lennox Mathematics, Science and Technology Academy',
  'Oakendell Community'],
 100677.0: ['High Tech LA', 'Oakendell Community'],
 100701.0: ['MetWest High', 'Oakendell Community'],
 100750.0: ['Wallis Annenberg High', 'Oakendell Community'],
 100800.0: ['Central City Value', 'Oakendell Community'],
 101162.0: ['Pioneer High', 'Oakendell Community'],
 101196.0: ['ICE

In [151]:
#function that drops DistrictName, SchoolName, CountyName from Dataframe
#we can drop the CharterYN column because there are is 1 unique value (All)
#^same goes for AggregateLevel column

def name_drop(df):
    df = df.drop(['SchoolName','DistrictName', 'CountyName','CharterYN','AggregateLevel'], axis = 1)
    return df

In [152]:
X1 = name_drop(X)
X1

Unnamed: 0,AcademicYear,CountryCode,DistrictCode,SchoolCode,CAR_RB,CAR_RI,CAR_RA,CAR_RF,CAR_RH,CAR_RD,...,CAR_GF,CAR_GX,CAR_GZ,CAR_SE,CAR_SD,CAR_SS,CAR_SM,CAR_SF,CAR_SH,CAR_TA
0,1,32,10322.0,100057.0,,,,,,,...,,,,,,,,,,
1,2,32,10322.0,100057.0,,,,,,,...,,,,,,,,,,
2,3,32,10322.0,100057.0,,,,,,,...,,,,,,,,,,90.0
3,1,1,61259.0,100065.0,19.0,,,,14.1,17.4,...,16.9,,,24.1,7.4,15.0,,,,14.7
4,2,1,61259.0,100065.0,31.3,,,,13.5,15.4,...,16.6,,,16.7,8.3,15.0,,,,14.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6466,2,30,66464.0,6120356.0,,,,,9.1,5.4,...,10.1,,,,18.5,11.7,,,,7.3
6467,3,30,66464.0,6120356.0,0.0,,,,4.6,4.5,...,0.8,,,0.0,2.6,5.0,,,,3.3
6468,1,5,10058.0,9010745.0,,,,,0.0,,...,,,,,0.0,4.5,,5.0,,4.5
6469,2,5,10058.0,9010745.0,,,,,,,...,,,,,7.7,0.0,,0.0,,5.0


In [153]:
#seperate academic year into one-hot-encoded values 
#2016 == Year1
oneHot = pd.get_dummies(X1.AcademicYear)
X1 = pd.concat([oneHot, X1], axis=1)
X1 = X1.drop('AcademicYear', axis = 1)
X1

Unnamed: 0,1,2,3,CountryCode,DistrictCode,SchoolCode,CAR_RB,CAR_RI,CAR_RA,CAR_RF,...,CAR_GF,CAR_GX,CAR_GZ,CAR_SE,CAR_SD,CAR_SS,CAR_SM,CAR_SF,CAR_SH,CAR_TA
0,1,0,0,32,10322.0,100057.0,,,,,...,,,,,,,,,,
1,0,1,0,32,10322.0,100057.0,,,,,...,,,,,,,,,,
2,0,0,1,32,10322.0,100057.0,,,,,...,,,,,,,,,,90.0
3,1,0,0,1,61259.0,100065.0,19.0,,,,...,16.9,,,24.1,7.4,15.0,,,,14.7
4,0,1,0,1,61259.0,100065.0,31.3,,,,...,16.6,,,16.7,8.3,15.0,,,,14.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6466,0,1,0,30,66464.0,6120356.0,,,,,...,10.1,,,,18.5,11.7,,,,7.3
6467,0,0,1,30,66464.0,6120356.0,0.0,,,,...,0.8,,,0.0,2.6,5.0,,,,3.3
6468,1,0,0,5,10058.0,9010745.0,,,,,...,,,,,0.0,4.5,,5.0,,4.5
6469,0,1,0,5,10058.0,9010745.0,,,,,...,,,,,7.7,0.0,,0.0,,5.0


### Because many columns contain NaN values, we will use a Probalistic Principal Componet Analysis (PPCA) model to handle this missing data

In [139]:
### Because many columns contain NaN values, we will use a Probalistic Principal Componet Analysis (PPCA) model to handle this missing data

#Installation of pyppca: https://github.com/el-hult/pyppca#readme
#import ppca
def ppca(df,d,dia):
    """
    Implements probabilistic PCA for data with missing values,
    using a factorizing distribution over hidden states and hidden observations.
    Args:
        df:   (N by D) input DateFrame of data vectors; this is later converted to an N by D numpy array
        d:   (  int  ) dimension of latent space
        dia: (boolean) if True: print objective each step
    Returns:
        C:  (D by d ) C*C' + I*ss is covariance model, C has scaled principal directions as cols
        ss: ( float ) isotropic variance outside subspace
        M:  (D by 1 ) data mean
        X:  (N by d ) expected states
        Ye: (N by D ) expected complete observations (differs from Y if data is missing)
        Based on MATLAB code from J.J. VerBeek, 2006. http://lear.inrialpes.fr/~verbeek
    """
    df = df.astype(float)
    Y = df.to_numpy() #turns df to a NxD numpy array
    N, D = shape(Y)  # N observations in D dimensions (i.e. D is number of features, N is samples)
    threshold = 1E-4  # minimal relative change in objective function to continue
    hidden = np.isnan(Y)
    missing = hidden.sum()

    if missing > 0:
        M = nanmean(Y, axis=0)
    else:
        M = average(Y, axis=0)

    Ye = Y - repmat(M, N, 1)

    if missing > 0:
        Ye[hidden] = 0

    # initialize
    C = normal(loc=0.0, scale=1.0, size=(D, d))
    CtC = C.T @ C
    X = Ye @ C @ inv(CtC)
    recon = X @ C.T
    recon[hidden] = 0
    ss = np.sum((recon - Ye) ** 2) / (N * D - missing)

    count = 1
    old = np.inf

    # EM Iterations
    while (count):
        Sx = inv(eye(d) + CtC / ss)  # E-step, covariances
        ss_old = ss
        if missing > 0:
            proj = X @ C.T
            Ye[hidden] = proj[hidden]

        X = Ye @ C @ Sx / ss  # E-step: expected values

        SumXtX = X.T @ X  # M-step
        C = Ye.T @ X @ (SumXtX + N * Sx).T @ inv(((SumXtX + N * Sx) @ (SumXtX + N * Sx).T))
        CtC = C.T @ C
        ss = (np.sum((X @ C.T - Ye) ** 2) + N * np.sum(CtC * Sx) + missing * ss_old) / (N * D)
        # transform Sx determinant into numpy longdouble in order to deal with high dimensionality
        Sx_det = np.min(Sx).astype(np.longdouble) ** shape(Sx)[0] * det(Sx / np.min(Sx))
        objective = N * D + N * (D * log(ss) + tr(Sx) - log(Sx_det)) + tr(SumXtX) - missing * log(ss_old)

        rel_ch = np.abs(1 - objective / old)
        old = objective

        count = count + 1
        if rel_ch < threshold and count > 5:
            count = 0
        if dia:
            print(f"Objective: {objective:.2f}, Relative Change {rel_ch:.5f}")

    C = orth(C)
    covM = cov((Ye @ C).T)
    vals, vecs = eig(covM)
    ordr = np.argsort(vals)[::-1]
    vecs = vecs[:, ordr]

    C = C @ vecs
    X = Ye @ C

    # add data mean to expected complete data
    Ye = Ye + repmat(M, N, 1)

    return C, ss, M, X, Ye


In [140]:
shape(X1)

(6471, 26)

In [141]:
#returns C, ss, M, X, Ye given dimensions of reduced space
ppca_X = ppca(X1, 5, False)
ppca_X

  M = nanmean(Y, axis=0)


(array([[-8.34757769e-10, -2.12828455e-08],
        [ 3.41300198e-10,  2.64682952e-08],
        [ 4.93457446e-10, -4.91343566e-09],
        [-4.92323639e-06,  3.01863602e-05],
        [-1.60011337e-03,  9.99998687e-01],
        [-9.99998720e-01, -1.60011352e-03],
        [ 9.51784702e-07, -3.35153377e-05],
        [ 2.63977375e-06, -1.93040993e-05],
        [ 5.37453503e-07, -2.97101324e-05],
        [-5.43051953e-08, -4.77544566e-06],
        [-1.65501628e-07, -6.28818617e-05],
        [ 1.56248202e-06, -2.81114382e-05],
        [ 2.60193220e-07, -6.15578424e-06],
        [ 1.10985171e-06, -6.12448834e-05],
        [-1.57690493e-07, -7.92436567e-05],
        [-8.36528876e-08, -7.39144545e-05],
        [-2.17548498e-07, -9.92794308e-05],
        [-7.24803138e-24,  1.87359141e-30],
        [ 3.06254075e-23,  4.02386833e-31],
        [-1.46471333e-08, -6.70871781e-05],
        [-3.71807046e-07, -3.55759673e-05],
        [-2.62538413e-07, -3.84293909e-05],
        [ 1.55386360e-07, -7.273

In [145]:
#returns C, ss, M, X, Ye given dimensions of reduced space
ppca_X[3]

array([[ 2082938.8644985 ,   -49599.05483887],
       [ 2082938.86449851,   -49599.05483883],
       [ 2082938.86448969,   -49599.05828932],
       ...,
       [-6827737.30555925,   -64121.16318815],
       [-6827737.30556773,   -64121.16585855],
       [-6827737.30555941,   -64121.16324455]])