######################################################################  
######################################################################  
### --------------------- Setup
######################################################################  
######################################################################  

In [23]:
from pathlib import Path
import os
from joblib import Memory
from sklearn.datasets import load_svmlight_file
from scipy.sparse import csc_matrix
from scipy.special import expit
from scipy import sparse
from sklearn.preprocessing import normalize
from copy import copy
from sklearn import svm
import time
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn import svm
import matplotlib.pyplot as plt
from sklearn import tree
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle
from sklearn.utils import resample
from sklearn.neural_network import MLPClassifier
import plotnine as p9


In [26]:
import pyarrow.feather as feather
import pandas as pd
import numpy as np

# Directory that holds the per-topic feather subsets. Every path below is built
# from this one constant, so relocating the data means editing a single line.
DATA_DIR = "C:/Code/GITHUB/csc/Classification-of-Pell-Institutions/data/datasubsets/csc_variable_subsets"

# Full path of each subset file (variable names kept as-is for any later cell
# that refers to them).
idcsc = f"{DATA_DIR}/id_csc.feather"
cipcsc = f"{DATA_DIR}/cip_csc.feather"
geoloccsc = f"{DATA_DIR}/geolocation_csc.feather"
instdemocsc = f"{DATA_DIR}/inst_demographic_csc.feather"
studdemocsc = f"{DATA_DIR}/stud_demographic_csc.feather"
pcipcsc = f"{DATA_DIR}/pcip_csc.feather"
numcsc = f"{DATA_DIR}/num_csc.feather"

# Load each subset into its own DataFrame.
id_csc = feather.read_feather(idcsc)
cip_csc = feather.read_feather(cipcsc)
geolocation_csc = feather.read_feather(geoloccsc)
inst_demographic_csc = feather.read_feather(instdemocsc)
stud_demographic_csc = feather.read_feather(studdemocsc)
pcip_csc = feather.read_feather(pcipcsc)
num_csc = feather.read_feather(numcsc)

# Seed used wherever a random_state is required (e.g. by split_data).
rand_state = 5991

In [29]:
# FUNCTIONS
def split_data(xx, yy, testsize = 1000):
    """Split features and labels into train, test and validation partitions.

    Two successive ``train_test_split`` calls are made, both seeded with the
    module-level ``rand_state``: the first holds out the test partition, the
    second carves the validation partition out of what remains.

    Parameters
    ----------
    xx : features, passed to ``train_test_split`` as the first array.
    yy : labels, passed to ``train_test_split`` as the second array.
    testsize : forwarded as ``test_size`` to BOTH splits (default 1000).

    Returns
    -------
    tuple
        ``(xtrain, y_train, xtest, y_test, xvalid, y_valid)``; each label
        partition is a numpy array reshaped to a single column.

    The shape of every returned partition is printed, in return order.
    """
    # Stage 1: hold out the test partition.
    x_rest, x_test, y_rest, y_test = train_test_split(
        xx, yy, test_size = testsize, random_state = rand_state
    )
    # Stage 2: split the remainder into train and validation.
    x_train, x_valid, y_rest, y_valid = train_test_split(
        x_rest, y_rest, test_size = testsize, random_state = rand_state
    )

    # Labels become (n, 1) column arrays.
    y_train, y_test, y_valid = (
        np.array(labels).reshape(-1, 1) for labels in (y_rest, y_test, y_valid)
    )

    partitions = (x_train, y_train, x_test, y_test, x_valid, y_valid)
    for part in partitions:
        print(part.shape)

    return partitions


In [31]:
# Sequential row identifier 0..n-1, stored on id_csc as the "ids" column.
# np.arange builds the integer index directly instead of materialising a
# Python range and converting it.
ids = np.arange(len(id_csc))
id_csc["ids"] = ids

In [38]:
# Two-column lookup table: the generated row id next to each UNITID.
justids = id_csc[["ids", "UNITID"]]
justids.head()


Unnamed: 0,ids,UNITID
0,0,100654.0
1,1,100663.0
2,2,100690.0
3,3,100706.0
4,4,100724.0


In [57]:
def _merge_onto_ids(label, frame):
    """Left-join ``frame`` onto ``justids`` and report its shape before/after.

    No ``on=`` is passed, so pandas joins on every column name the two frames
    have in common. Because the join is a left join from ``justids``, the
    result follows the row order of ``justids``.

    Parameters
    ----------
    label : text printed as the heading for this frame.
    frame : DataFrame to merge onto ``justids``.

    Returns
    -------
    The merged DataFrame.
    """
    print(label)
    print("BEFORE:", frame.shape)
    merged = pd.merge(justids, frame, how = "left")
    print("AFTER: ", merged.shape)
    return merged


# Attach the row ids to every variable subset; a blank-ish line separates the
# reports of consecutive frames.
cip_csc = _merge_onto_ids("cip_csc", cip_csc)

print(" ")
geolocation_csc = _merge_onto_ids("geolocation_csc", geolocation_csc)

print(" ")
inst_demographic_csc = _merge_onto_ids("inst_demographic_csc", inst_demographic_csc)

print(" ")
stud_demographic_csc = _merge_onto_ids("stud_demographic_csc", stud_demographic_csc)

print(" ")
pcip_csc = _merge_onto_ids("pcip_csc", pcip_csc)

print(" ")
num_csc = _merge_onto_ids("num_csc", num_csc)

cip_csc
BEFORE: (5879, 193)
AFTER:  (5879, 193)
 
geolocation_csc
BEFORE: (5879, 14)
AFTER:  (5879, 14)
 
inst_demographic_csc
BEFORE: (5879, 16)
AFTER:  (5879, 16)
 
stud_demographic_csc
BEFORE: (5879, 13)
AFTER:  (5879, 13)
 
pcip_csc
BEFORE: (5879, 41)
AFTER:  (5879, 41)
 
num_csc
BEFORE: (5879, 164)
AFTER:  (5879, 164)


######################################################################  
######################################################################  
### --------------------- Imputing missing values
######################################################################  
######################################################################  

### Numeric Values

In [99]:
# Preview the numeric variables only (identifier and name columns removed).
# `columns=` already targets the column axis, so no `axis` argument is needed.
num_csc.drop(columns = ["ids", "UNITID", "INSTNM"]).head()

Unnamed: 0,NUMBRANCH,PCTFLOAN,D_PCTPELL_PCTFLOAN,SCUGFFN,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,...,OMENRUP8_FTNFT,BBRR2_PP_UG_N,BBRR2_FED_UGNOCOMP_N,DBRR4_PP_UG_N,DBRR4_PP_UG_NUM,DBRR4_PP_UG_DEN,DBRR4_PP_UG_RT,TUITIONFEE_IN,TUITIONFEE_OUT,PLUS_DEBT_INST_MD
0,1.0,0.7503,5039.0,1288.0,4990.0,0.0186,0.912,0.0088,0.0018,0.0022,...,0.1697,785.0,1858.0,668.0,10785863.0,8969946.0,1.202445,9744.0,18354.0,14838.0
1,1.0,0.5127,13134.0,2228.0,13186.0,0.5717,0.2553,0.0334,0.0633,0.0034,...,0.2114,597.0,2511.0,535.0,8537186.0,8543933.0,0.99921,8568.0,19704.0,16145.0
2,1.0,0.8962,318.0,5.0,351.0,0.2393,0.7151,0.0171,0.0057,0.0057,...,0.7027,,247.0,,,,,6900.0,6900.0,
3,1.0,0.4192,7090.0,1341.0,7458.0,0.7167,0.0969,0.0528,0.0381,0.0095,...,0.1667,237.0,1014.0,239.0,3494955.0,3914476.0,0.892828,10714.0,22362.0,13524.0
4,1.0,0.7845,4208.0,951.0,3903.0,0.0167,0.9352,0.0095,0.0041,0.0013,...,0.1759,799.0,1708.0,790.0,13008748.0,10302465.0,1.262683,11068.0,19396.0,15351.0


In [102]:
# Columns to impute: everything except the identifier and name columns.
num_vars = num_csc.drop(columns = ["ids", "UNITID", "INSTNM"])
impute_cols = list(num_vars)

# Replace missing values with the median of their own column.
# The result is assigned back to num_csc explicitly: calling
# `num_csc[col].fillna(..., inplace=True)` operates on a temporary Series,
# which pandas warns about and which does not update num_csc at all once
# Copy-on-Write is enabled.
medians = num_csc[impute_cols].median()
num_csc[impute_cols] = num_csc[impute_cols].fillna(medians)

# Count remaining NaNs per column and show only columns that still have some;
# an empty table means the imputation covered everything.
missing = num_csc.isna().sum().to_frame().reset_index()
missing[missing[0] != 0]

Unnamed: 0,index,0
