## Notebook Summary
#### *Capstone: Pre-Processing/Feature Engineering #1*
---
This notebook contains the pre-processing and feature engineering for model #1.

In [1]:
# import packages
import pandas as pd
import numpy as np
import re
import pickle
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures

# show every row when a frame or series is displayed (no truncation)
pd.set_option('display.max_rows', None)
# pd.set_option('max_colwidth', 150)

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation and copy warnings raised by pandas / scikit-learn
warnings.filterwarnings('ignore')

## Pre-Processing & Feature Engineering K-8 Dataset

In [2]:
# Load the K-8 dataset.
# The two identifier columns are read as strings rather than numbers.
cols = ['rcdts', 'nces_id']
dict_dtypes = dict.fromkeys(cols, 'str')

# read the CSV and discard the 'Unnamed: 0', 'latcod' and 'loncod' columns in one chain
df_k8 = (
    pd.read_csv('../Capstone/cleaned_datasets/pre-processing/df_k8.csv', dtype=dict_dtypes)
      .drop(columns=['Unnamed: 0', 'latcod', 'loncod'])
)

In [3]:
# list each column with its non-null count and dtype
df_k8.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9042 entries, 0 to 9041
Data columns (total 61 columns):
 #   Column                                                          Non-Null Count  Dtype  
---  ------                                                          --------------  -----  
 0   rcdts                                                           9042 non-null   object 
 1   nces_id                                                         9042 non-null   object 
 2   school_year                                                     9042 non-null   object 
 3   county                                                          9042 non-null   object 
 4   district_size                                                   9032 non-null   object 
 5   summative_designation                                           8888 non-null   float64
 6   state_senate_district                                           9042 non-null   float64
 7   state_representative_district                      

In [4]:
# Collapse the detailed region_type labels into a smaller set of groups.
# Keys are the labels as they appear in the data; values are the grouped label.
region_groups = {
    '11-City: Large': 'City - Large',
    '12-City: Mid-size': 'City - Small/Mid-Size',
    '13-City: Small': 'City - Small/Mid-Size',
    '21-Suburb: Large': 'Suburb - Large',
    '22-Suburb: Mid-size': 'Suburb - Small/Mid-Size',
    '23-Suburb: Small': 'Suburb - Small/Mid-Size',
    '31-Town: Fringe': 'Town',
    '32-Town: Distant': 'Town',
    '33-Town: Remote': 'Town',
    '41-Rural: Fringe': 'Rural',
    '42-Rural: Distant': 'Rural',
    '43-Rural: Remote': 'Rural',
}

# A single mapping-based replace, scoped to the region_type column.
# This avoids using a list comprehension purely for its side effects (which
# displayed a list of None) and avoids rescanning every column of the frame
# once per label.
df_k8['region_type'] = df_k8['region_type'].replace(region_groups)

[None, None, None, None, None, None, None, None, None, None, None, None]

In [5]:
# Remove the two legislative-district columns; drop() hands back a new frame
df_k8 = df_k8.drop(columns=['state_senate_district', 'state_representative_district'])

### Train-Test-Split dataset

In [6]:
# Separate the target from the predictors.
# total_incidents is the target; y also carries the rcdts identifier alongside it.
y = df_k8.loc[:, ['rcdts', 'total_incidents']]
X = df_k8.drop(columns='total_incidents')

In [7]:
# Hold out 33% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.33, random_state=17
)

# report the dimensions of each piece
for label, frame in [('X_train', X_train), ('y_train', y_train),
                     ('X_test', X_test), ('y_test', y_test)]:
    print(f'{label}:', frame.shape)

X_train: (6058, 58)
y_train: (6058, 2)
X_test: (2984, 58)
y_test: (2984, 2)


In [8]:
# Set the identifier columns aside and remove them from the feature frames.
# drop() is used without inplace=True: X_train / X_test come out of
# train_test_split, and mutating such derived frames in place can raise
# pandas' SettingWithCopyWarning (which the global warnings filter would hide).
id_cols = ['nces_id', 'rcdts']

X_train_key = X_train[id_cols]
X_train = X_train.drop(columns=id_cols)

X_test_key = X_test[id_cols]
X_test = X_test.drop(columns=id_cols)

In [9]:
# count missing values per training column, largest first,
# then display only the columns that actually have some
nulls = X_train.isna().sum().to_frame().sort_values(by=0, ascending=False)
nulls.loc[nulls[0].ne(0)]

Unnamed: 0,0
%_math_proficiency_low_income,549
%_ela_proficiency_low_income,549
%_math_proficiency,371
%_ela_proficiency,371
summative_designation,109
avg_class_size_all_grades,97
fte,70
stuteratio,70
teacher_retention_rate,67
student_chronic_truancy_rate,53


#### Impute 2 categorical with most frequent b/c it's a very small percentage of nulls

In [10]:
# Fill missing categories with a fixed label, applied identically to train and test.
# The labels were chosen as the most frequent value (checked with value_counts).
fill_values = {'region_type': 'Suburb - Large', 'district_size': 'LARGE'}

for col, fill in fill_values.items():
    X_train[col] = X_train[col].replace(np.nan, fill)
    X_test[col] = X_test[col].replace(np.nan, fill)

#### Build pipeline to dummy specific categorical columns

In [11]:
# Build the one-hot-encoding step for the categorical columns

# One-hot encoder: dense float output, first level of each feature dropped,
# categories unseen during fit ignored at transform time.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and later
# removed the old name, so try the new keyword first and fall back to the old
# one on installs that do not know it yet (unknown keyword -> TypeError).
ohe_kwargs = dict(drop='first', handle_unknown='ignore', dtype='float')
try:
    ohe = OneHotEncoder(sparse_output=False, **ohe_kwargs)
except TypeError:
    ohe = OneHotEncoder(sparse=False, **ohe_kwargs)

# Column transformer: encode the listed columns, pass every other column through unchanged
imp = make_column_transformer(
            (ohe, ['region_type', 'school_year', 'charter', 'county', 'district_size', 'title_i_status']),
            remainder = 'passthrough',
            verbose_feature_names_out=False)  # keep output names free of a transformer prefix

In [12]:
# Fit the encoder on the training rows, then apply the fitted encoder to the test rows.
# The resulting arrays are wrapped back into DataFrames labelled with the
# transformer's output feature names.
train_encoded = imp.fit_transform(X_train)
test_encoded = imp.transform(X_test)
ohe_feature_names = imp.get_feature_names_out()

X_train_ohe = pd.DataFrame(train_encoded, columns=ohe_feature_names)
X_test_ohe = pd.DataFrame(test_encoded, columns=ohe_feature_names)

#### Create column transformer to scale & KNN impute nulls

In [13]:
# Columns handed to the KNN imputer. Defined once so that the scaler's column
# selection below cannot drift out of sync with the imputer's list.
cols_to_impute = [
    '%_math_proficiency_low_income', '%_ela_proficiency_low_income',
    '%_math_proficiency', '%_ela_proficiency', 'summative_designation',
    'avg_class_size_all_grades', 'fte', 'stuteratio',
    'teacher_retention_rate', 'chronically_truant_students',
    'student_chronic_truancy_rate', 'student_attendance_rate',
    'principal_turnover_within_6_years',
    '$_instructional_expenditure_per_pupil', '%_general_state_aid',
    '%_local_property_taxes', '%_federal_funding',
    'student_mobility_rate_female', 'student_mobility_rate_male',
    'student_mobility_rate', 'total_number_of_school_days',
    'inc_to_pov_ratio',
]

#instantiate standard scaler; it is applied to every column that is NOT in cols_to_impute
sc = StandardScaler()
cols_to_scale = list(X_train_ohe.drop(columns=cols_to_impute))

#instantiate knnimputer (default settings)
knn_imputer = KNNImputer()

# making a column transformer: scale one group of columns, KNN-impute the other
# NOTE(review): each transformer only receives its own column subset, so the
# imputer computes neighbours from the unscaled cols_to_impute alone - confirm
# this is intended.
knn_imp = make_column_transformer(
            (sc, cols_to_scale),
            (knn_imputer, cols_to_impute),
            remainder = 'passthrough',
            verbose_feature_names_out=False)  #we need false so that the feature names come out in the form we want

In [14]:
# Fit the scale/impute transformer on the encoded training frame and apply it to both splits.
# The outputs are rebuilt as DataFrames using the transformer's feature names.
train_imputed = knn_imp.fit_transform(X_train_ohe)
test_imputed = knn_imp.transform(X_test_ohe)
imputed_feature_names = knn_imp.get_feature_names_out()

X_train_imp = pd.DataFrame(train_imputed, columns=imputed_feature_names)
X_test_imp = pd.DataFrame(test_imputed, columns=imputed_feature_names)

### Rescale the entire dataframe after imputation

In [15]:
# a fresh scaler, fitted on the imputed training frame and reused for the test frame
sc = StandardScaler()

train_scaled = sc.fit_transform(X_train_imp)
test_scaled = sc.transform(X_test_imp)
scaled_feature_names = sc.get_feature_names_out()

# back to DataFrames with the scaler's feature names as columns
Z_train = pd.DataFrame(train_scaled, columns=scaled_feature_names)
Z_test = pd.DataFrame(test_scaled, columns=scaled_feature_names)

In [16]:
# print the dimensions of the feature and target frames for both splits
for frame in (Z_train, y_train, Z_test, y_test):
    print(frame.shape)

(6058, 165)
(6058, 2)
(2984, 165)
(2984, 2)


In [17]:
# write the scaled train/test feature frames to the modeling folder
for frame, csv_path in (
    (Z_train, '../Capstone/cleaned_datasets/modeling/Z_train.csv'),
    (Z_test, '../Capstone/cleaned_datasets/modeling/Z_test.csv'),
):
    frame.to_csv(csv_path)

### Remove multicollinear features using the variance inflation factor (VIF)

In [18]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [19]:
# work on a separate copy of Z_train: vif_dropper drops columns in place,
# and Z_train itself is still needed afterwards
Z_train_VIF = Z_train.copy(deep=True)

In [20]:
def vif_dropper(df, threshold=5):
    """Iteratively drop the feature with the largest finite VIF.

    On each pass the variance inflation factor of every column is recomputed.
    The column with the largest finite VIF is dropped, and the loop stops as
    soon as that largest value is below ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature frame. It is modified IN PLACE: columns are dropped
        from the object the caller passed in.
    threshold : float, default 5
        Stop once the largest finite VIF is below this value.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, after the drops.

    Notes
    -----
    Columns whose VIF is infinite or NaN are never selected for dropping.
    If no column has a finite VIF, or fewer than two columns remain, the loop
    stops instead of raising.
    """
    df_new = df  # same object as the caller's frame, so the drops below are visible to the caller

    # with a single column there is nothing left to be collinear with
    while df_new.shape[1] > 1:
        values = df_new.values  # extracted once per pass rather than once per feature
        vifs = pd.Series(
            [variance_inflation_factor(values, i) for i in range(values.shape[1])],
            index=df_new.columns,
        )

        # only finite VIFs are candidates for dropping
        finite_vifs = vifs[np.isfinite(vifs)]
        if finite_vifs.empty or finite_vifs.max() < threshold:
            break

        # idxmax returns the first column holding the maximum
        df_new.drop(columns=finite_vifs.idxmax(), inplace=True)

    return df_new

In [21]:
#commenting out so I don't rerun
# Z_train_VIF_post = vif_dropper(Z_train_VIF)

In [22]:
# Load the cached VIF-pruned frame if it exists; otherwise run the pruning once
# and cache the result so later runs can skip it.
# Previously the frame was pickled unconditionally while the vif_dropper call
# was commented out, so a fresh run overwrote the cache with the unpruned copy.
# NOTE: a pickle written by such a run may still be on disk - delete it to
# force a recompute.
vif_pickle_path = '../Capstone/pickles/Z_train_VIF_post.pkl'
try:
    Z_train_VIF_post = pd.read_pickle(vif_pickle_path)
except FileNotFoundError:
    Z_train_VIF_post = vif_dropper(Z_train_VIF)
    Z_train_VIF_post.to_pickle(vif_pickle_path)

In [23]:
# load the pickled post-VIF training frame
Z_train_VIF_post = pd.read_pickle('../Capstone/pickles/Z_train_VIF_post.pkl')

# display its dimensions
Z_train_VIF_post.shape
# 13 features were dropped due to multicollinearity
# NOTE(review): the recorded output of the following column-difference cell is
# an empty set, which suggests the pickle on disk may hold the unpruned frame -
# confirm the shape shown here really has 13 fewer columns than Z_train

In [24]:
# columns present in Z_test but absent from the post-VIF training frame
set(Z_test.columns).difference(Z_train_VIF_post.columns)

set()

In [25]:
# Features to remove from both splits
# (presumably the 13 columns identified by the VIF pass - verify against its result)
vif_todrop = [
    '%_ela_proficiency',
    '%_local_property_taxes',
    '%_math_proficiency',
    '%_student_enrollment_hispanic_or_latino',
    '%_student_enrollment_low_income',
    '%_student_enrollment_white',
    'county_Cook',
    'region_type_Rural',
    'student_chronic_truancy_rate',
    'student_enrollment_total',
    'student_mobility_rate',
    'student_mobility_rate_male',
    'title_i_status_Yes',
]

# the same drop is applied to train and test so their columns stay identical
X_train_final, X_test_final = (
    frame.drop(columns=vif_todrop) for frame in (Z_train, Z_test)
)

In [26]:
# write the final train/test features and targets for model 1
final_exports = {
    '../Capstone/cleaned_datasets/modeling/model_1/X_train_final.csv': X_train_final,
    '../Capstone/cleaned_datasets/modeling/model_1/X_test_final.csv': X_test_final,
    '../Capstone/cleaned_datasets/modeling/model_1/y_train.csv': y_train,
    '../Capstone/cleaned_datasets/modeling/model_1/y_test.csv': y_test,
}
for csv_path, frame in final_exports.items():
    frame.to_csv(csv_path)