In [4]:
import numpy as np 
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline

from datetime import date
import seaborn as sns
import random 
import warnings
import operator
warnings.filterwarnings("ignore")

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

In [5]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from collections import defaultdict
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler

class PFA(object):
    """Principal Feature Analysis: keep a subset of the original columns.

    Each input column is described by its row in the PCA loading matrix of the
    standardised data.  Those rows are clustered with k-means and, for every
    cluster, the column whose loading vector lies closest to the cluster
    centre is kept.

    Parameters
    ----------
    n_features : int
        Number of k-means clusters; one column is kept per cluster that
        receives at least one member.
    q : int, optional
        Number of principal components used to describe each column.  When
        falsy (the default ``None``) each call to ``fit`` uses ``X.shape[1]``.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``PCA`` and ``KMeans``.  Pass an int to make the selected
        columns reproducible; ``None`` keeps the libraries' default
        (unseeded) behaviour.

    Attributes
    ----------
    q_ : int
        Number of components actually used by the last ``fit``.
    indices_ : list of int
        Positional indices of the kept columns, ordered by the first
        appearance of their cluster when scanning the columns left to right.
    features_ : ndarray
        The standardised input restricted to ``indices_``.
    """

    def __init__(self, n_features, q=None, random_state=None):
        self.q = q
        self.n_features = n_features
        self.random_state = random_state

    def fit(self, X):
        """Select representative columns of ``X`` and return ``self``.

        ``X`` is a 2-D array-like with one column per candidate feature; it is
        standardised internally before PCA.
        """
        # Resolve the default per call instead of overwriting ``self.q``:
        # otherwise a second fit on data with a different number of columns
        # would silently reuse the width of the first data set.
        n_components = self.q if self.q else X.shape[1]

        X = StandardScaler().fit_transform(X)

        pca = PCA(n_components=n_components, random_state=self.random_state).fit(X)
        # One row per original column, one entry per retained component.
        A_q = pca.components_.T

        kmeans = KMeans(n_clusters=self.n_features, random_state=self.random_state).fit(A_q)
        clusters = kmeans.predict(A_q)

        # Distance of every column's loading vector to its own cluster centre,
        # computed in one vectorised step rather than one call per row.
        dists = np.linalg.norm(A_q - kmeans.cluster_centers_[clusters], axis=1)

        # dict.fromkeys keeps the first-appearance order of the cluster labels;
        # argmin returns the first minimum, so ties go to the lowest column index.
        indices = []
        for label in dict.fromkeys(clusters.tolist()):
            members = np.flatnonzero(clusters == label)
            indices.append(int(members[np.argmin(dists[members])]))

        self.q_ = n_components
        self.indices_ = indices
        self.features_ = X[:, self.indices_]
        return self

In [7]:
# Names of the columns that hold percentage values and need converting to float
pct_features = [
    "Student Attendance Rate",
    "Percent of Students Chronically Absent",
    "Rigorous Instruction %",
    "Collaborative Teachers %",
    "Supportive Environment %",
    "Effective School Leadership %",
    "Strong Family-Community Ties %",
    "Trust %",
    "Percent Black",
    "Percent White",
    "Percent Asian",
    "Percent Hispanic",
    "Percent Black / Hispanic",
    "Percent ELL",
]

# Integer codes for the textual rating labels, from "Not Meeting Target" (0)
# up to "Exceeding Target" (3)
mapping_dict = {"Not Meeting Target":0, "Approaching Target":1, "Meeting Target":2, "Exceeding Target" : 3}

def label_encoding(df, column_name):
    """Replace the rating labels in ``df[column_name]`` with integer codes, in place.

    Labels are translated through ``mapping_dict``; missing values and labels
    that are not in the mapping become -1.  The column ends up with an
    integer dtype.  ``df`` is modified and nothing is returned.
    """
    encoded = df[column_name].map(mapping_dict)
    # Assign the result back explicitly: calling fillna(inplace=True) on
    # df[column_name] operates on a temporary and is not written back to df
    # when pandas Copy-on-Write is active, which then makes astype(int) fail
    # on the remaining NaN values.
    df[column_name] = encoded.fillna(-1).astype(int)

# Columns holding textual rating labels
rating_cols = [
    'Rigorous Instruction Rating',
    'Collaborative Teachers Rating',
    'Supportive Environment Rating',
    'Effective School Leadership Rating',
    'Strong Family-Community Ties Rating',
    'Trust Rating',
    'Student Achievement Rating',
]



In [8]:
# Read the raw table from the CSV file in the working directory
school_reg = pd.read_csv("2016 School Explorer.csv")

# Keep only the rows whose Economic Need Index is a finite number
school_reg = school_reg.loc[np.isfinite(school_reg['Economic Need Index'])]

# Binary-encode the community-school flag: 'Yes' -> 1, anything else -> 0
school_reg['Community School?'] = school_reg['Community School?'].map(
    lambda flag: int(flag == 'Yes')
)

# Convert percent values to float
def convert_percent(x):
    """Parse a percentage-like value into a float.

    Surrounding whitespace, ``%`` signs and thousands separators (``,``) are
    removed before conversion, so ``"94%"`` becomes ``94.0``.  Values that are
    already numeric are returned as floats.

    Missing values (``None``, NaN, ``pd.NA``, the text ``"nan"`` in any case,
    or an empty / whitespace-only string) are returned as ``np.nan``.

    Raises
    ------
    ValueError
        If the cleaned text is not a valid float literal.
    """
    # Strings are checked after cleaning; pd.isna covers None, NaN and pd.NA.
    if not isinstance(x, str) and pd.isna(x):
        return np.nan

    text = str(x).strip()
    if not text or text.lower() == 'nan':
        return np.nan

    return float(text.replace('%', '').replace(',', ''))

# Parse every percentage column into floats
for cn in pct_features:
    school_reg[cn] = school_reg[cn].apply(convert_percent)

# Label encoding all the Rating features
for c in rating_cols:
    label_encoding(school_reg, c)

# Focus on Grade 08; take an explicit copy so the assignments below act on an
# independent frame rather than on a slice of the unfiltered data
school_reg = school_reg[school_reg['Grade High'] == '08'].copy()

# Final cleanup of NaN data.  The result is assigned back: a chained
# fillna(inplace=True) on school_reg[col] works on a temporary and is not
# written back when pandas Copy-on-Write is active.
for proficiency_col in ('Average ELA Proficiency', 'Average Math Proficiency'):
    school_reg[proficiency_col] = school_reg[proficiency_col].fillna(0)

# Deliberately NOT drop=True: the old index is kept as a new leading 'index'
# column, and the positional column slicing done later relies on that offset.
school_reg = school_reg.reset_index()

In [9]:
# Use the fully qualified option names: the short pattern 'max_columns' also
# matches 'styler.render.max_columns' on pandas >= 1.4, which makes
# set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_info_columns', 200)

# Drop the leading columns (positions 0-15) and the income estimate column.
# NOTE(review): the positional offsets below are tied to the CSV column order
# and to the 'index' column added by reset_index() - confirm when the input changes.
school_reg_clean_raw = school_reg.iloc[:, 16:].drop(columns='School Income Estimate')

# First 25 remaining columns
school_reg_clean1 = school_reg_clean_raw.iloc[:, :25]

# Columns from position 125 onward (judging by the selected names printed
# further down, these are the Grade 8 test columns - TODO confirm)
school_reg_clean2 = school_reg_clean_raw.iloc[:, 125:]

# Side-by-side concatenation of the two column groups
school_reg_clean = pd.concat([school_reg_clean1, school_reg_clean2], axis=1)

In [10]:
# Standardise the selected columns and run Principal Feature Analysis,
# asking for 15 representative columns.
# NOTE: PFA.fit applies its own StandardScaler to its input, so the data is
# standardised a second time inside fit.
# NOTE: no seed is passed to the k-means step inside PFA, so the indices
# produced here may differ between runs.
x = StandardScaler().fit_transform(school_reg_clean)
pfa = PFA(n_features=15)
pfa.fit(x)

# Standardised matrix restricted to the kept columns
X = pfa.features_

# Positional indices (into school_reg_clean's columns) of the kept features
column_indices = pfa.indices_
column_indices

[0, 8, 2, 3, 4, 6, 11, 14, 20, 22, 31, 25, 28, 35, 44]

In [11]:
# Translate the kept positional indices back into column names
school_reg_clean.columns.values[column_indices]

array(['Community School?', 'Student Attendance Rate', 'Percent ELL',
       'Percent Asian', 'Percent Black', 'Percent Black / Hispanic',
       'Rigorous Instruction Rating', 'Supportive Environment %',
       'Trust %', 'Student Achievement Rating', 'Grade 8 ELA 4s - White',
       'Grade 8 ELA - All Students Tested',
       'Grade 8 ELA 4s - Black or African American',
       'Grade 8 Math - All Students Tested',
       'Grade 8 Math 4s - Economically Disadvantaged'], dtype=object)