# PCA/Cluster Notebook
[Return to project overview](final_project_overview.ipynb)

### Andrew Larimer, Deepak Nagaraj, Daniel Olmstead, Michael Winton (W207-4-Summer 2018 Final Project)

In [None]:
# import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from util import our_train_test_split

# Notebook-wide pandas display settings: never truncate columns, and show
# up to 200 rows before a frame is abbreviated.
pd.options.display.max_columns = None
pd.options.display.max_rows = 200

%matplotlib inline

In [None]:
# Read the cleaned, merged dataset from CSV.
merged_df = pd.read_csv('data_merged/combined_data_2018-07-18.csv')

# Columns carried into this notebook (the numeric ones). Entries that are
# commented out are deliberately excluded. The order here is the column order
# of X, so keep it stable.
features_to_keep = [
    # label column (separated from the features in the next cell)
    'high_registrations',

    # excluded: 'district'
    # excluded: 'zip'
    'community_school',
    'economic_need_index',
    # excluded: 'school_income_estimate'

    'percent_ell',
    'percent_asian',
    'percent_black',
    'percent_hispanic',
    'percent_black__hispanic',
    'percent_white',

    'student_attendance_rate',
    'percent_of_students_chronically_absent',

    'rigorous_instruction_percent',
    'rigorous_instruction_rating',
    'collaborative_teachers_percent',
    'collaborative_teachers_rating',
    'supportive_environment_percent',
    'supportive_environment_rating',
    'effective_school_leadership_percent',
    'effective_school_leadership_rating',
    'strong_family_community_ties_percent',
    'strong_family_community_ties_rating',
    'trust_percent',
    'trust_rating',
    'student_achievement_rating',

    'average_ela_proficiency',
    'average_math_proficiency',

    'grade_7_ela_all_students_tested',
    'grade_7_ela_4s_all_students',
    'grade_7_ela_4s_american_indian_or_alaska_native',
    'grade_7_ela_4s_black_or_african_american',
    'grade_7_ela_4s_hispanic_or_latino',
    'grade_7_ela_4s_asian_or_pacific_islander',
    'grade_7_ela_4s_white',
    'grade_7_ela_4s_multiracial',
    'grade_7_ela_4s_limited_english_proficient',
    'grade_7_ela_4s_economically_disadvantaged',

    'grade_7_math_all_students_tested',
    'grade_7_math_4s_all_students',
    'grade_7_math_4s_american_indian_or_alaska_native',
    'grade_7_math_4s_black_or_african_american',
    'grade_7_math_4s_hispanic_or_latino',
    'grade_7_math_4s_asian_or_pacific_islander',
    'grade_7_math_4s_white',
    'grade_7_math_4s_multiracial',
    'grade_7_math_4s_limited_english_proficient',
    'grade_7_math_4s_economically_disadvantaged',

    'number_of_students_english',
    'number_of_students_math',
    'number_of_students_science',
    'number_of_students_social_studies',
    'number_of_classes_english',
    'number_of_classes_math',
    'number_of_classes_science',
    'number_of_classes_social_studies',
    'average_class_size_english',
    'average_class_size_math',
    'average_class_size_science',
    'average_class_size_social_studies',
    'school_pupil_teacher_ratio',
]

# Restrict the merged frame to the selected columns and preview the result.
X = merged_df[features_to_keep]
X.head()

In [None]:
# Separate the label from the features.
# Both are derived from merged_df rather than from X itself, so this cell can
# be re-run safely: reading from and reassigning X would raise a KeyError on
# the second run, because 'high_registrations' would already be dropped.
y = merged_df['high_registrations']
X = merged_df[features_to_keep].drop(['high_registrations'], axis=1)

In [None]:
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed
# in 0.22. Prefer its replacement and fall back to the old class only on
# installs that predate it, so the cell runs on either.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer

# Replace every NaN with the mean of its column.
# NOTE(review): the imputer is fit on the full dataset, before the train/test
# split below, so test rows contribute to the column means. Consider fitting
# it on the training split only.
imp = Imputer(missing_values=np.nan, strategy='mean')

# fit_transform returns a bare ndarray; restore the column names and row index
# so X_i stays aligned with X and y.
X_i = pd.DataFrame(imp.fit_transform(X), columns=X.columns, index=X.index)
X_i.head()

In [None]:
# Partition the imputed full dataset by label:
# rows with y == 1 (high-registrant) and rows with y == 0 (low-registrant).
X_pos = X_i.loc[y == 1]
X_neg = X_i.loc[y == 0]

In [None]:
from functools import partial
from sklearn.model_selection import train_test_split
import util

# Split features and labels into train/test sets via the project helper,
# passing the labels as `stratify`.
# NOTE(review): our_train_test_split comes from util and is not visible here;
# presumably it wraps sklearn's train_test_split - confirm its defaults
# (test size, random seed) there.
(train_data, test_data,
 train_labels, test_labels) = our_train_test_split(X_i, y, stratify=y)
train_data.head()

In [None]:
# Partition only the training rows by label:
# train_labels == 1 (high-registrant) and train_labels == 0 (low-registrant).
train_pos = train_data.loc[train_labels == 1]
train_neg = train_data.loc[train_labels == 0]

In [None]:
from sklearn.decomposition import SparsePCA
from sklearn.preprocessing import StandardScaler

# Standardise the imputed features of the full dataset.
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X_i)

# Reduce to two sparse principal components.
pca = SparsePCA(n_components=2, alpha=2, random_state=207)
pc = pca.fit_transform(scaled_X)

# Put the two components in a frame and append the label column beside them.
pcdf = pd.concat(
    [pd.DataFrame(data=pc, columns=['pc1', 'pc2']), y],
    axis=1,
)

# Scatter the two components, one colour per label value.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(1, 1, 1)
ax.set_title('2-component PCA for SHSAT Registration', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)

# Drawing order (label 1 first, then label 0) must match the legend entries.
for label_value, colour in ((1, 'blue'), (0, 'red')):
    subset = pcdf[pcdf['high_registrations'] == label_value]
    ax.scatter(subset['pc1'], subset['pc2'], c=colour, s=50)

ax.legend(['High Registrations', 'Low Registrations'])
ax.grid()

In [None]:
from sklearn.mixture import GaussianMixture
import matplotlib.patches as mpatches
from matplotlib.colors import LogNorm

# Standardise with statistics learned from the TRAINING split only, then apply
# that same fitted transform to the test split. Re-fitting the scaler on the
# test data would standardise it with its own mean/variance, putting it on a
# different scale from the data the PCA and GMMs below were fit on.
scaled_train_X = scaler.fit_transform(train_data)
scaled_test_X = scaler.transform(test_data)

# Fit a 2-component sparse PCA on the scaled training data.
pca = SparsePCA(n_components=2, alpha=2, random_state=207)
pc = pca.fit_transform(scaled_train_X)

# Training projections for the positive (label == 1) examples
pc_pos = pc[train_labels==1]
# Training projections for the negative (label == 0) examples
pc_neg = pc[train_labels==0]

# Fit a GMM for the positive examples
clfpos = GaussianMixture(n_components=4, covariance_type='full', random_state=207)
clfpos.fit(pc_pos)
# Fit a GMM for the negative examples
clfneg = GaussianMixture(n_components=4, covariance_type='full', random_state=207)
clfneg.fit(pc_neg)

# Project the test data with the PCA fitted on the training data
pca_test = pca.transform(scaled_test_X)
# Test projections split by their true label
pca_test_pos = pca_test[test_labels==1]
pca_test_neg = pca_test[test_labels==0]
# Log-likelihood of every test point under the positive and the negative GMM
score1 = clfpos.score_samples(pca_test)
score0 = clfneg.score_samples(pca_test)

# Predict 1 wherever the positive GMM is at least as likely as the negative one,
# giving an array shaped like test_labels
predicted_result = (score1 >= score0).astype(int)
# Accuracy is the fraction of predictions that match the true labels
print("Accuracy: {:.2f}%".format(np.mean(predicted_result == test_labels)*100))

# The rest of this is to generate the plots
# Boolean masks over the test set for each cell of the confusion matrix
pos_correct = np.logical_and(test_labels==1, predicted_result==1) # True Positives
neg_correct = np.logical_and(test_labels==0, predicted_result==0) # True Negatives
pos_wrong = np.logical_and(test_labels==0, predicted_result==1) # False Positives
neg_wrong = np.logical_and(test_labels==1, predicted_result==0) # False Negatives

# Select the projected test points belonging to each of those groups
pca_pred_pos_y = pca_test[pos_correct]
pca_pred_neg_y = pca_test[neg_correct]
pca_pred_pos_n = pca_test[pos_wrong]
pca_pred_neg_n = pca_test[neg_wrong]

# display predicted scores by the model as a contour plot
fig = plt.figure(figsize=(20,10))
red_patch = mpatches.Patch(color='red', label='Classified Incorrectly')
blue_patch = mpatches.Patch(color='blue', label='Classified Correctly')
# Evaluation grid for the contours.
# NOTE(review): these limits are hard-coded; presumably they were chosen to
# cover the projected points - verify them if the PCA or scaling changes.
rangex = np.linspace(-.15, .45)
rangey = np.linspace(-.3, .2)
rangeX, rangeY = np.meshgrid(rangex, rangey)
XX = np.array([rangeX.ravel(), rangeY.ravel()]).T

# Left panel: points predicted positive, over the positive GMM's contours
# NOTE(review): abs() of the log-likelihood is what gets contoured, so a
# positive log-likelihood looks the same as an equally negative one - confirm
# this is intended rather than the negative log-likelihood.
Z = abs(clfpos.score_samples(XX))
Z = Z.reshape(rangeX.shape)

ax = fig.add_subplot(1,2,1)
ax.contour(rangeX, rangeY, Z, norm=LogNorm(vmin=1.0, vmax=20.0), levels=np.logspace(0, 2, 10))
ax.set_title('Test Values plotted in Positive Training Space \n (n_components=4, covariance_type=full)', fontsize=15)
ax.scatter(pca_pred_pos_n[:, 0], pca_pred_pos_n[:, 1], 40, c='red')   # false positives
ax.scatter(pca_pred_pos_y[:, 0], pca_pred_pos_y[:, 1], 20, c='blue')  # true positives
ax.axis('equal')
ax.legend(handles=[blue_patch, red_patch])
ax.grid()

# Right panel: points predicted negative, over the negative GMM's contours
Z2 = abs(clfneg.score_samples(XX))
Z2 = Z2.reshape(rangeX.shape)

ax = fig.add_subplot(1,2,2)
ax.contour(rangeX, rangeY, Z2, norm=LogNorm(vmin=1.0, vmax=20.0), levels=np.logspace(0, 2, 10))
ax.set_title('Test Values plotted in Negative Training Space \n (n_components=4, covariance_type=full)', fontsize=15)
ax.scatter(pca_pred_neg_n[:, 0], pca_pred_neg_n[:, 1], 40, c='red')   # false negatives
ax.scatter(pca_pred_neg_y[:, 0], pca_pred_neg_y[:, 1], 20, c='blue')  # true negatives
ax.legend(handles=[blue_patch, red_patch])
ax.axis('equal')
ax.grid()

plt.show()