# Assignment 3

In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# import matplotlib
%matplotlib inline
import matplotlib

from mpl_toolkits.mplot3d import Axes3D
from plyfile import PlyData, PlyElement

# Look pretty...
matplotlib.style.use('ggplot')

# Do * NOT * alter this line, until instructed!
scaleFeatures = False

# HELPER FUNCTIONS
import math
from sklearn import preprocessing

# A Note on SKLearn .transform() calls:
#
# Any time you transform your data, you lose the column header names.
# This actually makes complete sense. There are essentially two types
# of transformations,  those that change the scale of your features,
# and those that change your features entire. Changing the scale would
# be like changing centimeters to inches. Changing the features would
# be like using PCA to reduce 300 columns to 30. In either case, the
# original column's units have been altered or no longer exist, so it's
# up to you to rename your columns after ANY transformation. Due to
# this, SKLearn returns an NDArray from *transform() calls.

def scaleFeatures(df):
    # SKLearn has many different methods for doing transforming your
    # features by scaling them (this is a type of pre-processing).
    # RobustScaler, Normalizer, MinMaxScaler, MaxAbsScaler, StandardScaler...
    # http://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing
    #
    # However in order to be effective at PCA, there are a few requirements
    # that must be met, and which will drive the selection of your scaler.
    # PCA required your data is standardized -- in other words it's mean is
    # equal to 0, and it has ~unit variance.
    #
    # SKLearn's regular Normalizer doesn't zero out the mean of your data,
    # it only clamps it, so it's inappropriate to use here (depending on
    # your data). MinMaxScaler and MaxAbsScaler both fail to set a unit
    # variance, so you won't be using them either. RobustScaler can work,
    # again depending on your data (watch for outliers). For these reasons
    # we're going to use the StandardScaler. Get familiar with it by visiting
    # these two websites:
    #
    # http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-scaler
    #
    # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler
    #
    
    # ---------
    # Feature scaling is the type of transformation that only changes the
    # scale and not number of features, so we'll use the original dataset
    # column names. However we'll keep in mind that the _units_ have been
    # altered:
    
    scaled = preprocessing.StandardScaler().fit_transform(df)
    scaled = pd.DataFrame(scaled, columns=df.columns)
    print "New Variances:\n", scaled.var()
    print "New Describe:\n", scaled.describe()
    return scaled

def drawVectors(transformed_features, components_, columns, plt, scaled):
    if not scaled:
        return plt.axes() # No cheating
    
    num_columns = len(columns)
    
    # This funtion will project your *original* feature (columns)
    # onto your principal component feature-space, so that you can
    # visualize how "important" each one was in the
    # multi-dimensional scaling
    
    # Scale the principal components by the max value in
    # the transformed set belonging to that component
    xvector = components_[0] * max(transformed_features[:,0])
    yvector = components_[1] * max(transformed_features[:,1])
    
    ## visualize projections
    
    # Sort each column by it's length. These are your *original*
    # columns, not the principal components.
    important_features = { columns[i] : math.sqrt(xvector[i]**2 + yvector[i]**2) for i in range(num_columns) }
    important_features = sorted(zip(important_features.values(), important_features.keys()), reverse=True)
    print "Features by importance:\n", important_features

    ax = plt.axes()

    for i in range(num_columns):
        # Use an arrow to project each original feature as a
        # labeled vector on your principal component axes
        plt.arrow(0, 0, xvector[i], yvector[i], color='b', width=0.0005, head_width=0.02, alpha=0.75)
        plt.text(xvector[i]*1.2, yvector[i]*1.2, list(columns)[i], color='b', alpha=0.75)

    return ax


## Attribute Information:

1. Age(numerical) age in years 
2. Blood Pressure(numerical) bp in mm/Hg 
3. Specific Gravity(nominal) sg - (1.005,1.010,1.015,1.020,1.025) 
4. Albumin(nominal) al - (0,1,2,3,4,5) 
5. Sugar(nominal) su - (0,1,2,3,4,5) 
6. Red Blood Cells(nominal) rbc - (normal, abnormal) 
7. Pus Cell (nominal) pc - (normal, abnormal) 
8. Pus Cell clumps(nominal) pcc - (present, notpresent) 
9. Bacteria(nominal) ba - (present, notpresent) 
10. Blood Glucose Random(numerical)	bgr in mgs/dl 
11. Blood Urea(numerical) bu in mgs/dl 
12. Serum Creatinine(numerical)	sc in mgs/dl 
13. Sodium(numerical) sod in mEq/L 
14. Potassium(numerical) pot in mEq/L 
15. Hemoglobin(numerical) hemo in gms 
16. Packed Cell Volume(numerical) 
17. White Blood Cell Count(numerical) wc in cells/cumm 
18. Red Blood Cell Count(numerical)	rc in millions/cmm 
19. Hypertension(nominal) htn - (yes,no) 
20. Diabetes Mellitus(nominal) dm - (yes,no) 
21. Coronary Artery Disease(nominal) cad - (yes,no) 
22. Appetite(nominal) appet - (good,poor) 
23. Pedal Edema(nominal) pe - (yes,no)
24. Anemia(nominal) ane - (yes,no) 
25. Class (nominal) class - (ckd,notckd)

In [30]:
# TODO: Load up the dataset and remove any and all
# Rows that have a nan. You should be a pro at this
# by now ;-)
#

df = pd.read_csv('PCA/Datasets/kidney_disease.csv', header=0, index_col=0)
df = df.drop(['classification'], 1)
df = df.dropna()

# check dataframe
df

Unnamed: 0_level_0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
9,53.0,90.0,1.020,2.0,0.0,abnormal,abnormal,present,notpresent,70.0,...,29,12100,3.7,yes,yes,no,poor,no,yes,ckd
11,63.0,70.0,1.010,3.0,0.0,abnormal,abnormal,present,notpresent,380.0,...,32,4500,3.8,yes,yes,no,poor,yes,no,ckd
14,68.0,80.0,1.010,3.0,2.0,normal,abnormal,present,present,157.0,...,16,11000,2.6,yes,yes,yes,poor,yes,no,ckd
20,61.0,80.0,1.015,2.0,0.0,abnormal,abnormal,notpresent,notpresent,173.0,...,24,9200,3.2,yes,yes,yes,poor,yes,yes,ckd
22,48.0,80.0,1.025,4.0,0.0,normal,abnormal,notpresent,notpresent,95.0,...,32,6900,3.4,yes,no,no,good,no,yes,ckd
27,69.0,70.0,1.010,3.0,4.0,normal,abnormal,notpresent,notpresent,264.0,...,37,9600,4.1,yes,yes,yes,good,yes,no,ckd
48,73.0,70.0,1.005,0.0,0.0,normal,normal,notpresent,notpresent,70.0,...,29,18900,3.5,yes,yes,no,good,yes,no,ckd
58,73.0,80.0,1.020,2.0,0.0,abnormal,abnormal,notpresent,notpresent,253.0,...,33,7200,4.3,yes,yes,yes,good,no,no,ckd
71,46.0,60.0,1.010,1.0,0.0,normal,normal,notpresent,notpresent,163.0,...,28,14600,3.2,yes,yes,no,good,no,no,ckd


In [32]:
# TODO: Print out and check your dataframe's dtypes. You'll probably
# want to call 'exit()' after you print it out so you can stop the
# program's execution.
#
# You can either take a look at the dataset webpage in the attribute info
# section: https://archive.ics.uci.edu/ml/datasets/Chronic_Kidney_Disease
# or you can actually peek through the dataframe by printing a few rows.
# What kind of data type should these three columns be? If Pandas didn't
# properly detect and convert them to that data type for you, then use
# an appropriate command to coerce these features into the right type.
#

#df = df.apply(pd.to_numeric)
#df.dtypes

In [33]:
# TODO: PCA Operates based on variance. The variable with the greatest
# variance will dominate. Go ahead and peek into your data using a
# command that will check the variance of every feature in your dataset.
# Print out the results. Also print out the results of running .describe
# on your dataset.
#
# Hint: If you don't see all three variables: 'bgr','wc' and 'rc', then
# you probably didn't complete the previous step properly.
#

# print variance
print (df.var())

# print description
print (df.describe())

age      240.629727
bp       124.889140
sg         0.000030
al         1.996936
su         0.661614
bgr     4217.181811
bu      2246.322220
sc         9.471717
sod       56.091429
pot       12.085013
hemo       8.307100
dtype: float64
              age          bp          sg          al          su         bgr  \
count  158.000000  158.000000  158.000000  158.000000  158.000000  158.000000   
mean    49.563291   74.050633    1.019873    0.797468    0.253165  131.341772   
std     15.512244   11.175381    0.005499    1.413130    0.813397   64.939832   
min      6.000000   50.000000    1.005000    0.000000    0.000000   70.000000   
25%     39.250000   60.000000    1.020000    0.000000    0.000000   97.000000   
50%     50.500000   80.000000    1.020000    0.000000    0.000000  115.500000   
75%     60.000000   80.000000    1.025000    1.000000    0.000000  131.750000   
max     83.000000  110.000000    1.025000    4.000000    5.000000  490.000000   

               bu          sc      

In [35]:
# TODO: This method assumes your dataframe is called df. If it isn't,
# make the appropriate changes. Don't alter the code in scaleFeatures()
# just yet though!
#

#if scaleFeatures: df = scaleFeatures(df)

In [36]:
# TODO: Run PCA on your dataset and reduce it to 2 components
# Ensure your PCA instance is saved in a variable called 'pca',
# and that the results of your transformation are saved in 'T'.
#

# import sklearn pca
from sklearn.decomposition import PCA

# train on input
pca = PCA(n_components=2)
pca.fit(df)

# transform input
T = pca.transform(df)

ValueError: could not convert string to float: notckd

In [37]:
# Plot the transformed data as a scatter plot. Recall that transforming
# the data will result in a NumPy NDArray. You can either use MatPlotLib
# to graph it directly, or you can convert it to DataFrame and have pandas
# do it for you.
#
# Since we've already demonstrated how to plot directly with MatPlotLib in
# Module4/assignment1.py, this time we'll convert to a Pandas Dataframe.
#
# Since we transformed via PCA, we no longer have column names. We know we
# are in P.C. space, so we'll just define the coordinates accordingly:

ax = drawVectors(T, pca.components_, df.columns.values, plt, scaleFeatures)
T = pd.DataFrame(T)
T.columns = ['component1', 'component2']
T.plot.scatter(x='component1', y='component2', marker='o', c=labels, alpha=0.75, ax=ax)
plt.show()

NameError: name 'T' is not defined