In [13]:
# import necessary packages
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns


In [14]:
# Change 'yes' and 'no' to 1 and 0 indicating 'with disease' and 'without disease'.
def preprocessing(filename):
    """Load a whitespace-delimited table into a numeric DataFrame.

    The first non-blank line of the file supplies the column names. The
    ``type`` column is recoded from ``'Yes'``/``'No'`` to ``1``/``0`` and every
    column is then converted with ``pd.to_numeric``.

    Parameters
    ----------
    filename : str
        Path of the text file to read (decoded as ``utf-8-sig`` so a leading
        BOM does not end up in the first header name).

    Returns
    -------
    pandas.DataFrame
        One row per non-blank data line, all columns numeric.
    """
    # `with` guarantees the handle is closed even if parsing fails.
    with open(filename, "r", encoding='utf-8-sig') as fileData:
        # Skip blank lines (e.g. a trailing newline) so they do not turn into
        # empty rows in the resulting frame.
        data = [line.split() for line in fileData if line.strip()]
    headers = data.pop(0)  # first row holds the column names; the rest is data
    df = pd.DataFrame(data, columns=headers)
    df['type'] = df['type'].map({'Yes': 1, 'No': 0})
    df = df.apply(pd.to_numeric)
    return df

# calculate the per-column mean and standard deviation of the dataframe
def train_mean_std(data):
    """Return ``(mean, std)`` computed column-wise on ``data``.

    Both values come straight from ``data.mean()`` and ``data.std()``.
    """
    return data.mean(), data.std()

# normalize the dataset
def normalize(data, mean, std):
    """Standardize ``data`` with the supplied ``mean`` and ``std``.

    Every column is transformed as ``(data - mean) / std``; the ``type``
    column is then restored to its original, unscaled values so the class
    label is left untouched. ``data`` itself is not modified.
    """
    scaled = (data - mean) / std
    # Put back a copy of the raw class labels over the scaled ones.
    scaled['type'] = data[['type']].copy()['type']
    return scaled

# save it to a text file for further use
def save_df(df, filename, fmt):
    """Write every column of ``df`` to ``filename`` as text.

    The values are converted to an ``np.matrix`` and handed to
    ``np.savetxt`` with ``fmt`` as the number format.
    """
    n_cols = len(df.columns)
    table = np.matrix(df.iloc[:, 0:n_cols].values)
    np.savetxt(filename, table, fmt)
    
# Training data: load, compute the column statistics, and standardize with them.
preprocessed_train = preprocessing ('PIMA.TR')
mean,std = train_mean_std (preprocessed_train)
norm_train = normalize (preprocessed_train,mean,std)
# print (norm_train)

# Testing data: standardized with the mean/std computed from the TRAINING set above
# (no statistics are recomputed from the test file).
preprocessed_test = preprocessing ('PIMA.TE')
norm_test = normalize (preprocessed_test,mean,std)
# Persist both normalized sets with six decimal places for later use.
save_df (norm_train,'nX_PIMA_TR.txt','%.6f')
save_df (norm_test, 'nX_PIMA_TE.txt','%.6f')
# Note: all of this could be done with plain lists or matrices; a DataFrame is used to keep the code simple.

In [15]:
## check with sklearn if the normalization is correct
# from sklearn import preprocessing
# # Get column names first
# names = preprocessed_train.columns
# # Create the Scaler object
# scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
# # Fit your data on the scaler object
# scaled_df = scaler.fit_transform(preprocessed_train)
# scaled_df = pd.DataFrame(scaled_df, columns=names)
# scaled_df

In [16]:
# visualize the raw train and test data
def visualize_raw_data(data):
    """Plot the distribution of every column of ``data`` plus a pair plot.

    One ``sns.distplot`` panel (10 bins) is drawn per column on a single row
    of subplots, followed by ``sns.pairplot`` colored by the ``type`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to visualize; must contain a ``type`` column for the pair plot hue.
    """
    headers = data.columns.tolist()
    n = len(headers)
    # squeeze=False keeps `axes` two-dimensional even when there is a single
    # column; otherwise plt.subplots returns a bare Axes and indexing fails.
    _, axes = plt.subplots(1, n, figsize=(10, 3), squeeze=False)
    for i, column in enumerate(headers):
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        # consider sns.histplot once the installed version is confirmed.
        sns.distplot(data[column], bins=10, ax=axes[0, i])
    sns.pairplot(data, hue="type")
    
# visualize_raw_data (preprocessed_train)    
# visualize_raw_data (preprocessed_test)    


In [19]:
# calculate covariance and perform eigenvalue decomposition
def basis(data):
    """Compute the feature covariance matrix of ``data`` and eigen-decompose it.

    All columns except the last are treated as features; the last column is
    taken as the class label.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns followed by one label column.

    Returns
    -------
    tuple
        ``(mat_X, e_vectors, e_val_mat, e_pairs, cls, e_values)`` where

        * ``mat_X``     -- feature values as an ``np.matrix`` (one row per sample)
        * ``e_vectors`` -- eigenvectors as columns, in the order ``np.linalg.eig`` returned them
        * ``e_val_mat`` -- 1 x n ``np.matrix`` of the eigenvalues sorted ascending
        * ``e_pairs``   -- list of ``(abs(eigenvalue), eigenvector column)`` sorted
          from largest to smallest eigenvalue
        * ``cls``       -- labels as a column ``np.matrix``
        * ``e_values``  -- eigenvalues in the same order as the columns of ``e_vectors``

    Notes
    -----
    As a side effect this sets numpy's global float print formatter.
    """
    np.set_printoptions(formatter={'float':"{:6.20g}".format})

    col = len(data.columns) - 1
    mat_X = np.matrix(data.iloc[:, 0:col].values)
    cls = np.matrix(data.iloc[:, col].values).T

    # Sample covariance: centre the features, then divide by (n_samples - 1).
    mu = mat_X.sum(axis=0) / len(mat_X)
    centered = mat_X - mu
    covariance_mat = centered.T.dot(centered) / (mat_X.shape[0] - 1)

    # NOTE(review): the covariance matrix is symmetric, so np.linalg.eigh would
    # also apply; eig is kept so eigenvector order/sign stay as before.
    e_values, e_vectors = np.linalg.eig(covariance_mat)
    e_pairs = [(np.abs(e_values[i]), e_vectors[:, i]) for i in range(len(e_values))]

    # Sort from high to low on the eigenvalue only. Sorting the raw tuples would
    # fall back to comparing the eigenvector matrices whenever two eigenvalues
    # tie, which raises ValueError.
    e_pairs.sort(key=lambda pair: pair[0], reverse=True)

    # Ascending eigenvalues in a *separate* array: np.asmatrix shares memory with
    # its argument, so sorting it in place would also reorder `e_values` and
    # break its alignment with the columns of `e_vectors`.
    e_val_mat = np.asmatrix(np.sort(e_values))

    return mat_X, e_vectors, e_val_mat, e_pairs, cls, e_values

# Eigen-decompose the covariance of the normalized training set; later cells read these module-level results.
mat_X, e_vectors, e_val_mat, e_pairs, cls, e_values= basis (norm_train)


In [20]:
# derive a new set of basis and choose the major axes with variable error rate
def PCA(eigen_val_mat, eigen_pairs, normalized_data, error_rate):
    """Build the projection matrix that keeps the leading principal axes.

    The smallest eigenvalues are accumulated until their share of the total
    reaches ``error_rate``; the axes whose cumulative share stays below that
    threshold are dropped and the remaining leading eigenvectors are stacked
    column-wise.

    Parameters
    ----------
    eigen_val_mat : np.matrix
        1 x n matrix of eigenvalues sorted in ascending order.
    eigen_pairs : list
        ``(eigenvalue, eigenvector)`` pairs sorted from largest to smallest
        eigenvalue; each eigenvector must be reshapeable to ``(n, 1)``.
    normalized_data :
        Unused; kept so existing callers keep working.
    error_rate : float
        Maximum fraction of the total eigenvalue sum that may be discarded.

    Returns
    -------
    Matrix with n rows and one column per retained axis (always at least one,
    and all n when ``error_rate`` is 0 or negative).
    """
    nf = np.size(eigen_val_mat, 1)

    # Sequential accumulation is kept so threshold decisions match earlier runs.
    total = 0.0
    for i in range(nf):
        total += eigen_val_mat[0, i]

    p = 0
    discarded = 0.0  # running sum of the smallest eigenvalues (was `sum`, which shadowed the builtin)
    while discarded / total < error_rate and p < nf:
        discarded += eigen_val_mat[0, p]
        p += 1

    # The last eigenvalue added is the one that crossed the threshold, so only
    # p - 1 axes are dropped. Clamp at zero: when the loop never runs (e.g.
    # error_rate <= 0) the old formula asked for nf + 1 axes and raised IndexError.
    pnf = nf - max(p - 1, 0)

    # np.hstack needs a real sequence; a bare generator is rejected by current NumPy.
    matrix_w = np.hstack([eigen_pairs[i][1].reshape(nf, 1) for i in range(pnf)])
    return matrix_w

# Represent the data using this new set of basis for a reduced dimension
def dimension_reduction(W, data):
    """Project the feature columns of ``data`` onto ``W`` and append the labels.

    Every column except the last is treated as a feature and multiplied by
    ``W``; the real part of that product is returned with the ``type`` column
    attached as the final column.
    """
    n_features = len(data.columns) - 1
    features = np.matrix(data.iloc[:, 0:n_features].values)
    labels = np.matrix(data['type'].values).T

    projected = features.dot(W)
    # Only the real part is kept before the labels are stacked on the right.
    return np.hstack((projected.real, labels))


In [21]:
# Build the projection matrix from the training eigen-decomposition, with 0.10 passed as PCA's error_rate.
W = PCA (e_val_mat,e_pairs,mat_X,.10)

# Project both sets with the SAME matrix W derived from the training data.
pX = dimension_reduction (W, norm_train )
pX1 = dimension_reduction (W, norm_test )

# Save the reduced data (projected features followed by the class label) with four decimals.
np.savetxt('pX_PIMA_TR.txt',pX,fmt='%.4f')
np.savetxt('pX_PIMA_TE.txt',pX1,fmt='%.4f')


# PCA (eigVal,eig_pairs,mat,cls)

In [None]:
# PCA for variable error rate (eigenvalues from 1 to 7)
# Error rates swept below; their string forms also appear in the output file names.
ERROR_RATES = (.01, .05, .1, .2, .4, .5, .7)


def PCA_error_rate(PCA_error_rate, eig_pairs, mat, data1, data2, error_rates=ERROR_RATES):
    """Run PCA and project two datasets once per error rate.

    Parameters
    ----------
    PCA_error_rate :
        Eigenvalue matrix forwarded to ``PCA`` as its first argument. (The
        parameter keeps its historical name, which shadows this function
        inside the body, so positional callers are unaffected.)
    eig_pairs :
        ``(eigenvalue, eigenvector)`` pairs forwarded to ``PCA``.
    mat :
        Forwarded to ``PCA`` as its third argument.
    data1, data2 : pandas.DataFrame
        Datasets to project (the driver below passes train and test).
    error_rates : iterable of float, optional
        Error rates to evaluate; defaults to ``ERROR_RATES``.

    Returns
    -------
    tuple of two lists
        Reduced ``data1`` and reduced ``data2``, one entry per error rate in
        the same order as ``error_rates``.
    """
    pX = []
    pX_test = []
    for e in error_rates:
        # Use the function's own arguments; the previous body read undefined
        # globals (eigen_val_mat / eigen_pairs) and raised NameError.
        W = PCA(PCA_error_rate, eig_pairs, mat, e)
        pX.append(dimension_reduction(W, data1))
        pX_test.append(dimension_reduction(W, data2))
    return pX, pX_test


# Pass the eigen-decomposition computed by basis(); the earlier call passed the
# function itself plus the undefined names eig_pairs / mat.
pX, pX1 = PCA_error_rate(e_val_mat, e_pairs, mat_X, norm_train, norm_test)

# One train and one test file per error rate, e.g. pX_PIMA_TR_0.01.txt.
for rate, reduced_train, reduced_test in zip(ERROR_RATES, pX, pX1):
    np.savetxt('pX_PIMA_TR_{}.txt'.format(rate), reduced_train, fmt='%.4f')
    np.savetxt('pX_PIMA_TE_{}.txt'.format(rate), reduced_test, fmt='%.4f')

# np.savetxt('pX_PIMA_TE.txt',pX1,fmt='%.4f')


# PCA (eigVal,eig_pairs,mat,cls)

In [None]:
# ## use built in PCA from sklearn to check with the model accuracy
# from sklearn.decomposition import PCA
# pca = PCA(n_components=6)
# principalComponents = pca.fit_transform(norm_train)
# principalDf = pd.DataFrame(data = principalComponents
#              , columns = ['1', '2','3','4','5','6'])

# principalDf