In [1]:
# import necessary packages
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Load a whitespace-delimited data file and encode the 'type' label
# ('Yes' -> 1 'with disease', 'No' -> 0 'without disease').
def preprocessing(filename):
    """Read a whitespace-delimited table into an all-numeric DataFrame.

    The first non-blank line supplies the column names; every following
    non-blank line is one data row. The 'type' column is mapped from
    'Yes'/'No' to 1/0 (any other value becomes NaN through ``Series.map``),
    after which every column is converted with ``pd.to_numeric``.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One column per header name, all numeric.

    Raises
    ------
    IndexError
        If the file contains no non-blank lines (there is no header to pop).
    KeyError
        If the header has no 'type' column.
    """
    # 'utf-8-sig' drops a leading byte-order mark so it cannot end up glued
    # to the first header name. The context manager closes the file even if
    # parsing fails.
    with open(filename, "r", encoding='utf-8-sig') as fileData:
        # A blank line splits into an empty list; skip it so it is not
        # treated as a data row.
        rows = [line.split() for line in fileData if line.strip()]
    headers = rows.pop(0)  # first row holds the column names; the rest is data
    df = pd.DataFrame(rows, columns=headers)
    df['type'] = df['type'].map({'Yes': 1, 'No': 0})
    df = df.apply(pd.to_numeric)
    return df

# compute the per-column mean and standard deviation of a dataframe
def train_mean_std(data):
    """Return the column-wise mean and standard deviation of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame to summarise.

    Returns
    -------
    tuple
        ``(mean, std)`` as produced by ``data.mean()`` and ``data.std()``
        called with their default arguments.
    """
    return data.mean(), data.std()
    
# standardise the dataset with the supplied statistics
def normalize(data, mean, std):
    """Scale every column of ``data`` as ``(value - mean) / std``.

    The 'type' column is scaled along with the others and then overwritten
    with a copy of its original values, so the labels come back unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to scale; must contain a 'type' column.
    mean, std : pandas.Series
        Per-column statistics subtracted from / divided into ``data``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified.
    """
    scaled = (data - mean) / std
    # put the untouched labels back over the scaled 'type' column
    scaled['type'] = data['type'].copy()
    return scaled

# save the dataframe's values to a text file for further use
def save_df(df, filename, fmt):
    """Write every cell of ``df`` to ``filename`` with ``np.savetxt``.

    Only the values are written; column names and the index are not.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose values are saved.
    filename : str
        Destination path handed to ``np.savetxt``.
    fmt : str or sequence of str
        Format specification handed to ``np.savetxt``.
    """
    # ``df.values`` is already the full 2-D array; no np.matrix wrapper or
    # column slice is needed before handing it to savetxt.
    np.savetxt(filename, df.values, fmt=fmt)
    
# training set: load it, then derive the normalisation statistics from it
preprocessed_train = preprocessing('PIMA.TR')
mean, std = train_mean_std(preprocessed_train)
norm_train = normalize(preprocessed_train, mean, std)

# test set: load it, then scale it with the TRAINING mean/std (not its own)
preprocessed_test = preprocessing('PIMA.TE')
norm_test = normalize(preprocessed_test, mean, std)


In [5]:
# per-class mean vectors of the feature matrix
def mean_vectors(x, y):
    """Return one mean vector per distinct label in ``y``.

    Parameters
    ----------
    x : numpy.ndarray
        Samples, indexed along the first axis in step with ``y``.
    y : numpy.ndarray
        Class label of each sample.

    Returns
    -------
    list of numpy.ndarray
        Mean over axis 0 of the samples of each class, in the order the
        labels are returned by ``np.unique(y)``.
    """
    return [np.mean(x[y == label], axis=0) for label in np.unique(y)]

# calculate the within-class scatter matrix
def scatter_within(x, y):
    """Return the within-class scatter matrix of ``x`` grouped by ``y``.

    For each distinct label the samples of that class are centred on their
    own class mean, and the outer products of the centred rows are summed:

        Sw = sum_c  sum_{r in class c} (r - m_c)(r - m_c)^T

    Parameters
    ----------
    x : numpy.ndarray
        2-D array of samples, one row per sample.
    y : numpy.ndarray
        Class label of each row of ``x``.

    Returns
    -------
    numpy.ndarray
        Square array with one row/column per column of ``x``.
    """
    n_col = x.shape[1]
    Sw = np.zeros((n_col, n_col))
    for cls in np.unique(y):
        rows = x[y == cls]
        # centre the whole class at once on its own mean
        centered = rows - np.mean(rows, axis=0)
        # centered.T . centered equals the sum of the per-row outer products,
        # so no Python-level loop over rows is needed
        Sw += centered.T.dot(centered)
    return Sw

# create the W matrix (projection direction) from the training data
def create_w(train_data):
    """Compute the projection vector ``W = Sw^-1 (m0 - m1)``.

    ``m0`` and ``m1`` are the first two class means returned by
    ``mean_vectors`` and ``Sw`` is the within-class scatter matrix returned
    by ``scatter_within``. Only the first two class means are used, so the
    data is expected to contain exactly two classes.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Feature columns plus a 'type' column holding the class label.

    Returns
    -------
    numpy.matrix
        Column vector with one row per feature column.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``Sw`` is singular.
    IndexError
        If 'type' holds fewer than two distinct values.
    """
    # Select the features by dropping the label column by name, so the
    # result does not depend on 'type' being the last column.
    x = train_data.drop(columns='type').values
    y = train_data['type'].values
    mean_vec = mean_vectors(x, y)
    Sw = scatter_within(x, y)
    # Solve Sw . W = (m0 - m1) directly rather than forming the explicit
    # inverse of Sw and multiplying.
    W = np.linalg.solve(Sw, mean_vec[0] - mean_vec[1])
    # Callers multiply a data matrix by this value, so keep the np.matrix
    # column-vector return type.
    W_mat = np.matrix(W).T
    return W_mat
    
# project the data onto W and attach the class labels
def LDA(data, W_mat):
    """Project the feature columns of ``data`` onto ``W_mat``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus a 'type' column holding the class label.
    W_mat : numpy.matrix
        Projection matrix with one row per feature column.

    Returns
    -------
    numpy.matrix
        The projected data followed by one final column holding the
        'type' labels, one row per row of ``data``.
    """
    # Select the features by dropping the label column by name, so the
    # labels can never leak into the projection if 'type' is not last.
    mat = np.matrix(data.drop(columns='type').values)
    classs = np.matrix(data['type'].values).T

    reduced_data = mat.dot(W_mat)
    # .real keeps only the real part of the projection before the labels
    # are appended as the final column.
    fX = np.hstack((reduced_data.real, classs))
    return fX

# fit the projection on the training set only, then apply it to both splits
W_mat = create_w(norm_train)
fX = LDA(norm_train, W_mat)
fX1 = LDA(norm_test, W_mat)

# persist the projected data (projection, label) with 8 decimal places
np.savetxt('fX_PIMA_TR.txt', fX, fmt='%.8f')
np.savetxt('fX_PIMA_TE.txt', fX1, fmt='%.8f')

