In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.datasets import load_diabetes

# Load scikit-learn's bundled diabetes dataset as a single object exposing
# .data, .target and .feature_names (return_X_y=False).
diabetes = load_diabetes(return_X_y=False)

data = diabetes.data
headers = diabetes.feature_names
target = diabetes.target
target_header = 'Y'

# Remove column 1 of the feature matrix (the sex feature, going by the
# "nosex" naming used throughout this notebook).
nosex_data = np.delete(data, 1, axis=1)

file = 'diabetes_data.txt'

# Fields may be separated by runs of whitespace or by commas. The pattern is
# a raw string so that "\s" reaches the regex engine verbatim instead of being
# parsed as an (invalid) Python string escape, which newer interpreters warn
# about. A regex separator is why engine='python' is requested.
rawdata = pd.read_csv(file, sep=r'\s+|,', header=0, engine='python')
rawdf = pd.DataFrame(rawdata)

# Drop columns 1 and 10 from the raw table; the target is read from the last
# column.
# NOTE(review): assumes the file has 11 columns so that index 10 is the
# target column -- confirm against diabetes_data.txt.
raw_nosex = np.delete(rawdata.values, [1, 10], axis=1)
raw_target = rawdata.values[:, -1]
print(raw_nosex)

[[ 59.      32.1    101.     ...   4.       4.8598  87.    ]
 [ 48.      21.6     87.     ...   3.       3.8918  69.    ]
 [ 72.      30.5     93.     ...   4.       4.6728  85.    ]
 ...
 [ 60.      24.9     99.67   ...   3.77     4.1271  95.    ]
 [ 36.      30.      95.     ...   4.79     5.1299  85.    ]
 [ 36.      19.6     71.     ...   3.       4.5951  92.    ]]


In [6]:
from sklearn.decomposition import PCA

# Fit PCA keeping every component so the full cumulative curve can be drawn.
pca_std = PCA().fit(nosex_data)
pca_raw = PCA().fit(raw_nosex)

# One figure per fitted model: (model, plot title, output file).
for fitted_pca, plot_title, out_file in (
    (pca_std, 'Cumulative Explained Variance of Standardized Data', 'StdExplainedVariance.png'),
    (pca_raw, 'Cumulative Explained Variance of Raw Data', 'RawExplainedVariance.png'),
):
    cumulative = np.cumsum(fitted_pca.explained_variance_ratio_)
    # Plot against 1..n rather than the implicit 0..n-1 index: the first
    # cumulative value is the variance explained by ONE component, so the
    # x-axis must start at 1 to match its "Number of Components" label.
    plt.plot(np.arange(1, len(cumulative) + 1), cumulative)
    plt.xlabel('Number of Components')
    plt.ylabel('Variance')
    plt.title(plot_title)
    plt.savefig(out_file)
    # Clear the current figure so the next curve is not drawn on top of this one.
    plt.clf()

<Figure size 432x288 with 0 Axes>

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# Both versions of the data are split with the same settings: 30% held out
# for testing and a fixed seed so the split is repeatable.
split_kwargs = {'test_size': 0.3, 'random_state': 0}
std_train, std_test, stdy_train, stdy_test = train_test_split(nosex_data, target, **split_kwargs)
raw_train, raw_test, rawy_train, rawy_test = train_test_split(raw_nosex, raw_target, **split_kwargs)

# A fractional n_components asks PCA to keep just enough components to explain
# that share of the variance (90% here). Both models are fit on training rows only.
pca_std = PCA(n_components=0.9).fit(std_train)
pca_raw = PCA(n_components=0.9).fit(raw_train)

# Project train and test rows onto the retained components. Each result has
# one row per input row and one column per retained component.
tstdtrain, tstdtest = (pca_std.transform(part) for part in (std_train, std_test))
trawtrain, trawtest = (pca_raw.transform(part) for part in (raw_train, raw_test))

# Number of components each model needed to reach the 90% threshold
print(pca_std.n_components_)
print(pca_raw.n_components_)

print(tstdtrain.shape)

6
3
(309, 6)


In [3]:
# Map the PCA scores of the training rows back into the original feature
# space, then show the scores followed by the reconstruction.
tstdtrain_new = pca_std.inverse_transform(tstdtrain)
print(tstdtrain, tstdtrain_new, sep='\n')

[[ 0.08981515  0.02522913 -0.04420049 -0.02353178  0.01885785 -0.00791689]
 [-0.12312281 -0.0050809  -0.04632678 -0.03934343  0.06117195 -0.0050187 ]
 [ 0.05278314  0.01848511 -0.08078608 -0.002738   -0.04171401 -0.03813727]
 ...
 [ 0.18206418 -0.02201507  0.05108272 -0.04540464  0.07567119  0.02244588]
 [-0.15679671 -0.05785083 -0.07111359 -0.01044076  0.00306107  0.05481611]
 [ 0.13287331 -0.01858947  0.032655    0.01609344  0.03310316 -0.00761054]]
[[ 1.23299135e-02  1.66724387e-05 -1.04648531e-02 ...  5.93116582e-02
   2.20223864e-02  3.20166179e-02]
 [-3.97917832e-02 -8.50558529e-02 -8.17699322e-02 ... -2.50104077e-02
  -5.24063077e-02 -7.61366830e-03]
 [-1.06689088e-02  3.47080295e-02 -1.75207346e-02 ...  5.41475610e-02
  -2.23431868e-02 -2.73511724e-02]
 ...
 [ 7.50748092e-02  2.11136184e-03  5.06033129e-02 ...  6.65105513e-02
   9.66837223e-02  1.33084632e-01]
 [-7.67330756e-02 -7.00170839e-02 -6.01942172e-02 ... -2.86975585e-02
  -2.01035391e-02 -8.45022062e-02]
 [ 2.53949458e

In [4]:
# Show the raw-data training rows expressed in principal-component coordinates.
print(trawtrain)
# Reconstruction of those rows in the original feature space; it is stored
# but not displayed here.
trawtrain_new = pca_raw.inverse_transform(trawtrain)

[[ 4.66605949e+01  6.64502844e-01 -1.35490112e+01]
 [-4.77809802e+01 -1.69558542e+01 -1.15017148e+01]
 [ 3.38857071e+01 -1.18645913e+01 -2.20595030e+01]
 [ 2.77474526e+01  2.71901655e+01  2.00617223e+01]
 [ 9.14370386e+01 -2.73492792e+01  9.54255006e+00]
 [-4.48265839e+01 -2.28956097e+01 -1.31337527e+01]
 [ 5.38905526e+00  2.69542347e+00 -1.78762450e+01]
 [ 2.49174305e+01 -2.56308399e+01 -1.91438567e+01]
 [ 8.77174616e+01 -9.31929166e+00 -1.82142625e+01]
 [ 1.58842264e+00  3.20491422e+00 -1.38694976e+01]
 [-3.84525610e+01 -1.63968474e+01 -2.43014284e+01]
 [ 2.88868396e+01 -1.35649478e+01 -1.81727137e+01]
 [-8.42500039e+00 -1.42222548e+01  1.19582559e+01]
 [-1.39291132e+01 -3.68740761e+01  2.71312949e+00]
 [-5.96804170e+01 -1.71315857e+01  1.64365756e+00]
 [-7.13463044e+01 -3.73623175e+00 -8.53091491e+00]
 [ 2.20304122e+01  6.77490225e+00  9.61250367e-01]
 [-3.57304897e+00  9.12968366e+00  2.50494876e+01]
 [-3.00242891e+01 -4.09137282e+00 -1.58421820e+01]
 [-3.14811953e+01  6.24511446e+

In [7]:
# 3-D scatter of the raw-data training rows in principal-component space
# NOTE(review): mplot3d is not referenced by name below; presumably it is
# imported only so the '3d' projection is registered on older matplotlib.
from mpl_toolkits import mplot3d
fig = plt.figure()
ax = plt.axes(projection='3d')
# One coordinate array per principal component (columns 0, 1 and 2)
xdata, ydata, zdata = (trawtrain[:, pc] for pc in range(3))
ax.scatter3D(xdata, ydata, zdata)

ax.set(
    xlabel='PC 1',
    ylabel='PC 2',
    zlabel='PC 3',
    title='Three Principal Components (PC) from Raw Data without Sex Feature',
)
plt.savefig('PCAplot.png')
# Clear the figure once it has been written to disk.
plt.clf()


<Figure size 432x288 with 0 Axes>

In [24]:
from sklearn.linear_model import LinearRegression

# Baseline: linear regression fit directly on the raw training features (no PCA).
raw_clf = LinearRegression()
raw_clf.fit(raw_train, rawy_train)

expected = rawy_test
predicted = raw_clf.predict(raw_test)
# Root-mean-square error of the predictions on the held-out rows
residuals = predicted - expected
print("RMS: %s" % np.sqrt(np.mean(residuals**2)))

# Model score on the test split, shown as the cell's output
# (for LinearRegression this is the R^2 coefficient of determination).
raw_clf.score(raw_test, rawy_test)

RMS: 57.166409947865745


0.3594033298421414

In [34]:
# Dimensions (rows, retained components) of the PCA-transformed test rows
print(np.shape(tstdtest))

(133, 6)


In [36]:
# Linear regression on the PCA-reduced version of the standardized features
pcastd_clf = LinearRegression()
pcastd_clf.fit(tstdtrain, stdy_train)

expected = stdy_test
predicted = pcastd_clf.predict(tstdtest)
# Root-mean-square error of the predictions on the held-out rows
residuals = predicted - expected
print("RMS: %s" % np.sqrt(np.mean(residuals**2)))

# Model score on the test split, shown as the cell's output
pcastd_clf.score(tstdtest, stdy_test)

RMS: 56.919600756469094


0.3649227894763307

In [37]:
# Linear regression on the PCA-reduced version of the raw features
pcaraw_clf = LinearRegression()
pcaraw_clf.fit(trawtrain, rawy_train)

expected = rawy_test
predicted = pcaraw_clf.predict(trawtest)
# Root-mean-square error of the predictions on the held-out rows
residuals = predicted - expected
print("RMS: %s" % np.sqrt(np.mean(residuals**2)))

# Model score on the test split, shown as the cell's output
pcaraw_clf.score(trawtest, rawy_test)

RMS: 60.75003944741938


0.27657091271663126

In [39]:
# Linear regression on the standardized features without PCA
std_clf = LinearRegression()
std_clf.fit(std_train, stdy_train)

expected = stdy_test
predicted = std_clf.predict(std_test)
# Root-mean-square error of the predictions on the held-out rows
residuals = predicted - expected
print("RMS: %s" % np.sqrt(np.mean(residuals**2)))

# Model score on the test split, shown as the cell's output
std_clf.score(std_test, stdy_test)

RMS: 57.16668365443128


0.3593971956128138