# Implementing PCA

In [1]:
# import all libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer

In [2]:
data=load_breast_cancer()
print("Data keys: ",data.keys())

# Check the output classes
print("target names: ",data['target_names'])

# Check the input attributes
print("feature names: ",data['feature_names'])

Data keys:  dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])
target names:  ['malignant' 'benign']
feature names:  ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [3]:
df1 = pd.DataFrame(data['data'],columns=data['feature_names'])
scaling = StandardScaler()
scaling.fit(df1)
scaled_data = scaling.transform(df1)
scaled_data

array([[ 1.09706398, -2.07333501,  1.26993369, ...,  2.29607613,
         2.75062224,  1.93701461],
       [ 1.82982061, -0.35363241,  1.68595471, ...,  1.0870843 ,
        -0.24388967,  0.28118999],
       [ 1.57988811,  0.45618695,  1.56650313, ...,  1.95500035,
         1.152255  ,  0.20139121],
       ...,
       [ 0.70228425,  2.0455738 ,  0.67267578, ...,  0.41406869,
        -1.10454895, -0.31840916],
       [ 1.83834103,  2.33645719,  1.98252415, ...,  2.28998549,
         1.91908301,  2.21963528],
       [-1.80840125,  1.22179204, -1.81438851, ..., -1.74506282,
        -0.04813821, -0.75120669]])

In [4]:
principal = PCA(n_components=3)
principal.fit(scaled_data)
x = principal.transform(scaled_data)


In [7]:
print("shape of x: ",x.shape)
print("principal components: ",principal.components_)


shape of x:  (569, 3)
principal components:  [[ 0.21890244  0.10372458  0.22753729  0.22099499  0.14258969  0.23928535
   0.25840048  0.26085376  0.13816696  0.06436335  0.20597878  0.01742803
   0.21132592  0.20286964  0.01453145  0.17039345  0.15358979  0.1834174
   0.04249842  0.10256832  0.22799663  0.10446933  0.23663968  0.22487053
   0.12795256  0.21009588  0.22876753  0.25088597  0.12290456  0.13178394]
 [-0.23385713 -0.05970609 -0.21518136 -0.23107671  0.18611302  0.15189161
   0.06016536 -0.0347675   0.19034877  0.36657547 -0.10555215  0.08997968
  -0.08945723 -0.15229263  0.20443045  0.2327159   0.19720728  0.13032156
   0.183848    0.28009203 -0.21986638 -0.0454673  -0.19987843 -0.21935186
   0.17230435  0.14359317  0.09796411 -0.00825724  0.14188335  0.27533947]
 [-0.00853126  0.06454989 -0.00931424  0.02869953 -0.10429205 -0.07409156
   0.0027339  -0.02556348 -0.04023993 -0.02257396  0.26848138  0.37463368
   0.26664538  0.21600651  0.30883904  0.15477959  0.17646372  0.2

# Handling missing values

In [8]:
dictionary = {'First Score':[87,82,np.nan,96], 'Second Score':[42,32,77,np.nan], 'Third Score':[np.nan,52,97,22]}

df = pd.DataFrame(dictionary)

df = df.fillna(0)

print("Dataframe: ", df)

Dataframe:     First Score  Second Score  Third Score
0         87.0          42.0          0.0
1         82.0          32.0         52.0
2          0.0          77.0         97.0
3         96.0           0.0         22.0


# Label encoder and hot encoder

In [9]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [10]:
X,y=load_breast_cancer(return_X_y=True,as_frame=False)
X=X.ravel()
X

array([1.799e+01, 1.038e+01, 1.228e+02, ..., 0.000e+00, 2.871e-01,
       7.039e-02])

In [11]:
le.fit_transform(X)

array([ 8962,  8372, 10750, ...,     0,  6427,  3909])

In [14]:
data = pd.read_csv("diabetes.csv")
print(data.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [15]:
print(data['BMI'].unique())
print(data['Pregnancies'].unique())


[33.6 26.6 23.3 28.1 43.1 25.6 31.  35.3 30.5  0.  37.6 38.  27.1 30.1
 25.8 30.  45.8 29.6 43.3 34.6 39.3 35.4 39.8 29.  36.6 31.1 39.4 23.2
 22.2 34.1 36.  31.6 24.8 19.9 27.6 24.  33.2 32.9 38.2 37.1 34.  40.2
 22.7 45.4 27.4 42.  29.7 28.  39.1 19.4 24.2 24.4 33.7 34.7 23.  37.7
 46.8 40.5 41.5 25.  25.4 32.8 32.5 42.7 19.6 28.9 28.6 43.4 35.1 32.
 24.7 32.6 43.2 22.4 29.3 24.6 48.8 32.4 38.5 26.5 19.1 46.7 23.8 33.9
 20.4 28.7 49.7 39.  26.1 22.5 39.6 29.5 34.3 37.4 33.3 31.2 28.2 53.2
 34.2 26.8 55.  42.9 34.5 27.9 38.3 21.1 33.8 30.8 36.9 39.5 27.3 21.9
 40.6 47.9 50.  25.2 40.9 37.2 44.2 29.9 31.9 28.4 43.5 32.7 67.1 45.
 34.9 27.7 35.9 22.6 33.1 30.4 52.3 24.3 22.9 34.8 30.9 40.1 23.9 37.5
 35.5 42.8 42.6 41.8 35.8 37.8 28.8 23.6 35.7 36.7 45.2 44.  46.2 35.
 43.6 44.1 18.4 29.2 25.9 32.1 36.3 40.  25.1 27.5 45.6 27.8 24.9 25.3
 37.9 27.  26.  38.7 20.8 36.1 30.7 32.3 52.9 21.  39.7 25.5 26.2 19.3
 38.1 23.5 45.5 23.1 39.9 36.8 21.8 41.  42.2 34.4 27.2 36.5 29.8 39.2
 38.4 36.

In [16]:
data['BMI'].value_counts()
data['Pregnancies'].value_counts()


1     135
0     111
2     103
3      75
4      68
5      57
6      50
7      45
8      38
9      28
10     24
11     11
13     10
12      9
14      2
15      1
17      1
Name: Pregnancies, dtype: int64

In [17]:
one_hot_encoded_data = pd.get_dummies(data, columns = ['Pregnancies', 'BMI'])
print(one_hot_encoded_data)

     Glucose  BloodPressure  SkinThickness  Insulin  DiabetesPedigreeFunction  \
0        148             72             35        0                     0.627   
1         85             66             29        0                     0.351   
2        183             64              0        0                     0.672   
3         89             66             23       94                     0.167   
4        137             40             35      168                     2.288   
..       ...            ...            ...      ...                       ...   
763      101             76             48      180                     0.171   
764      122             70             27        0                     0.340   
765      121             72             23      112                     0.245   
766      126             60              0        0                     0.349   
767       93             70             31        0                     0.315   

     Age  Outcome  Pregnanc

# Outlier Detection and removal

In [18]:
data = [1, 2, 2, 2, 3, 1, 1, 15, 2, 2, 2, 3, 1, 1, 2]
mean = np.mean(data)
std = np.std(data)
print('mean of the dataset is', mean)
print('std. deviation is', std)

mean of the dataset is 2.6666666666666665
std. deviation is 3.3598941782277745


In [19]:
threshold = 3
outlier = []
for i in data:
    z = (i-mean)/std
    if z > threshold:
        data.remove(i)
        print('outlier in dataset is', i)
print('List after removal of outlier',data)

outlier in dataset is 15
List after removal of outlier [1, 2, 2, 2, 3, 1, 1, 2, 2, 2, 3, 1, 1, 2]


# Min Max Normalisation

In [20]:
df = pd.DataFrame([[180000, 110, 18.9, 1400],
			[360000, 905, 23.4, 1800],
			[230000, 230, 14.0, 1300],
			[60000, 450, 13.5, 1500]],
columns=['Col A', 'Col B','Col C', 'Col D'])

In [21]:
display(df)

Unnamed: 0,Col A,Col B,Col C,Col D
0,180000,110,18.9,1400
1,360000,905,23.4,1800
2,230000,230,14.0,1300
3,60000,450,13.5,1500


In [22]:
df_min_max_scaled = df.copy()

# apply normalization techniques
for column in df_min_max_scaled.columns:
    df_min_max_scaled[column] = (df_min_max_scaled[column] -df_min_max_scaled[column].min()) / (df_min_max_scaled[column].max() - df_min_max_scaled[column].min())	

# view normalized data
print(df_min_max_scaled)

      Col A     Col B     Col C  Col D
0  0.400000  0.000000  0.545455    0.2
1  1.000000  1.000000  1.000000    1.0
2  0.566667  0.150943  0.050505    0.0
3  0.000000  0.427673  0.000000    0.4
