In [1]:
# In this section, we will try to apply the Principal Component Analysis (PCA) to analyze the data of bank marketing.
# Firstly, importing the necessary librabries
import pandas as pd
import numpy as np

# Import the data
data = pd.read_csv('bank-additional-full.csv', delimiter = ";")
df = pd.DataFrame(data)

# Check the first 5 rows in the data frame.
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [2]:
# As we can see, the data has 21 variables (attributes), in which the first 20 variables are input varibles, the last one
# is the output variable

# Description of these variables could be found in the link below, 'https://archive.ics.uci.edu/ml/datasets/bank+marketing'

# As we can see, this data set includes some numeric varibles, and some categorical variables, to apply PCA, we have to 
# digitalize them.

num_col = len(df.columns)
new_df = pd.DataFrame()
for i in range(0, num_col-1):
    # Check the type of data in each variable
    if (type(df.iloc[0,i]) == str) == True:
        # Extract all categories in each variable
        a = list(set(df.iloc[:,i]))
        encoding_var = [] 
        for n in range(0, len(df)):
            encoding_var.append(a.index(df.iloc[n,i]))
        new_df[i] = pd.Series(encoding_var)
    else:
        new_df[i] = df.iloc[:,i]

In [3]:
# Now, the data frame 'new_df' is the digitalized version of the original data frame
# However, it is not ready to be used yet, in PCA, all variables need to be standardized

for i in range(0, len(new_df.columns)):
    new_df.iloc[:,i] = (new_df.iloc[:,i]-new_df.mean(axis = 0)[i])/np.sqrt(new_df.var(axis = 0)[i])

new_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.533016,0.748598,0.444335,1.690696,0.513678,1.056733,0.438084,-1.318254,-1.21378,1.404961,0.010471,-0.565915,0.195412,-0.34949,0.19262,0.648084,0.722714,0.886436,0.712451,0.331676
1,1.628973,0.187708,0.444335,-1.367018,-1.946911,1.056733,0.438084,-1.318254,-1.21378,1.404961,-0.421495,-0.565915,0.195412,-0.34949,0.19262,0.648084,0.722714,0.886436,0.712451,0.331676
2,-0.290182,0.187708,0.444335,-1.367018,0.513678,-0.79098,0.438084,-1.318254,-1.21378,1.404961,-0.124518,-0.565915,0.195412,-0.34949,0.19262,0.648084,0.722714,0.886436,0.712451,0.331676
3,-0.002309,1.309488,0.444335,0.380247,0.513678,1.056733,0.438084,-1.318254,-1.21378,1.404961,-0.413782,-0.565915,0.195412,-0.34949,0.19262,0.648084,0.722714,0.886436,0.712451,0.331676
4,1.533016,0.187708,0.444335,-1.367018,0.513678,1.056733,-1.754891,-1.318254,-1.21378,1.404961,0.187886,-0.565915,0.195412,-0.34949,0.19262,0.648084,0.722714,0.886436,0.712451,0.331676


In [5]:
# Now, the dataset is totally transformed into the standardized form and ready to be analyzed.
# Firstly, find the covariance matrix of these variables
N = len(new_df)
Y = new_df.values
R = 1/N*np.matmul(np.transpose(Y),Y)

# Compute the eigenvalues and eigenvectors of R
from numpy import linalg as la
eig_val, eig_vec = la.eig(R)

# Now, arrange the eigenvalues in the largest to lowest order
# According to the theory, eigenvalues are the variance of each factors
sort_eig_val = np.sort(eig_val)[::-1]
DELTA = np.diag(sort_eig_val)
V = pd.DataFrame()
for i in range(0, R.shape[0]):
    V[i] = pd.Series(eig_vec[:, list(eig_val).index(sort_eig_val[i])])
V = V.values

# Cumulative sum of variances explained by the factors
cum_var = np.cumsum(sort_eig_val)/sum(sort_eig_val)*100

# If we want the latent factors can explain only 90% of total variance, then, we will choose
for i in range(0, len(cum_var)-1):
    if (cum_var[i] <= 90 and cum_var[i+1] > 90):
        k = i+1
        
print('The number of latent factors we should consider is ' + str(k))

# The loadings
B = pd.DataFrame(V).iloc[:,0:k]

print('The loading matrix is: ')
B

The number of latent factors we should consider is 13
The loading matrix is: 


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.018066,0.311249,-0.51095,-0.224399,0.057258,0.057345,0.11651,0.165455,-0.154511,-0.016207,0.038102,0.03996,-0.037862
1,-0.053342,-0.030042,0.300961,0.037272,-0.043938,0.162153,0.364861,0.473173,-0.280273,0.187125,0.043578,0.46993,-0.401227
2,0.067324,0.209378,-0.488895,-0.183207,0.061814,-0.008997,0.075243,0.223319,-0.321764,-0.213643,0.027943,-0.215591,-0.116491
3,0.006702,0.147343,-0.216023,-0.139833,-0.006828,0.309531,0.096934,-0.153047,0.517755,0.613425,0.061855,-0.109323,-0.296137
4,-0.130274,-0.093783,0.28845,0.046795,-0.003427,0.203729,0.189901,0.268172,-0.153149,0.087776,0.094731,-0.814687,0.031361
5,0.03769,0.054005,0.003808,0.259605,0.649975,0.010883,0.032363,-0.002725,0.009788,0.039455,0.012171,-0.014645,-0.1373
6,0.001663,0.031699,-0.008849,0.22493,0.667618,0.084028,0.05343,-0.026607,0.016149,-0.023773,-0.034973,0.04755,0.134377
7,-0.252279,-0.186655,0.018404,-0.422149,0.139148,0.335724,0.042802,0.032871,0.045169,-0.053678,-0.075796,0.153966,0.392699
8,0.125683,-0.074473,0.21883,-0.614743,0.23646,0.03781,-0.028967,0.135682,0.028923,0.041738,-0.088507,-0.005137,0.013667
9,-0.013455,-0.049665,-0.013539,-0.105679,0.0354,-0.298059,0.298704,-0.447421,-0.523998,0.50649,-0.21199,-0.023093,0.167498
