In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

## Principal Components Analysis

In [None]:
# Relative path of the sp500_data.csv.gz file that is read with pd.read_csv below.
SP500_DATA_CSV = '../data/sp500_data.csv.gz'

- 셰브런(Chevron, CVX)과 엑슨모빌(Exxon Mobil, XOM)의 주가 수익 데이터

In [None]:
# Read the full table, using the first CSV column as the row index.
sp500_px = pd.read_csv(SP500_DATA_CSV, index_col=0)

# Keep only the two oil tickers, in XOM, CVX column order.
oil_px = sp500_px.loc[:, ['XOM', 'CVX']]

# Show the first five rows as plain text.
print(oil_px.head(5))

In [None]:
# Positional x-coordinates 0..n-1, one per row of oil_px, used as the x-axis of the
# line plots below. list(range(...)) replaces an identity list comprehension.
oil_x = list(range(len(oil_px.index)))

In [None]:
# XOM values against row position.
# A fresh figure is created instead of plt.figure('original'): pyplot returns the
# already-open figure when a label is reused, so a shared label can make separate
# cells draw on top of each other under interactive backends.
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(oil_x, oil_px['XOM'])
ax.set_title('XOM')
ax.set_xlabel('Row index')
plt.show()

In [None]:
# CVX values against row position.
# A fresh figure is created instead of plt.figure('original'): pyplot returns the
# already-open figure when a label is reused, so a shared label can make separate
# cells draw on top of each other under interactive backends.
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(oil_x, oil_px['CVX'])
ax.set_title('CVX')
ax.set_xlabel('Row index')
plt.show()

In [None]:
# XOM (x) versus CVX (y), restricted to the [-3, 3] window on both axes.
# A fresh figure is created instead of plt.figure('original'): pyplot returns the
# already-open figure when a label is reused, so a shared label can make separate
# cells draw on top of each other under interactive backends.
fig, ax = plt.subplots(figsize=(4, 4))
ax.scatter(oil_px['XOM'], oil_px['CVX'])
ax.set_xlim(-3, 3)
ax.set_ylim(-3, 3)
# Label the axes so the figure can be read without looking at the code.
ax.set_xlabel('XOM')
ax.set_ylabel('CVX')
plt.show()

In [None]:
# Fit a two-component PCA on the two oil columns (fit() returns the estimator).
pcs = PCA(n_components=2).fit(oil_px)

# One row per principal component, one column per ticker.
loadings = pd.DataFrame(data=pcs.components_, columns=oil_px.columns)
print(loadings)

첫 번째 주성분 - 두 석유 회사 사이의 상관관계를 반영하는, 본질적으로 CVX와 XOM의 평균 (두 부하량의 부호가 같음 → 두 주가가 함께 움직이는 방향, 양의 상관관계)

두 번째 주성분 - 두 회사의 주가가 서로 달라지는 지점을 반영 (두 부하량의 부호가 반대 → 두 주가가 서로 반대로 움직이는 방향)

In [None]:
def abline(slope, intercept, ax):
    """Return the points of the line y = intercept + slope * x at the x-limits of ax.

    Parameters
    ----------
    slope : number
        Slope of the line.
    intercept : number
        Value of y at x = 0.
    ax : object with a ``get_xlim()`` method (e.g. a matplotlib Axes)
        Its current x-limits are used as the x-coordinates.

    Returns
    -------
    tuple
        ``(xs, ys)`` as NumPy arrays, ready to be unpacked into ``ax.plot``.
    """
    xs = np.array(ax.get_xlim())
    ys = slope * xs + intercept
    return xs, ys

# Scatter of the two tickers with both principal directions overlaid.
ax = oil_px.plot.scatter(x='XOM', y='CVX', alpha=0.3, figsize=(4, 4))
ax.set_xlim(-3, 3)
ax.set_ylim(-3, 3)

# Each component row gives a direction (XOM, CVX); CVX / XOM is its slope in this
# plane. The lines are drawn through the origin (intercept 0).
for pc in (0, 1):
    slope = loadings.loc[pc, 'CVX'] / loadings.loc[pc, 'XOM']
    ax.plot(*abline(slope, 0, ax), '--', color='C1')

plt.tight_layout()
plt.show()

- 스크리 그래프 (screeplot)

In [None]:
# Sixteen tickers, kept in alphabetical order.
syms = sorted([
    'AAPL', 'MSFT', 'CSCO', 'INTC',
    'CVX', 'XOM', 'SLB', 'COP',
    'JPM', 'WFC', 'USB', 'AXP',
    'WMT', 'TGT', 'HD', 'COST',
])

# Rows whose index label compares >= '2011-01-01'
# (presumably ISO-formatted date strings -- confirm against the CSV).
top_sp = sp500_px.loc[sp500_px.index >= '2011-01-01', syms]

# Full PCA: no n_components given, so no component limit is requested.
sp_pca = PCA().fit(top_sp)

# Bar chart of the variance explained by the first ten components.
explained_variance = pd.DataFrame(sp_pca.explained_variance_)
ax = explained_variance.iloc[:10].plot.bar(legend=False, figsize=(4, 4))
ax.set(xlabel='Component')

plt.tight_layout()
plt.show()

In [None]:
# Compare the types produced by no selection, list selection and scalar selection.
print(type(top_sp), type(top_sp[['AAPL']]), type(top_sp['AAPL']), sep='\n')

In [None]:
# List-style selection ([['AAPL']]): contents, shape and index, one per line.
print(
    top_sp[['AAPL']],
    top_sp[['AAPL']].shape,
    top_sp[['AAPL']].index,
    sep='\n',
)

In [None]:
# Scalar-style selection (['AAPL']): contents, shape and index, one per line.
print(
    top_sp['AAPL'],
    top_sp['AAPL'].shape,
    top_sp['AAPL'].index,
    sep='\n',
)

In [None]:
# Loadings of the first five components of sp_pca, one column per ticker.
# NOTE(review): this rebinds `loadings`, which earlier held the two-component
# XOM/CVX loadings; re-running the earlier oil plot cell after this one would
# pick up these values instead.
loadings = pd.DataFrame(data=sp_pca.components_[:5], columns=top_sp.columns)
print(loadings.head(5))

In [None]:
# Shared symmetric y-range for every panel: largest absolute loading plus 1%.
# .loc slicing is label-based and includes the end label.
maxPC = 1.01 * np.abs(loadings.loc[0:5, :].to_numpy()).max()

# One stacked panel per component, all sharing the ticker x-axis.
f, axes = plt.subplots(nrows=5, ncols=1, figsize=(5, 5), sharex=True)

for i, ax in enumerate(axes):
    pc_loadings = loadings.loc[i]
    # Positive loadings in the first cycle colour, everything else in the second.
    colors = ['C0' if weight > 0 else 'C1' for weight in pc_loadings]
    ax.axhline(color='#888888')  # zero reference line
    pc_loadings.plot.bar(ax=ax, color=colors)
    ax.set(ylabel=f'PC{i+1}', ylim=(-maxPC, maxPC))

plt.tight_layout()
plt.show()