## Лабораторная работа №15. Решение задач понижения размерности. Метод главных компонент.

Используемый набор данных: [Dow Jones Index Data Set](https://archive.ics.uci.edu/ml/datasets/Dow+Jones+Index)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import os
import requests
from zipfile import ZipFile

%matplotlib inline
pd.options.display.max_columns = None

In [2]:
def downloadFile(url, filePath):
    """Download ``url`` to ``filePath`` unless the file already exists.

    Parameters
    ----------
    url : str
        Address of the resource to fetch with an HTTP GET request.
    filePath : str
        Destination path. If something already exists at this path the
        function returns immediately and performs no network access.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Nothing is written in
        that case, so a failed download is not cached as a broken file.
    """
    if os.path.exists(filePath):
        return

    req = requests.get(url, timeout=60)
    # Fail before touching the disk: otherwise an HTML error page would be
    # stored under filePath and reused on every later run.
    req.raise_for_status()

    # Create the destination directory on first use (dirname is "" for a
    # bare file name, in which case there is nothing to create).
    targetDir = os.path.dirname(filePath)
    if targetDir:
        os.makedirs(targetDir, exist_ok=True)

    # The context manager guarantees the handle is flushed and closed; the
    # previous code referenced ``f.close`` without calling it.
    with open(filePath, "wb") as f:
        f.write(req.content)

# Base location of the Dow Jones Index archive on the UCI repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00312/"

# Make sure the local data directory exists before anything is written to it.
os.makedirs("dataset", exist_ok=True)

# `url` already ends with "/", so the file name is appended without another
# leading slash (the previous concatenation produced ".../00312//dow_...").
downloadFile(url + "dow_jones_index.zip", "dataset/dow_jones_index.zip")

# Unpack the archive next to the downloaded zip.
with ZipFile("dataset/dow_jones_index.zip", "r") as arch:
    arch.extractall("dataset")

In [3]:
# Column names applied to the CSV; the file's own first row is skipped below.
headers = [
    "quarter",
    "stock",
    "date",
    "open",
    "high",
    "low",
    "close",
    "volume",
    "percent_change_price",
    "percent_change_volume_over_last_wk",
    "previous_weeks_volume",
    "next_weeks_open",
    "next_weeks_close",
    "percent_change_next_weeks_price",
    "days_to_next_dividend",
    "percent_return_next_dividend",
]

# Read the raw table, mark the ticker as categorical and discard the
# "date" and "quarter" columns in a single chain.
data = (
    pd.read_csv("dataset/dow_jones_index.data", names=headers, header=None, skiprows=[0])
    .astype({"stock": "category"})
    .drop(columns=["date", "quarter"])
)

# Inspect column types and the number of missing values per column.
display(data.dtypes)
display(data.isna().sum())

stock                                 category
open                                    object
high                                    object
low                                     object
close                                   object
volume                                   int64
percent_change_price                   float64
percent_change_volume_over_last_wk     float64
previous_weeks_volume                  float64
next_weeks_open                         object
next_weeks_close                        object
percent_change_next_weeks_price        float64
days_to_next_dividend                    int64
percent_return_next_dividend           float64
dtype: object

stock                                  0
open                                   0
high                                   0
low                                    0
close                                  0
volume                                 0
percent_change_price                   0
percent_change_volume_over_last_wk    30
previous_weeks_volume                 30
next_weeks_open                        0
next_weeks_close                       0
percent_change_next_weeks_price        0
days_to_next_dividend                  0
percent_return_next_dividend           0
dtype: int64

Преобразуем данные.

In [4]:
# Price columns are loaded as object dtype because the values carry a "$"
# sign. Strip it as a literal character (regex=False: "$" is otherwise a
# regex end-of-string anchor and the default of `regex` differs between
# pandas versions), then convert to float.
for col in ["open", "high", "low", "close", "next_weeks_open", "next_weeks_close"]:
    data[col] = data[col].str.replace("$", "", regex=False).astype(float)

# Fill the missing values of the volume-change columns with the column mode.
# Assign the result back instead of calling fillna(inplace=True) on
# `data[col]`: that is a chained in-place update, which does not modify
# `data` when pandas Copy-on-Write is active.
for col in ["percent_change_volume_over_last_wk", "previous_weeks_volume"]:
    data[col] = data[col].fillna(data[col].mode()[0])

display(data.isna().sum())
display(data.dtypes)

stock                                 0
open                                  0
high                                  0
low                                   0
close                                 0
volume                                0
percent_change_price                  0
percent_change_volume_over_last_wk    0
previous_weeks_volume                 0
next_weeks_open                       0
next_weeks_close                      0
percent_change_next_weeks_price       0
days_to_next_dividend                 0
percent_return_next_dividend          0
dtype: int64

stock                                 category
open                                   float64
high                                   float64
low                                    float64
close                                  float64
volume                                   int64
percent_change_price                   float64
percent_change_volume_over_last_wk     float64
previous_weeks_volume                  float64
next_weeks_open                        float64
next_weeks_close                       float64
percent_change_next_weeks_price        float64
days_to_next_dividend                    int64
percent_return_next_dividend           float64
dtype: object

In [5]:
# Number of principal components to extract.
n_components = 3

# Feature matrix: every column except the categorical ticker symbol.
X = data.drop("stock", axis=1).copy()

# NOTE(review): the features are passed to PCA without any scaling step in
# this cell; if the columns are on very different scales the leading
# component will be dominated by the largest-variance column — confirm this
# is intended.
pca = PCA(n_components=n_components)
pca_data = pca.fit_transform(X)


Определим, скольких компонент хватит, чтобы объяснить 90% дисперсии.

In [6]:
# Find the smallest number of leading components whose *cumulative*
# explained-variance ratio reaches the threshold. Checking each component's
# individual ratio (as before) only gives an answer when a single component
# already explains 90% on its own.
variance_threshold = 0.9
cumulative_ratios = np.cumsum(pca.explained_variance_ratio_)

for i, variance_ratio in enumerate(cumulative_ratios):
    if variance_ratio >= variance_threshold:
        display("Component {}, variance_ratio: {}".format(i + 1, variance_ratio))
        # The first hit is the answer; later components only add to the sum.
        break
else:
    # None of the fitted components reach the threshold together.
    display(
        "{} components explain only {} of the variance".format(
            len(cumulative_ratios), cumulative_ratios[-1]
        )
    )

'Component 1, variance_ratio: 0.9095995167792645'

Определим корреляцию Пирсона между первой компонентой и индексом Доу-Джонса.

In [7]:
# Price columns to compare against the first principal component.
cols = ["open", "high", "low", "close"]

# Scores of every observation along the first principal component.
first_component = pca_data[:, 0]

# Single-row table: one Pearson coefficient per price column. The
# off-diagonal entry [0, 1] of the 2x2 correlation matrix is the coefficient
# between the two inputs.
pearsonData = [[]]
for col in cols:
    corr_matrix = np.corrcoef(first_component, data[col])
    pearsonData[0].append(corr_matrix[0, 1])

pd.DataFrame(data=pearsonData, columns=cols)

Unnamed: 0,open,high,low,close
0,-0.529777,-0.529008,-0.530383,-0.530174
