In [1]:
import pandas as pd
import numpy as np
import pandas_datareader as pdr
import matplotlib.pyplot as plt
import FinanceDataReader as fdr
import matplotlib
# Plot styling: use a Korean-capable font so Hangul labels render in figures.
matplotlib.rcParams.update({
    'font.family': 'Malgun Gothic',  # Windows font; on macOS switch this to 'AppleGothic'
    'font.size': 12,  # text size
    'axes.unicode_minus': False,  # keeps the minus sign from rendering as a broken glyph with a Korean font
})

## 파일 읽어오는 함수

In [2]:
def Read_data(filename):
    """Load ``<filename>.csv`` into a DataFrame indexed by its 'Date' column.

    Parameters
    ----------
    filename : str
        Path of the CSV file without the ``.csv`` extension. The same value
        is stored in every row of the added 'Name' column.

    Returns
    -------
    pandas.DataFrame
        The file contents with 'Date' as the index, sorted in ascending index
        order, plus a 'Name' column identifying the series.
    """
    csv_path = "{}.csv".format(filename)
    frame = pd.read_csv(csv_path).set_index('Date').sort_index(ascending=True)
    # Identifier column: later steps read it to label each series.
    frame['Name'] = filename
    return frame

#### Test

In [3]:
# Load each series from its CSV file (UK.csv, KS200.csv, OIL.csv, CD91.csv).
UK_df, KS200_df, OIL_df, CD91_df = [
    Read_data(series_name) for series_name in ("UK", "KS200", "OIL", "CD91")
]

In [4]:
# Preview the first rows of the CD91 frame.
CD91_df.head()

Unnamed: 0_level_0,Close,Name
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-03-02,3.61,CD91
2023-03-03,3.61,CD91
2023-03-06,3.61,CD91
2023-03-07,3.61,CD91
2023-03-08,3.61,CD91


## 파일 저장 후 그 파일의 인덱스 맞추는 함수

In [5]:
def array_index(*dfs):
    """Align every frame to the index of the longest input frame.

    The frame with the most rows is the reference (the first one wins on a
    tie). Each frame is reindexed onto the reference index, and the gaps this
    creates are forward-filled from the previous row.

    Caveats of aligning to a single reference index:
      * rows whose label does not occur in the reference index are dropped;
      * gaps at the very start of a frame stay NaN, because there is no
        earlier row to forward-fill from.

    Parameters
    ----------
    *dfs : pandas.DataFrame
        Frames to align.

    Returns
    -------
    list of pandas.DataFrame
        New aligned frames in input order; an empty list when no frame is
        passed.
    """
    if not dfs:
        # max() raises on an empty sequence; nothing to align.
        return []

    longest_df = max(dfs, key=len)  # reference frame: the one with the most rows

    # .ffill() replaces the deprecated fillna(method='ffill') call form.
    return [df.reindex(longest_df.index).ffill() for df in dfs]

#### Test

In [6]:
# Row counts of each frame before the indexes are aligned.
len(UK_df), len(KS200_df), len(OIL_df), len(CD91_df)

(58, 55, 57, 54)

In [7]:
# Reindex every frame onto the index of the longest one, forward-filling the gaps.
UK_df, KS200_df, OIL_df, CD91_df = array_index(UK_df, KS200_df, OIL_df, CD91_df)

In [8]:
# Preview the first rows of the UK frame after index alignment.
UK_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Name
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-03-01,1321.599976,1323.77002,1298.069946,1321.599976,1321.599976,0,UK
2023-03-02,1305.72998,1317.0,1304.079956,1305.72998,1305.72998,0,UK
2023-03-03,1311.060059,1311.290039,1298.199951,1311.060059,1311.060059,0,UK
2023-03-06,1294.780029,1301.160034,1293.589966,1294.780029,1294.780029,0,UK
2023-03-07,1299.0,1315.810059,1296.660034,1299.0,1299.0,0,UK


## 종가(Close), Name 데이터 중 NaN 값을 그 다음 날짜 값으로 채워넣는 함수

In [9]:
def fill_Close_data(*dfs):
    """Back-fill NaN values in the 'Close' and 'Name' columns of each frame.

    A missing value is replaced by the next available value further down the
    same column. Only 'Close' and 'Name' are touched; every other column is
    left as is. NaNs at the end of a column have no later value to copy and
    therefore remain.

    The input frames are not modified: each result is a new DataFrame.

    Parameters
    ----------
    *dfs : pandas.DataFrame
        Frames that each contain 'Close' and 'Name' columns.

    Returns
    -------
    list of pandas.DataFrame
        Filled copies in input order.
    """
    # assign() builds a new frame, so the caller's objects stay untouched;
    # .bfill() replaces the deprecated fillna(method='bfill') call form.
    return [
        df.assign(Close=df['Close'].bfill(), Name=df['Name'].bfill())
        for df in dfs
    ]

#### Test

In [10]:
# Back-fill the remaining NaNs in the Close and Name columns of each frame.
UK_df , KS200_df, OIL_df, CD91_df = fill_Close_data(UK_df, KS200_df, OIL_df, CD91_df)

In [11]:
# Preview the first rows of the UK frame after back-filling.
UK_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Name
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-03-01,1321.599976,1323.77002,1298.069946,1321.599976,1321.599976,0,UK
2023-03-02,1305.72998,1317.0,1304.079956,1305.72998,1305.72998,0,UK
2023-03-03,1311.060059,1311.290039,1298.199951,1311.060059,1311.060059,0,UK
2023-03-06,1294.780029,1301.160034,1293.589966,1294.780029,1294.780029,0,UK
2023-03-07,1299.0,1315.810059,1296.660034,1299.0,1299.0,0,UK


In [12]:
UK_df['Close'].isnull().any(), KS200_df['Close'].isnull().any(), OIL_df['Close'].isnull().any(), CD91_df['Close'].isnull().any()
# Each entry is True if that frame's Close column contains at least one NaN, otherwise False

(False, False, False, False)

In [13]:
# Each entry is True if that frame's Name column contains at least one NaN, otherwise False.
UK_df['Name'].isnull().any(), KS200_df['Name'].isnull().any(), OIL_df['Name'].isnull().any(), CD91_df['Name'].isnull().any()

(False, False, False, False)

## 데이터 처리 된 파일 저장

In [14]:
def get_Data_SaveCsv(df, date, filename):
    """Write the rows of ``df`` from ``date`` onward to ``<filename>.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to export, e.g. OIL_df or KS200_df.
    date : str
        Start label of the export, e.g. '2023-01-01'. Rows from this index
        label through the last row are written.
    filename : str
        Name of the output file without the ``.csv`` extension, e.g. 'OIL'.

    Returns
    -------
    The value returned by ``DataFrame.to_csv`` for the written file.
    """
    rows_from_start = df.loc[date:]
    csv_path = '{}.csv'.format(filename)
    return rows_from_start.to_csv(csv_path, index=True)

In [15]:
# Tuple of all processed frames, in the order they are saved.
Total_df = (UK_df, KS200_df, OIL_df, CD91_df)

In [16]:
# Save every processed frame under the name stored in its 'Name' column.
# Read_data sets 'Name' to the name the file was loaded from, so this
# overwrites the CSV files that were read at the top of the notebook.
for frame in Total_df:
    series_name = frame['Name'].iloc[0]
    get_Data_SaveCsv(frame, "2023-01-01", series_name)

## 상관관계 분석(인덱스, 데이터 다 맞아야함)

In [17]:
import scipy.stats # 상관관계 분석 import

In [18]:
# Pearson correlation (statistic and p-value) between the KS200 and UK Close columns.
scipy.stats.pearsonr(KS200_df['Close'], UK_df['Close'])

PearsonRResult(statistic=0.5070005866795846, pvalue=4.878277317690711e-05)

In [19]:
def anal_scipy(*dfs):
    """Compute the Pearson correlation of 'Close' for every pair of frames.

    Each unordered pair is evaluated once, in input order (first with second,
    first with third, ..., second with third, ...).

    Parameters
    ----------
    *dfs : pandas.DataFrame
        Frames that each contain 'Close' and 'Name' columns. The first value
        of 'Name' is used as the label of a frame.

    Returns
    -------
    list of tuple
        One ``(label, result)`` entry per pair, where ``label`` has the form
        ``"UK, KS200"`` and ``result`` is what ``scipy.stats.pearsonr``
        returns for the two 'Close' columns.
    """
    pair_results = []
    for position, left in enumerate(dfs):
        for right in dfs[position + 1:]:
            # Label such as "UK, KS200", built from each frame's first Name value.
            label = str(left['Name'].iloc[0]) + ", " + str(right['Name'].iloc[0])
            correlation = scipy.stats.pearsonr(left['Close'], right['Close'])
            pair_results.append((label, correlation))

    return pair_results

In [20]:
# Pairwise Pearson correlation of the Close column for every pair of frames.
anal_scipy(UK_df, KS200_df, OIL_df, CD91_df)

[('UK, KS200',
  PearsonRResult(statistic=0.5070005866795846, pvalue=4.878277317690711e-05)),
 ('UK, OIL',
  PearsonRResult(statistic=-0.2831039947611312, pvalue=0.03128820682475924)),
 ('UK, CD91',
  PearsonRResult(statistic=-0.17897795388878196, pvalue=0.17886216934713917)),
 ('KS200, OIL',
  PearsonRResult(statistic=0.1474624844848569, pvalue=0.26931308135635246)),
 ('KS200, CD91',
  PearsonRResult(statistic=-0.6677514130260247, pvalue=1.0305506842095527e-08)),
 ('OIL, CD91',
  PearsonRResult(statistic=-0.4781424069809705, pvalue=0.000147037726580906))]