###### excel -> csv 변환

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
pwd

'C:\\MarketCaster\\Industrial Index'

In [3]:
# 년, 월, 일로 표기된 DATE를 처리하기 위한 function
# input 인자인 dataFrame은 AMD['DATE']와 같이 dataframe 내 컬럼을 명시해주어야 함
# 2019년 12월 4일 -> 2019-12-04
def date_preprocessing(dataFrame):
    i = 0
    for date in dataFrame:
        try:
            # 띄어쓰기 제거
            tmp1, tmp2, tmp3 = date.split(' ')
            # 연, 월, 일 조립
            tmp = tmp1[:4] + '-' + tmp2[:2] + '-' + tmp3[:2]
        except:
            tmp = date
        # reset
        dataFrame.loc[i] = pd.to_datetime(tmp)
        i = i + 1

In [4]:
# 거래량이 M(Milion) 단위로 표기된 VOLUME을 처리하기 위한 function
# input 인자인 dataFrame은 AMD['VOLUME']와 같이 dataframe 내 컬럼을 명시해주어야 함
# 10.00M -> 10000000
def volume_preprocessing(dataFrame):
    i = 0
    for volume in dataFrame:
        if type(volume) == str:
            if volume[-1] == 'M':
                volume = int(float(volume[:-1]) * 1000000)
            elif volume == '-':
                volume = 0
            dataFrame.loc[i] = int(volume)
        elif volume == np.nan:
            pass
        else:
            try:
                dataFrame.loc[i] = int(dataFrame.loc[i])
            except:
                pass
        i = i + 1

In [5]:
# 일별수익률에 %기호가 붙어있는 경우 사용
def change_preprocessing(dataFrame):
    i = 0
    for change in dataFrame:
        try:
            # %기호 제거
            if change[-1] == '%':
                change = change[:-1]
        except:
            pass
        try:
            dataFrame.loc[i] = float(change)
        except:
            print(type(change), ' ', i)
        i = i + 1

##### 시계열 데이터 from to 설정

In [6]:
# 사용할 데이터의 기간 설정
# date_from < 사용할 기간 < date_to
date_from = pd.to_datetime('2006-01-01')
date_to   = pd.to_datetime('2019-11-01')

# investing.com 데이터 중 비영업일이 영업일로 체크된 날짜 설정
investing_ignore_date = pd.to_datetime('2016-02-27')

##### Directory 위치 설정

In [19]:
# data file이 위치한 directory 설정
path = './Data/Taiwan/'
csv_path = './Data/CSV/'
# 최종 data name 설정
describe_file_name = 'describe_Taiwan.csv'

In [8]:
# directory 내 전체 file list 추출
file_list = os.listdir(path)

In [9]:
# csv file 추출
file_list_csv = [file for file in file_list if file.endswith(".csv")]

In [10]:
# xlsx file 추출
file_list_xlsx = [file for file in file_list if file.endswith(".xlsx")]

In [11]:
# 파일명을 저장하기 위한 list 선언
df_name_list = []
# dataFrame을 저장하기 위한 list 선언
df_list = []

In [12]:
# directory 내 csv file read
for file in file_list_csv:
    df_list.append(pd.read_csv(path+file, encoding='CP949'))
    df_name_list.append(file[:-4])

In [13]:
# directory 내 xlsx file read
for file in file_list_xlsx:
    df_list.append(pd.read_excel(path+file, sheet_name='Sheet1'))
    df_name_list.append(file[:-5])

In [14]:
df_list

[           DATE     OPEN     HIGH      LOW    CLOSE     VOLUME  CHANGE
 0    2019-12-02  10.0633  10.0960  10.0469  10.0797   25516808  0.8858
 1    2019-11-29  10.1222  10.1386   9.9912   9.9912   32423562 -1.5605
 2    2019-11-28  10.1659  10.2315  10.1167  10.1495   19718982 -0.4170
 3    2019-11-27  10.0773  10.2084  10.0773  10.1920   18467648  1.2199
 4    2019-11-26  10.1676  10.1840  10.0692  10.0692  101916624  0.0820
 ...         ...      ...      ...      ...      ...        ...     ...
 3433 2006-01-09   1.9789   2.0477   1.9789   2.0388   79241392  4.7635
 3434 2006-01-06   1.9253   1.9461   1.9223   1.9461   69380192  1.7525
 3435 2006-01-05   1.8946   1.9125   1.8707   1.9125   45456860  1.9777
 3436 2006-01-04   1.8666   1.8754   1.8340   1.8754   55472680  1.7457
 3437 2006-01-03   1.7876   1.8433   1.7729   1.8433   71567648  3.6518
 
 [3438 rows x 7 columns]]

In [15]:
df_name_list

['TSMC']

In [16]:
df_columns = df_list[0].columns

In [17]:
tmp_df_list = []
for df in df_list:
    # 날짜 데이터 일원화
    date_preprocessing(df['DATE'])
    # 거래량 데이터 일원화
    volume_preprocessing(df['VOLUME'])
    # 일별수익률 데이터 일원화
    change_preprocessing(df['CHANGE'])
    df['CHANGE'] = df['CHANGE'].apply(pd.to_numeric)
    
    # 데이터 기간 일원화
    tmp_df = df[df['DATE'] < date_to]
    
    # investing.com에서 영업일로 표기된 날짜에 대한 index 추출 및 행 삭제
    try:
        index = df[df['DATE'] == investing_ignore_date].index[0]
        tmp_df = tmp_df.drop(index, axis = 0)
        tmp_df_list.append(tmp_df)
    except:
        tmp_df_list.append(tmp_df)
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


-------

In [None]:
describe_df_list = pd.DataFrame(columns=df_columns[1:])
i = 0
for df in tmp_df_list:
    date_data = pd.DataFrame([[df_name_list[i], 'Date_From', df['DATE'].min(), 'Date_To', df['DATE'].max(), len(df['DATE'])]],
                             columns=df_columns[1:])
    describe_df_list = pd.concat([describe_df_list, date_data], axis = 0, sort=False)
    describe_df_list = pd.concat([describe_df_list, df.describe()], axis = 0, sort=False)
    i += 1

In [None]:
describe_df_list.to_csv(describe_file_name)

In [18]:
tmp_df_list[0]

Unnamed: 0,DATE,OPEN,HIGH,LOW,CLOSE,VOLUME,CHANGE
22,2019-10-31,9.8393,9.8885,9.7901,9.7901,43123512,-0.5431
23,2019-10-30,9.8107,9.8436,9.7450,9.8436,32370342,0.6417
24,2019-10-29,9.7316,9.7808,9.7153,9.7808,37130132,1.5243
25,2019-10-28,9.6503,9.6667,9.6176,9.6339,20216560,0.3932
26,2019-10-25,9.6289,9.6289,9.5635,9.5962,25082712,0.2132
...,...,...,...,...,...,...,...
3433,2006-01-09,1.9789,2.0477,1.9789,2.0388,79241392,4.7635
3434,2006-01-06,1.9253,1.9461,1.9223,1.9461,69380192,1.7525
3435,2006-01-05,1.8946,1.9125,1.8707,1.9125,45456860,1.9777
3436,2006-01-04,1.8666,1.8754,1.8340,1.8754,55472680,1.7457


In [20]:
# save csv file
tmp_df_list[0].to_csv(csv_path+df_name_list[0]+'.csv', index=False)