### 第一部分：
- 基本統計量說明: Raw Data在當沖前/當沖後的各種基本統計量陳述（平均數、中位數、min、max、標準差），對Raw Data有個基本的概念

### 第二部分：
- 當沖前/當沖後的各種基本統計量的檢定（ex:現股當沖比重、日報酬率、週報酬率、月報酬率）
- 做平均數檢定、中位數檢定，看看當沖前/當沖後這些基本統計量有沒有顯著變化

### 第三部分：
- 政策面的研究。政府開放當沖是為了縮小spread、提高成交量、提高成交量週轉率。
- 研究data做的回歸是不是符合政府所宣稱的?當沖真的有穩定市場嗎？對資本市場有貢獻嗎？
- <span style="color:red">價量資料</span>

### 第四部分：
- 市場面研究。有沒有其他在市場上面的因素會影響我們的回歸式？要控制這些變數，放入回歸式（很多x)

### $ \Delta $ $上市公司日報酬率標準差_i$ = $a_0$ + $a_1$ * (上市公司現股當沖比重平均)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import statsmodels.stats.api as sms
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
import datetime as datetime
import json
import os
import csv
from datetime import timedelta
from openpyxl import load_workbook
warnings.filterwarnings("ignore")
%matplotlib inline

from sklearn.linear_model import LinearRegression
from scipy import stats

In [218]:
class tw_day_trade():
    def __init__(self):
        self.pos_1 = ""
        self.pos_2 = ""
        self.pos_3 = ""
        self.df_before = "" 
        self.df_after = ""  
        self.basic_stats_info_before = []
        self.basic_stats_info_after = []
        self.basic_stats_info_before_vol = []
        self.basic_stats_info_after_vol = []
        self.basic_stock_stats_info_dict = {}
        self.x = "" 
        self.y_daily = "" 
        self.y_weekly = "" 
        self.y_monthly = "" 
        self.day_trade_data = ""
        self.excel_data = {}

        """
        當沖相關重要日期們
        
        第一階段：2014/1/6 開放單向當沖200檔
        「台灣50、台灣中型100、富櫃50」200檔。
        
        第二階段：2014/6/30 開放雙向當沖
        仍為「台灣50、台灣中型100、富櫃50」200檔。
        
        第三階段：2015/6/1 新增「得發行認購(售)權證標的」及「ETF」亦得為當沖標的
        此時有377檔。
        
        第四階段：2016/2/1 開放「所有可做融資融券」的股票
        上市櫃共1432檔。
        
        第五階段：2017/4/28 調降當沖稅率
        """

    def read_raw_csv(self, str_filename):
        """
        讀進我們需要的raw data
        """
        self.day_trade_data = pd.read_csv(str_filename)
        print("Read Data Successfully")
        print("data shape: {}".format(self.day_trade_data.shape))

        
    def day_trade_split(self, date_1, date_2, date_3):
        """
        根據date_1, date_2, date_3來切割dataframe
        """
        tmp_arr = np.array(list(self.day_trade_data))
        self.pos_1 = np.where(tmp_arr==date_1)[0][0]
        self.pos_2 = np.where(tmp_arr==date_2)[0][0]
        self.pos_3 = np.where(tmp_arr==date_3)[0][0]
        #print("date_1 pos: {}\ndate_2 pos: {}\ndate_3 pos: {}".format(self.pos_1,self.pos_2,self.pos_3))
        
        self.df_before = self.day_trade_data.iloc[:,self.pos_1:self.pos_2]
        self.df_after = self.day_trade_data.iloc[:,self.pos_2:(self.pos_3+1)]
        #print("df_before shape: {}".format(self.df_before.shape))
        #print("df after shape: {}".format(self.df_after.shape))
        print(self.df_before)
          
    
    def day_trade_stats(self,flag):
        """
        第一部分：計算當沖前/當沖後的基本統計量，對Raw Data有個基本概念
        """
        
        # 平均數 (axis=1橫條往右apply func., axis=0是直條往下apply func.)
        # 取axis=1, 每間公司在一段期間內為一個單位
        self.df_before['mean_before'] = self.df_before.mean(axis=1)
        self.df_after['mean_after'] = self.df_after.mean(axis=1)
        self.df_before['std_before'] = self.df_before.std(axis=1)
        self.df_after['std_after'] = self.df_after.std(axis=1)
        self.df_before['median_before'] = self.df_before.median(axis=1)
        self.df_after['median_after'] = self.df_after.median(axis=1)
        
        # count, mean, std, min, 25%, 50%, 75%, max
        for i in range(0,4):
            self.basic_stats_info_before.append(self.df_before['mean_before'][i::4].describe())
            self.basic_stats_info_after.append(self.df_after['mean_after'][i::4].describe())
        #print(self.basic_stats_info_before)
        print(self.basic_stats_info_after)
        
        """
        第二部分：Raw Data各種基本統計量檢定
        """
        # 日/週/月報酬率平均數檢定 
        # [t-statistics, p-value], 取p-value存入字典
        self.basic_stock_stats_info_dict['tttest_mean_daily'] = stats.ttest_ind(self.df_before['mean_before'][0::4],self.df_after['mean_after'][0::4])[1]
        self.basic_stock_stats_info_dict['tttest_mean_weekly'] = stats.ttest_ind(self.df_before['mean_before'][1::4],self.df_after['mean_after'][1::4])[1]
        self.basic_stock_stats_info_dict['ttest_mean_monthly'] = stats.ttest_ind(self.df_before['mean_before'][2::4],self.df_after['mean_after'][2::4])[1]
        # 日/週/月報酬率標準差檢定
        # [t-statistics, p-value], 取p-value存入字典
        self.basic_stock_stats_info_dict['ttest_std_daily'] = stats.ttest_ind(self.df_before['std_before'][0::4],self.df_after['std_after'][0::4])[1]
        self.basic_stock_stats_info_dict['ttest_std_weekly'] = stats.ttest_ind(self.df_before['std_before'][1::4],self.df_after['std_after'][1::4])[1]
        self.basic_stock_stats_info_dict['ttest_std_monthly'] = stats.ttest_ind(self.df_before['std_before'][2::4],self.df_after['std_after'][2::4])[1]       
        # 日/週/月報酬率中位數檢定
        # [t-statistics, p-value], 取p-value存入字典
        self.basic_stock_stats_info_dict['ttest_median_daily'] = stats.ttest_ind(self.df_before['median_before'][0::4],self.df_after['median_after'][0::4])[1]
        self.basic_stock_stats_info_dict['ttest_median_weekly'] = stats.ttest_ind(self.df_before['median_before'][1::4],self.df_after['median_after'][1::4])[1]
        self.basic_stock_stats_info_dict['ttest_median_monthly'] = stats.ttest_ind(self.df_before['median_before'][2::4],self.df_after['median_after'][2::4])[1]

        #print(json.dumps(self.basic_stock_stats_info_dict, indent=2))

    
    def write_excel_reports_prepro(self):
        arr_weight = []
        arr_day = []
        arr_week = []
        arr_month = []
        arr_vol = []
        
        self.excel_data = {
            'weight': arr_weight,
            'day': arr_day,
            'week': arr_week,
            'month': arr_month,
            'vol': arr_vol
        }
        
        for outside_item in range(len(self.basic_stats_info_before)):
            if outside_item == 0:# day
                for j1 in range(1,len(self.basic_stats_info_before[0])):
                    arr_day.append(self.basic_stats_info_before[outside_item][j1])
            if outside_item == 1:# week
                for j2 in range(1,len(self.basic_stats_info_before[0])):
                    arr_week.append(self.basic_stats_info_before[outside_item][j2])
            if outside_item == 2:# month
                for j3 in range(1,len(self.basic_stats_info_before[0])):
                    arr_month.append(self.basic_stats_info_before[outside_item][j3])
            if outside_item == 3:# weight
                for j4 in range(1,len(self.basic_stats_info_before[0])):
                    arr_weight.append(self.basic_stats_info_before[outside_item][j4])
        
        for outside_item in range(len(self.basic_stats_info_after)):
            if outside_item == 0:# day
                for j1 in range(1,len(self.basic_stats_info_after[0])):
                    arr_day.append(self.basic_stats_info_after[outside_item][j1])
            if outside_item == 1:# week
                for j2 in range(1,len(self.basic_stats_info_after[0])):
                    arr_week.append(self.basic_stats_info_after[outside_item][j2])
            if outside_item == 2:# month
                for j3 in range(1,len(self.basic_stats_info_after[0])):
                    arr_month.append(self.basic_stats_info_after[outside_item][j3])
            if outside_item == 3:# weight
                for j4 in range(1,len(self.basic_stats_info_after[0])):
                    arr_weight.append(self.basic_stats_info_after[outside_item][j4])
        
        
        #print(json.dumps(self.excel_data, indent=2))
    
    
    def write_excel_reports(self,sheetName,types):
        """
        上市type=0, 上櫃type=1, 上市上櫃type=2
        """
        data = self.excel_data
        wb = load_workbook('20190923_testing.xlsx')

        itemList = ['B','C','D','E','F','G','H','I','J','K','L','M','N','O']
        rowIndex = -1
        columeIndex = -1
        result = True

        # §ó·s¤u§@ªí©úºÙ
        wb_sheet = wb[sheetName]
        print(wb_sheet)

        # 上市type=0, 上櫃type=1, 上市上櫃type=2
        for rowItem in data:

            if rowItem == 'weight':
                rowIndex = 5+types*15
            elif rowItem == 'day':
                rowIndex = 6+types*15
            elif rowItem == 'week':
                rowIndex = 7+types*15
            elif rowItem == 'month':
                rowIndex = 8+types*15
            elif rowItem == 'volume':
                rowIndex = 10+types*15
            else:
                result = False
                break

            for columeIndex in range(0,len(data[rowItem])):
                wb_sheet[str(itemList[columeIndex])+str(rowIndex)] = round(data[rowItem][columeIndex],2)

        wb.save('20190923_testing.xlsx')
    
    
    def day_trade_calculate(self):
        """
        準備OLS的x和y
        """
        self.x = self.df_after['mean_after'][3::4]-self.df_before['mean_before'][3::4]
        self.y_daily = self.df_after['std_after'][0::4]-self.df_before['std_before'][0::4]
        self.y_weekly = self.df_after['std_after'][1::4]-self.df_before['std_before'][1::4]
        self.y_monthly = self.df_after['std_after'][2::4]-self.df_before['std_before'][2::4]
    
    
    def day_trade_OLS(self, date_freq):
        """
        根據傳進的date_freq，計算OLS(日/週/月)
        date_freq's value: 'daily','weekly','monthly'
        """
        if date_freq=='daily':
            self.x = sm.add_constant(self.x)
            self.y_daily = list(self.y_daily)
            model_daily = sm.OLS(self.y_daily, self.x).fit()
            print(model_daily.summary())

        if date_freq=='weekly':
            self.x = sm.add_constant(self.x)
            self.y_weekly = list(self.y_weekly)
            model_weekly = sm.OLS(self.y_weekly, self.x).fit()
            print(model_weekly.summary())

        if date_freq=='monthly':
            self.x = sm.add_constant(self.x)
            self.y_monthly = list(self.y_monthly)
            model_monthly = sm.OLS(self.y_monthly, self.x).fit()
            print(model_monthly.summary())

In [219]:
out_sm = pd.read_csv("成交量_juu/上市20130613-20140630.csv")
out_sm.head()

Unnamed: 0,代碼,Data Field,2014/6/30,2014/6/27,2014/6/26,2014/6/25,2014/6/24,2014/6/23,2014/6/20,2014/6/19,...,2013/6/26,2013/6/25,2013/6/24,2013/6/21,2013/6/20,2013/6/19,2013/6/18,2013/6/17,2013/6/14,2013/6/13
0,1101 台泥,成交量(千股),4902.0,4172.0,7980,6401.0,3667.0,7586.0,8081.0,6808.0,...,0,0,0,0,0,0,0,0,0,0
1,1101 台泥,現股當沖比重,4.73,0.17,0,6.09,0.33,0.82,0.19,0.43,...,0,0,0,0,0,0,0,0,0,0
2,1102 亞泥,成交量(千股),3762.0,4340.0,4816,5046.0,6559.0,5211.0,7144.0,10119.0,...,0,0,0,0,0,0,0,0,0,0
3,1102 亞泥,現股當沖比重,0.58,0.16,0,4.36,0.29,0.27,0.06,0.0,...,0,0,0,0,0,0,0,0,0,0
4,1216 統一,成交量(千股),4497.0,3577.0,4459,9256.0,8142.0,3241.0,4686.0,3916.0,...,0,0,0,0,0,0,0,0,0,0


In [220]:
data = tw_day_trade()
data.read_raw_csv("成交量_juu/上市20130613-20140630.csv")
data.day_trade_split("2013/6/13","2014/1/6","2014/6/30")
data.day_trade_stats(flag=1)

Read Data Successfully
data shape: (280, 261)
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]

[280 rows x 0 columns]
[count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: mean_after, dtype: float64, count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: mean_after, dtype: float64]


<img src="https://i.imgur.com/YZ8f21o.jpg" height="500" width="500">
<img src="https://i.imgur.com/VRp7NCJ.jpg" height="500" width="500">
<img src="https://i.imgur.com/XmJpppE.jpg" height="500" width="500">
<img src="https://i.imgur.com/7H8eg1g.jpg" height="500" width="500">

<img src="https://i.imgur.com/1giFKNu.png" height="800" width="800">