In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import os

In [2]:
house_price_gr = pd.read_csv("house_price_gr.csv",encoding= 'gbk')
house_price_gr.head()

Unnamed: 0,dis_name,rate
0,东城区甘南小区,0.169747
1,东城区察慈小区,0.165484
2,东城区胡家园小区,0.141358
3,东城区台基厂小区,0.063197
4,东城区青年湖小区,0.101528


## 进行点估计

In [3]:
np.mean(house_price_gr.rate)

0.11006099670192315

In [4]:
stats.sem(house_price_gr.rate) #样本均值的标准误

0.0033748324091783266

In [5]:
np.std(house_price_gr.rate)

0.041195079496415814

## 进行区间估计

In [6]:
se = stats.sem(house_price_gr.rate)
LB = house_price_gr.rate.mean()  - 1.98*se
UB = house_price_gr.rate.mean()  + 1.98*se
print (LB,UB)

0.10337882853175007 0.11674316487209624


In [8]:
stats.t.ppf(1-0.05/2, 1000)

1.9623390808264074

In [9]:
#==============================================================================
# #上证指数的收益率
#==============================================================================
#读取数据
SHindex = pd.read_csv("TRD_Index.csv")
mu = SHindex.Retindex.mean()
sigma =  SHindex.Retindex.std()
#计算区间
stats.t.interval(0.95,len(SHindex)-1,mu,stats.sem(SHindex.Retindex))

(-0.0006112437012950778, 0.0007318872656515137)

In [10]:
#%%
#==============================================================================
# #3.4 方差分析
#==============================================================================
#单因素方差分析
# 研究不同行业股票收益率水平
from statsmodels.stats.anova import anova_lm
from statsmodels.formula.api import ols
year_return = pd.read_csv('TRD_Year.csv',encoding= 'gbk')
year_return.head()

Unnamed: 0,Code,Year,Return,Industry
0,1,2014,0.57298,货币金融服务
1,2,2014,0.827567,房地产业
2,4,2014,0.336481,医药制造业
3,5,2014,0.64,房地产业
4,6,2014,0.477997,房地产业


In [11]:
model = ols('Return ~Industry',data =year_return.dropna()).fit()
print(anova_lm(model))

              df      sum_sq   mean_sq         F        PR(>F)
Industry    74.0   60.517228  0.817800  4.177614  4.382045e-28
Residual  2302.0  450.634318  0.195758       NaN           NaN


可以看到PR(P值远小于0.05，P值越小，影响越显著！)，所以结论是行业对于收益率是有影响的

In [13]:
#%%
# 多因素方差分析

creditcard_exp  = pd.read_csv('creditcard_exp.csv',skipinitialspace = True)
creditcard_exp.head()

Unnamed: 0,id,Acc,avg_exp,avg_exp_ln,gender,Age,Income,Ownrent,Selfempl,dist_home_val,dist_avg_income,age2,high_avg,edu_class
0,19,1,1217.03,7.104169,1,40,16.03515,1,1,99.93,15.932789,1600,0.102361,3
1,5,1,1251.5,7.132098,1,32,15.8475,1,0,49.88,15.796316,1024,0.051184,2
2,95,0,,,1,36,8.4,0,0,88.61,7.49,1296,0.91,1
3,86,1,856.57,6.752936,1,41,11.47285,1,0,16.1,11.275632,1681,0.197218,3
4,50,1,1321.83,7.186772,1,28,13.40915,1,0,100.39,13.346474,784,0.062676,2


In [14]:
ana = ols('avg_exp ~ C(edu_class) + C (gender)', data = creditcard_exp).fit()
anova_lm(ana)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(edu_class),3.0,8126056.0,2708685.0,31.578365,1.031496e-12
C(gender),1.0,41782.73,41782.73,0.487111,0.4877082
Residual,65.0,5575481.0,85776.62,,


In [None]:
可以看出，edu_class的PR远小于0.05，说明教育水平对收入的影响很显著！
gender的PR值为0.487，远大于0.05，说明性别对于收入的影响不显著！

In [15]:
ana.summary()

0,1,2,3
Dep. Variable:,avg_exp,R-squared:,0.594
Model:,OLS,Adj. R-squared:,0.569
Method:,Least Squares,F-statistic:,23.81
Date:,"Tue, 23 Jul 2019",Prob (F-statistic):,3.75e-12
Time:,12:17:08,Log-Likelihood:,-494.31
No. Observations:,70,AIC:,998.6
Df Residuals:,65,BIC:,1010.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,207.3700,207.095,1.001,0.320,-206.227,620.967
C(edu_class)[T.1],439.5956,216.032,2.035,0.046,8.151,871.040
C(edu_class)[T.2],786.0626,217.826,3.609,0.001,351.035,1221.091
C(edu_class)[T.3],1241.1927,219.557,5.653,0.000,802.707,1679.678
C(gender)[T.1],-57.8200,82.845,-0.698,0.488,-223.272,107.632

0,1,2,3
Omnibus:,15.243,Durbin-Watson:,2.303
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21.012
Skew:,0.875,Prob(JB):,2.74e-05
Kurtosis:,5.035,Cond. No.,14.2


In [16]:
#添加交互项

ana1 = ols('avg_exp ~ C(edu_class) + C (gender) + C(edu_class)*C(gender)', data = creditcard_exp).fit()
anova_lm(ana1)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(edu_class),3.0,8126056.0,2708685.0,33.83935,3.753889e-13
C(gender),1.0,41782.73,41782.73,0.521988,0.4726685
C(edu_class):C(gender),3.0,535579.9,178526.6,2.230316,0.09333507
Residual,63.0,5042862.0,80045.44,,


In [17]:
ana1.summary()

0,1,2,3
Dep. Variable:,avg_exp,R-squared:,0.633
Model:,OLS,Adj. R-squared:,0.598
Method:,Least Squares,F-statistic:,18.12
Date:,"Tue, 23 Jul 2019",Prob (F-statistic):,4.35e-12
Time:,12:18:12,Log-Likelihood:,-490.8
No. Observations:,70,AIC:,995.6
Df Residuals:,63,BIC:,1011.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,207.3700,200.057,1.037,0.304,-192.412,607.152
C(edu_class)[T.1],417.8090,209.367,1.996,0.050,-0.577,836.195
C(edu_class)[T.2],732.2613,212.977,3.438,0.001,306.661,1157.861
C(edu_class)[T.3],1346.5708,216.086,6.232,0.000,914.757,1778.384
C(gender)[T.1],-0.0168,67.939,-0.000,1.000,-135.782,135.749
C(edu_class)[T.1]:C(gender)[T.1],192.7428,162.889,1.183,0.241,-132.765,518.251
C(edu_class)[T.2]:C(gender)[T.1],96.8755,110.846,0.874,0.385,-124.632,318.383
C(edu_class)[T.3]:C(gender)[T.1],-289.6350,109.331,-2.649,0.010,-508.115,-71.155

0,1,2,3
Omnibus:,35.697,Durbin-Watson:,2.331
Prob(Omnibus):,0.0,Jarque-Bera (JB):,98.497
Skew:,1.59,Prob(JB):,4.0900000000000002e-22
Kurtosis:,7.864,Cond. No.,2.12e+16
