In [1]:
import pandas as pd
import sys
sys.path.append('../src/data/')
import data_wrangling as dw

from sklearn import preprocessing
from statsmodels.formula.api import ols
from scipy.stats import chi2_contingency

## Seperate data into discrete and continuous features

In [2]:
#create data set for statistical analysis
df =  dw.clean_data(pd.read_excel("../data/raw/pd_rbd.xls", header=1))

In [3]:
#create a dataframe of discrete variables
discrete = df[['status', 'gender']]

In [4]:
#normalize continuous variables
X = df.iloc[:,-24:]
normalized_X = preprocessing.normalize(X)

In [5]:
#create a dataframe of continuous variables
continuous = df[['status', 'age']]
continuous[df.iloc[:,-24:].columns] = pd.DataFrame(normalized_X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [6]:
cont_feat = continuous.drop(['status'], axis=1).columns

## ANOVA

In [7]:
#Find p-value for continuous values using ANOVA
p_sig = []
for feature in cont_feat:
    model = ols('status ~ '+feature, continuous).fit()
    print(feature + f" p = {model.f_pvalue: .4f}")
    if model.f_pvalue < .05:
        p_sig.append(feature)
p_sig
    

age p =  0.4495
pass_est p =  0.5200
pass_rst p =  0.0394
pass_ast p =  0.1575
pass_dpi p =  0.0073
pass_dvi p =  0.1952
pass_gvi p =  0.3964
pass_dus p =  0.2233
pass_duf p =  0.1533
pass_rlr p =  0.0385
pass_pir p =  0.1253
pass_rsr p =  0.7190
pass_lre p =  0.0175
mono_est p =  0.5407
mono_rst p =  0.0019
mono_ast p =  0.3507
mono_dpi p =  0.0018
mono_dvi p =  0.0226
mono_gvi p =  0.3343
mono_dus p =  0.0002
mono_duf p =  0.2152
mono_rlr p =  0.6164
mono_pir p =  0.0139
mono_rsr p =  0.0434
mono_lre p =  0.3767


['pass_rst',
 'pass_dpi',
 'pass_rlr',
 'pass_lre',
 'mono_rst',
 'mono_dpi',
 'mono_dvi',
 'mono_dus',
 'mono_pir',
 'mono_rsr']

In [8]:
p_sig_input = "+".join(p_sig)

In [9]:
#create model with all significant continuous features
model = ols('status ~ '+p_sig_input , continuous).fit()

In [10]:
model.summary()

0,1,2,3
Dep. Variable:,status,R-squared:,0.267
Model:,OLS,Adj. R-squared:,0.206
Method:,Least Squares,F-statistic:,4.338
Date:,"Thu, 07 May 2020",Prob (F-statistic):,3.52e-05
Time:,16:09:38,Log-Likelihood:,-147.21
No. Observations:,130,AIC:,316.4
Df Residuals:,119,BIC:,348.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.6787,2.372,-0.286,0.775,-5.376,4.018
pass_rst,1.1936,1.752,0.681,0.497,-2.277,4.664
pass_dpi,4.7893,2.050,2.336,0.021,0.730,8.849
pass_rlr,30.6431,16.351,1.874,0.063,-1.734,63.020
pass_lre,-2.2550,1.154,-1.954,0.053,-4.541,0.031
mono_rst,-2.7202,2.805,-0.970,0.334,-8.273,2.833
mono_dpi,0.6142,2.259,0.272,0.786,-3.859,5.087
mono_dvi,0.0446,2.186,0.020,0.984,-4.284,4.373
mono_dus,23.3568,8.930,2.616,0.010,5.675,41.039

0,1,2,3
Omnibus:,10.997,Durbin-Watson:,0.589
Prob(Omnibus):,0.004,Jarque-Bera (JB):,4.875
Skew:,0.218,Prob(JB):,0.0874
Kurtosis:,2.157,Cond. No.,941.0


### Chi-Square Test

In [11]:
#create cross tabulation of gender and status
chi_gender = pd.crosstab(discrete.status, discrete.gender)

In [12]:
#calculate chi-square
chi2, p_value, dof, ex = chi2_contingency(chi_gender)
print(f"chi2: {chi2:.3f} \np-value: {p_value: .3f} \nDOF: {dof} \nExpected: {ex}")

chi2: 2.019 
p-value:  0.364 
DOF: 2 
Expected: [[10.38461538 39.61538462]
 [ 6.23076923 23.76923077]
 [10.38461538 39.61538462]]


In [13]:
df1 = df[['status','age', 'gender']]
df2 = continuous[p_sig]
model_data = pd.concat([df1, df2], axis=1, join='inner')

In [15]:
#save data for model
#model_data.to_csv("../data/processed/model_data.csv",index=False)