# Import libraries

In [174]:
import pandas
import numpy
import plotly.express
import scipy
import statsmodels.api
import statsmodels.formula.api
import patsy.contrasts

# Generate data
## `group` variable with 4 levels

In [150]:
group = numpy.repeat(["A", "B", "C", "D"], repeats = 25)

## Save parameters

In [151]:
mean = (3, 4, 4, 3)
sigma = (1, 1, 1, 1)
n = (25, 25, 25, 25)

## Random normal variable `x` whose means depend on levels of `group` variable

In [152]:
# Empty list for appending results
x = []

# Loop through equal-length mean, sigma, and n (group size)
for (i, j, k) in zip(mean, sigma, n):
    x.append(numpy.random.normal(loc = i, scale = j, size = k))

# Concatenate transforms the result into 1 array
x = numpy.concatenate(x)

## Variable `y` correlated r = 0.75 with x

In [153]:
y = x * 0.75 * numpy.random.normal(loc = 0, scale = 1, size = sum(n))

## Store variables in a data frame

In [154]:
data1 = pandas.DataFrame({"y": y, "x": x, "group": group})

## Create contrast variables for use in linear regression on `group` variable

In [155]:
# Helmert contrasts
group_helmert = patsy.contrasts.Helmert().code_without_intercept(list(set(group)))

# 2 main effects and 1 interaction contrast
group_factorial = patsy.contrasts.ContrastMatrix([[-1, -1, 1], [-1, 1, -1], [1, -1, -1], [1, 1, 1]], 
                                                 ["Main Effect 1", "Main Effect 2", "Interaction"])

## Add contrast variables to data frame

In [156]:
# Main Effect 1 function
def main1(x):
    if x == "A": return -1
    elif x == "B": return -1
    elif x == "C": return 1
    elif x == "D": return 1

# Main Effect 2 function
def main2(x):
    if x == "A": return -1
    elif x == "B": return 1
    elif x == "C": return -1
    elif x == "D": return 1

# Main Effect 1 function
def interaction(x):
    if x == "A": return 1
    elif x == "B": return -1
    elif x == "C": return -1
    elif x == "D": return 1

# Add recoded variables to data frame
data1["main1"] = data1["group"].apply(main1)
data1["main2"] = data1["group"].apply(main2)
data1["interaction"] = data1["group"].apply(interaction)

# Plots
## Boxplots

In [157]:
# boxplots of y by group
plotly.express.box(data1, x = "group", y = "y")

In [158]:
# boxplots of x by group
plotly.express.box(data1, x = "group", y = "x")

## Histograms

In [159]:
# y histograms by group
plotly.express.histogram(data1, x = "y", facet_col = "group")

In [160]:
# x histograms by group
plotly.express.histogram(data1, x = "x", facet_col = "group")

## Scatterplot

In [161]:
plotly.express.scatter(data1, x = "x", y = "y", color = "group", trendline = "ols", facet_col = "group")

# Descriptive Statistics

In [162]:
data1.groupby("group")[["x", "y"]].describe().round(2)

Unnamed: 0_level_0,x,x,x,x,x,x,x,x,y,y,y,y,y,y,y,y
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
A,25.0,3.4,1.03,2.12,2.56,3.23,3.82,5.47,25.0,0.33,3.16,-5.89,-1.2,0.57,1.69,10.04
B,25.0,3.97,1.16,2.11,2.9,4.28,4.73,6.25,25.0,0.1,2.81,-5.39,-0.94,-0.28,1.05,9.19
C,25.0,3.73,0.77,2.21,3.16,3.75,4.06,5.59,25.0,-1.2,2.68,-6.92,-2.96,-0.81,0.52,5.3
D,25.0,2.97,0.98,1.33,2.41,2.86,3.54,5.45,25.0,0.04,2.29,-6.07,-0.92,0.08,1.39,4.49


# Correlation

In [210]:
# Save r and p-value
pearsonr1 = scipy.stats.pearsonr(x, y)

# compute t-statistic and degress of freedom
t = scipy.stats.t.ppf(pearsonr1[1], df = len(x) - 2)

nan

# Analyses
## Regression
### Fit linear regression

In [163]:
# regress y on numeric/continuous x
ols_fit1 = statsmodels.formula.api.ols("y ~ x", data = data1).fit()

### Results Summary

In [164]:
ols_fit1.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.004
Model:,OLS,Adj. R-squared:,-0.006
Method:,Least Squares,F-statistic:,0.4006
Date:,"Fri, 10 Jan 2020",Prob (F-statistic):,0.528
Time:,22:21:52,Log-Likelihood:,-243.17
No. Observations:,100,AIC:,490.3
Df Residuals:,98,BIC:,495.5
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.7738,0.973,-0.795,0.428,-2.705,1.157
x,0.1678,0.265,0.633,0.528,-0.358,0.694

0,1,2,3
Omnibus:,8.935,Durbin-Watson:,2.327
Prob(Omnibus):,0.011,Jarque-Bera (JB):,14.8
Skew:,0.319,Prob(JB):,0.000611
Kurtosis:,4.773,Cond. No.,13.7


## Fit linear regression

In [165]:
# helmert contrasts on group
ols_fit2 = statsmodels.formula.api.ols("y ~ C(group, group_helmert)", data = data1).fit()

### Results Summary

In [166]:
ols_fit2.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.046
Model:,OLS,Adj. R-squared:,0.017
Method:,Least Squares,F-statistic:,1.559
Date:,"Fri, 10 Jan 2020",Prob (F-statistic):,0.204
Time:,22:21:52,Log-Likelihood:,-240.99
No. Observations:,100,AIC:,490.0
Df Residuals:,96,BIC:,500.4
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.1837,0.275,-0.668,0.506,-0.729,0.362
"C(group, group_helmert)[H.A]",-0.1140,0.389,-0.293,0.770,-0.886,0.658
"C(group, group_helmert)[H.C]",-0.4697,0.224,-2.092,0.039,-0.915,-0.024
"C(group, group_helmert)[H.B]",0.0734,0.159,0.463,0.645,-0.242,0.389

0,1,2,3
Omnibus:,12.304,Durbin-Watson:,2.458
Prob(Omnibus):,0.002,Jarque-Bera (JB):,21.568
Skew:,0.477,Prob(JB):,2.07e-05
Kurtosis:,5.065,Cond. No.,2.45


In [167]:
# factorial contrasts (2 main effects and 1 interaction)
ols_fit3 = statsmodels.formula.api.ols("y ~ C(group, group_factorial)", data = data1).fit()

## Results Summary

In [168]:
ols_fit3.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.046
Model:,OLS,Adj. R-squared:,0.017
Method:,Least Squares,F-statistic:,1.559
Date:,"Fri, 10 Jan 2020",Prob (F-statistic):,0.204
Time:,22:21:52,Log-Likelihood:,-240.99
No. Observations:,100,AIC:,490.0
Df Residuals:,96,BIC:,500.4
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.1837,0.275,-0.668,0.506,-0.729,0.362
"C(group, group_factorial)Main Effect 1",-0.3962,0.275,-1.441,0.153,-0.942,0.150
"C(group, group_factorial)Main Effect 2",0.2513,0.275,0.914,0.363,-0.294,0.797
"C(group, group_factorial)Interaction",0.3653,0.275,1.329,0.187,-0.180,0.911

0,1,2,3
Omnibus:,12.304,Durbin-Watson:,2.458
Prob(Omnibus):,0.002,Jarque-Bera (JB):,21.568
Skew:,0.477,Prob(JB):,2.07e-05
Kurtosis:,5.065,Cond. No.,1.0


## Fit linear regression

In [169]:
ols_fit4 = statsmodels.formula.api.ols("y ~ main1 + main2 + interaction", data = data1).fit()

## Results Summary

In [170]:
ols_fit4.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.046
Model:,OLS,Adj. R-squared:,0.017
Method:,Least Squares,F-statistic:,1.559
Date:,"Fri, 10 Jan 2020",Prob (F-statistic):,0.204
Time:,22:21:53,Log-Likelihood:,-240.99
No. Observations:,100,AIC:,490.0
Df Residuals:,96,BIC:,500.4
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.1837,0.275,-0.668,0.506,-0.729,0.362
main1,-0.3962,0.275,-1.441,0.153,-0.942,0.150
main2,0.2513,0.275,0.914,0.363,-0.294,0.797
interaction,0.3653,0.275,1.329,0.187,-0.180,0.911

0,1,2,3
Omnibus:,12.304,Durbin-Watson:,2.458
Prob(Omnibus):,0.002,Jarque-Bera (JB):,21.568
Skew:,0.477,Prob(JB):,2.07e-05
Kurtosis:,5.065,Cond. No.,1.0


(3, 2, 1)