# Learning Python as an R user

In [163]:
# Introduction

In [164]:
# what are your goals for this post? Are you going to explain every detail?

# Import libraries

In [165]:
# did you install these libraries somewhere? what does import do? what do the dots "." mean?

In [166]:
import pandas
import math
import numpy
import plotly.express
import plotly.offline
import scipy
import statsmodels.api
import statsmodels.formula.api
import patsy.contrasts

## Offline Plotting Settings

In [167]:
# what do this code mean? why do you have to start with "plotly"?

In [168]:
plotly.offline.init_notebook_mode(connected = True)

# Generate data
## `group` variable with 4 levels

In [169]:
# similar to the `base::rep` function in R. [] make lists. what are lists?

In [170]:
group = numpy.repeat(["A", "B", "C", "D"], repeats = 25)

## Save parameters

In [171]:
# () make tuples. what are tuples?

In [172]:
mean = (3, 4, 4, 3)
sigma = (1, 1, 1, 1)
n = (25, 25, 25, 25)

## Random normal variable `x` whose means depend on levels of `group` variable

In [173]:
# what are list comprehensions? why do you have to concatenate it?

In [174]:
# Loop through equal-length mean, sigma, and n (group size)
x = numpy.concatenate(
    [numpy.random.normal(loc = i, scale = j, size = k) for (i, j, k) in zip(mean, sigma, n)]
)

## Variable `y` correlated r = 0.75 with x

In [175]:
y = x * 0.75 + numpy.random.normal(loc = 0, scale = 1, size = sum(n))

## Store variables in a data frame

In [176]:
# why the colons? why the squigly brackets?

In [177]:
data1 = pandas.DataFrame({"y": y, "x": x, "group": group})

## Create contrast variables for use in linear regression on `group` variable

In [178]:
# Helmert contrasts
group_helmert = patsy.contrasts.Helmert().code_without_intercept(list(set(group)))

# 2 main effects and 1 interaction contrast
group_factorial = patsy.contrasts.ContrastMatrix([[-1, -1, 1], [-1, 1, -1], [1, -1, -1], [1, 1, 1]], 
                                                 ["Main Effect 1", "Main Effect 2", "Interaction"])

## Add contrast variables to data frame

In [179]:
data1.loc[data1["group"] == "A", "main1"] = -1
data1.loc[data1["group"] == "B", "main1"] = -1
data1.loc[data1["group"] == "C", "main1"] = 1
data1.loc[data1["group"] == "D", "main1"] = 1

data1.loc[data1["group"] == "A", "main2"] = -1
data1.loc[data1["group"] == "B", "main2"] = 1
data1.loc[data1["group"] == "C", "main2"] = -1
data1.loc[data1["group"] == "D", "main2"] = 1

data1.loc[data1["group"] == "A", "interaction"] = 1
data1.loc[data1["group"] == "B", "interaction"] = -1
data1.loc[data1["group"] == "C", "interaction"] = -1
data1.loc[data1["group"] == "D", "interaction"] = 1

# Plots
## Boxplots

In [180]:
# boxplots of y by group
plotly.offline.iplot(
    plotly.express.box(data1, x = "group", y = "y")
)

In [181]:
# boxplots of x by group
plotly.offline.iplot(
    plotly.express.box(data1, x = "group", y = "x")
)

## Histograms

In [182]:
# y histograms by group
plotly.offline.iplot(
    plotly.express.histogram(data1, x = "y", facet_col = "group")
)

In [183]:
# x histograms by group
plotly.offline.iplot(
    plotly.express.histogram(data1, x = "x", facet_col = "group")
)

## Scatterplot

In [184]:
plotly.offline.iplot(
    plotly.express.scatter(data1, x = "x", y = "y", color = "group", trendline = "ols", facet_col = "group")
)

## Bars of `group` means with 95% confidence intervals

In [185]:
group_desc = data1.groupby("group")["y"].agg(["mean", "sem", "count"]).reset_index()
group_desc["df"] = group_desc["count"] - 1
group_desc["lower"] = group_desc["mean"] - scipy.stats.t.ppf(1 - 0.05 / 2, df = group_desc["df"]) * group_desc["sem"]
group_desc["upper"] = group_desc["mean"] + scipy.stats.t.ppf(1 - 0.05 / 2, df = group_desc["df"]) * group_desc["sem"]

# Plot
plotly.offline.iplot(
    plotly.express.bar(group_desc, x = "group", y = "mean", error_y_minus = "lower", error_y = "upper", color = "group")
)

# Descriptive Statistics

In [186]:
data1.groupby("group")[["x", "y"]].describe().round(2)

Unnamed: 0_level_0,x,x,x,x,x,x,x,x,y,y,y,y,y,y,y,y
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
A,25.0,3.05,0.89,0.83,2.65,3.17,3.44,5.01,25.0,2.33,1.21,0.17,1.74,2.61,3.28,4.14
B,25.0,3.76,1.16,0.55,3.1,3.73,4.37,6.45,25.0,3.1,1.47,0.8,2.15,2.79,4.17,7.0
C,25.0,4.23,0.93,2.92,3.58,4.23,4.8,6.11,25.0,3.3,1.08,-0.02,3.02,3.48,3.98,4.93
D,25.0,3.13,0.98,1.13,2.48,3.12,3.42,4.8,25.0,2.1,1.3,-0.12,1.26,2.37,2.99,4.83


# Correlation

In [187]:
# Save r and p-value
r1, pvalue1 = scipy.stats.pearsonr(x, y)

# Save degrees of freedom
ddf1 = len(x) - 2

# Compute t-statistic and degress of freedom, isf for upper tail of t distribution
t1 = scipy.stats.t.isf(pvalue1 / 2, df = ddf1)

# Compute standard error
se1 = r1 / t1

# Compute lower and upper confidence intervals
lower1, upper1 = (numpy.tanh(numpy.arctanh(r1) - 1 / math.sqrt(len(x) - 3) * scipy.stats.norm.ppf(1 - 0.05 / 2)),
                  numpy.tanh(numpy.arctanh(r1) + 1 / math.sqrt(len(x) - 3) * scipy.stats.norm.ppf(1 - 0.05 / 2)))

# Print most basic results
"r = {0}, 95%CI [{1}, {2}], t({3}) = {4}, p = {5}".format(r1.round(2), lower1.round(2), upper1.round(2), ddf1, t1.round(2), pvalue1.round(3))

'r = 0.73, 95%CI [0.62, 0.81], t(98) = 10.49, p = 0.0'

# Analyses
## Regression
### Fit linear regression

In [188]:
# regress y on numeric/continuous x
ols_fit1 = statsmodels.formula.api.ols("y ~ x", data = data1).fit()

### Results Summary

In [189]:
ols_fit1.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.529
Model:,OLS,Adj. R-squared:,0.524
Method:,Least Squares,F-statistic:,110.0
Date:,"Mon, 13 Jan 2020",Prob (F-statistic):,1.06e-17
Time:,21:29:10,Log-Likelihood:,-133.91
No. Observations:,100,AIC:,271.8
Df Residuals:,98,BIC:,277.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.4764,0.318,-1.500,0.137,-1.107,0.154
x,0.8996,0.086,10.488,0.000,0.729,1.070

0,1,2,3
Omnibus:,0.46,Durbin-Watson:,1.906
Prob(Omnibus):,0.795,Jarque-Bera (JB):,0.575
Skew:,-0.148,Prob(JB):,0.75
Kurtosis:,2.775,Cond. No.,13.5


## Fit linear regression

In [190]:
# helmert contrasts on group
ols_fit2 = statsmodels.formula.api.ols("y ~ C(group, group_helmert)", data = data1).fit()

### Results Summary

In [191]:
ols_fit2.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.139
Model:,OLS,Adj. R-squared:,0.112
Method:,Least Squares,F-statistic:,5.174
Date:,"Mon, 13 Jan 2020",Prob (F-statistic):,0.00234
Time:,21:29:10,Log-Likelihood:,-164.05
No. Observations:,100,AIC:,336.1
Df Residuals:,96,BIC:,346.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.7084,0.127,21.264,0.000,2.456,2.961
"C(group, group_helmert)[H.B]",0.3861,0.180,2.143,0.035,0.029,0.744
"C(group, group_helmert)[H.A]",0.1929,0.104,1.855,0.067,-0.013,0.399
"C(group, group_helmert)[H.C]",-0.2012,0.074,-2.736,0.007,-0.347,-0.055

0,1,2,3
Omnibus:,0.832,Durbin-Watson:,1.821
Prob(Omnibus):,0.66,Jarque-Bera (JB):,0.38
Skew:,-0.045,Prob(JB):,0.827
Kurtosis:,3.288,Cond. No.,2.45


In [192]:
# factorial contrasts (2 main effects and 1 interaction)
ols_fit3 = statsmodels.formula.api.ols("y ~ C(group, group_factorial)", data = data1).fit()

## Results Summary

In [193]:
ols_fit3.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.139
Model:,OLS,Adj. R-squared:,0.112
Method:,Least Squares,F-statistic:,5.174
Date:,"Mon, 13 Jan 2020",Prob (F-statistic):,0.00234
Time:,21:29:10,Log-Likelihood:,-164.05
No. Observations:,100,AIC:,336.1
Df Residuals:,96,BIC:,346.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.7084,0.127,21.264,0.000,2.456,2.961
"C(group, group_factorial)Main Effect 1",-0.0083,0.127,-0.065,0.948,-0.261,0.245
"C(group, group_factorial)Main Effect 2",-0.1046,0.127,-0.822,0.413,-0.357,0.148
"C(group, group_factorial)Interaction",-0.4907,0.127,-3.853,0.000,-0.744,-0.238

0,1,2,3
Omnibus:,0.832,Durbin-Watson:,1.821
Prob(Omnibus):,0.66,Jarque-Bera (JB):,0.38
Skew:,-0.045,Prob(JB):,0.827
Kurtosis:,3.288,Cond. No.,1.0


## Fit linear regression

In [194]:
ols_fit4 = statsmodels.formula.api.ols("y ~ main1 + main2 + interaction", data = data1).fit()

## Results Summary

In [195]:
ols_fit4.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.139
Model:,OLS,Adj. R-squared:,0.112
Method:,Least Squares,F-statistic:,5.174
Date:,"Mon, 13 Jan 2020",Prob (F-statistic):,0.00234
Time:,21:29:10,Log-Likelihood:,-164.05
No. Observations:,100,AIC:,336.1
Df Residuals:,96,BIC:,346.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.7084,0.127,21.264,0.000,2.456,2.961
main1,-0.0083,0.127,-0.065,0.948,-0.261,0.245
main2,-0.1046,0.127,-0.822,0.413,-0.357,0.148
interaction,-0.4907,0.127,-3.853,0.000,-0.744,-0.238

0,1,2,3
Omnibus:,0.832,Durbin-Watson:,1.821
Prob(Omnibus):,0.66,Jarque-Bera (JB):,0.38
Skew:,-0.045,Prob(JB):,0.827
Kurtosis:,3.288,Cond. No.,1.0
