In [37]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
import plotly.express as px
import statsmodels.api as sm
import math
from patsy import dmatrices

# Data import and transformation

In [27]:
# Read the tab-separated measurements, forcing every column to float64.
df = pd.read_csv(
    "water_uptake_data.txt",
    delimiter="\t",
    dtype="float64",
)
# Quick sanity check: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   timepoint  32 non-null     float64
 1   mean_fw    32 non-null     float64
 2   sd_fw      32 non-null     float64
dtypes: float64(3)
memory usage: 896.0 bytes


In [50]:
# Add a square-root-transformed copy of `mean_fw` as a new column;
# `assign` returns a new frame, so re-running this cell gives the same result.
df = df.assign(sqrt_mean_fw=lambda frame: np.sqrt(frame["mean_fw"]))
# Confirm the new column is present and fully populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   timepoint     32 non-null     float64
 1   mean_fw       32 non-null     float64
 2   sd_fw         32 non-null     float64
 3   sqrt_mean_fw  32 non-null     float64
dtypes: float64(4)
memory usage: 1.1 KB


# Plot

In [28]:
# Scatter plot of the untransformed `mean_fw` values against `timepoint`.
fig = px.scatter(
    data_frame=df,
    x="timepoint",
    y="mean_fw",
    title="Water uptake",
)
fig.show()

# Model

## Create endogenous and exogenous matrices
Explained in detail here: https://www.statsmodels.org/stable/gettingstarted.html#design-matrices-endog-exog

In [51]:
# Build the design matrices from a patsy formula:
# `y` holds the response (sqrt_mean_fw), `X` the predictors
# (an automatically added Intercept column plus timepoint).
y, X = dmatrices(
    formula_like="sqrt_mean_fw ~ timepoint",
    data=df,
    return_type="dataframe",
)

In [52]:
# Peek at the first three rows of the response matrix.
y.head(3)

Unnamed: 0,sqrt_mean_fw
0,0.0
1,2.949576
2,3.464102


In [53]:
# Peek at the first five rows of the predictor (design) matrix.
X.head(5)

Unnamed: 0,Intercept,timepoint
0,1.0,0.0
1,1.0,2.0
2,1.0,4.0
3,1.0,6.0
4,1.0,8.0


## Fit linear model
A complete explanation: 
https://connor-johnson.com/2014/02/18/linear-regression-with-python/

In [54]:
# Ordinary least squares fit of y on X.
# missing="raise" makes statsmodels raise an error if the data contain NaNs.
lm_res = sm.OLS(y, X, missing="raise").fit()

Intercept    3.285658
timepoint    0.048129
dtype: float64


In [55]:
# Full regression report: fit statistics, coefficient table and residual diagnostics.
lm_res.summary()

0,1,2,3
Dep. Variable:,sqrt_mean_fw,R-squared:,0.666
Model:,OLS,Adj. R-squared:,0.655
Method:,Least Squares,F-statistic:,59.73
Date:,"Tue, 26 Oct 2021",Prob (F-statistic):,1.27e-08
Time:,10:56:47,Log-Likelihood:,-31.632
No. Observations:,32,AIC:,67.26
Df Residuals:,30,BIC:,70.2
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.2857,0.228,14.392,0.000,2.819,3.752
timepoint,0.0481,0.006,7.728,0.000,0.035,0.061

0,1,2,3
Omnibus:,59.71,Durbin-Watson:,0.63
Prob(Omnibus):,0.0,Jarque-Bera (JB):,489.67
Skew:,-3.953,Prob(JB):,4.6700000000000004e-107
Kurtosis:,20.457,Cond. No.,70.5


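R-squared: $R^2 = 1 - \mathrm{SSE}/\mathrm{SST}$
with SSE: sum of squared errors (squared differences between predictions and observations)
with SST: total sum of squares (squared differences between the observations and their mean)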

Null hypothesis: the data can be modelled accurately with the coefficients set to 0

In [56]:
# Fitted coefficients of the model (Intercept and the timepoint slope).
lm_res.params

Intercept    3.285658
timepoint    0.048129
dtype: float64