In [9]:
import pandas
import statsmodels.api as sm
import statsmodels.formula.api as smf

data = pandas.read_csv("Concrete_Data_Yeh.csv")
data.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


Složky jsou blíže popsány [zde](https://www.ebeton.cz/pojmy/cement-a-jeho-slozky/).

- Cement (cement), kg na m3
- Blast Furnace Slag (cgranulovaná vysokopecní struska), kg na m3
- Fly Ash (popílek), kg na m3
- Water (voda), kg na m3
- Superplasticizer (superplastifikátor), kg na m3, bližší info např. [zde](https://www.chemieprostavbu.cz/sika--superplastifikator-1l/)
- Coarse Aggregate (hrubé kamenivo), kg na m3
- Fine Aggregate (jemné kamenivo), kg na m3
- Age (stáří) ve dnech
- Concrete compressive strength -- quantitative -- MPa -- Output Variable

In [2]:
formula = "csMPa ~ cement + slag  + flyash + water + superplasticizer + coarseaggregate + fineaggregate + age"
mod = smf.ols(formula=formula, data=data)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,csMPa,R-squared:,0.616
Model:,OLS,Adj. R-squared:,0.613
Method:,Least Squares,F-statistic:,204.3
Date:,"Fri, 19 May 2023",Prob (F-statistic):,6.29e-206
Time:,21:59:25,Log-Likelihood:,-3869.0
No. Observations:,1030,AIC:,7756.0
Df Residuals:,1021,BIC:,7800.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-23.3312,26.586,-0.878,0.380,-75.500,28.837
cement,0.1198,0.008,14.113,0.000,0.103,0.136
slag,0.1039,0.010,10.247,0.000,0.084,0.124
flyash,0.0879,0.013,6.988,0.000,0.063,0.113
water,-0.1499,0.040,-3.731,0.000,-0.229,-0.071
superplasticizer,0.2922,0.093,3.128,0.002,0.109,0.476
coarseaggregate,0.0181,0.009,1.926,0.054,-0.000,0.037
fineaggregate,0.0202,0.011,1.887,0.059,-0.001,0.041
age,0.1142,0.005,21.046,0.000,0.104,0.125

0,1,2,3
Omnibus:,5.378,Durbin-Watson:,1.282
Prob(Omnibus):,0.068,Jarque-Bera (JB):,5.304
Skew:,-0.174,Prob(JB):,0.0705
Kurtosis:,3.045,Cond. No.,106000.0


In [3]:
data = pandas.read_csv("expenses.csv")
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [4]:
formula = "charges ~ age + bmi"
mod = smf.ols(formula=formula, data=data)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,charges,R-squared:,0.117
Model:,OLS,Adj. R-squared:,0.116
Method:,Least Squares,F-statistic:,88.6
Date:,"Fri, 19 May 2023",Prob (F-statistic):,7.390000000000001e-37
Time:,21:59:26,Log-Likelihood:,-14394.0
No. Observations:,1338,AIC:,28790.0
Df Residuals:,1335,BIC:,28810.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-6424.8046,1744.091,-3.684,0.000,-9846.262,-3003.347
age,241.9308,22.298,10.850,0.000,198.187,285.674
bmi,332.9651,51.374,6.481,0.000,232.182,433.748

0,1,2,3
Omnibus:,321.874,Durbin-Watson:,2.01
Prob(Omnibus):,0.0,Jarque-Bera (JB):,592.574
Skew:,1.511,Prob(JB):,2.11e-129
Kurtosis:,4.223,Cond. No.,287.0


In [5]:
def smoker(row):
    if row["smoker"] == "yes":
        return 1
    else:
        return 0
def sex(row):
    if row["sex"] == "male":
        return 1
    else:
        return 0
data["smoker_number"] = data.apply(smoker, axis=1)
data["sex"] = data.apply(sex, axis=1)

formula = "charges ~  age + bmi + smoker_number"
mod = smf.ols(formula=formula, data=data)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,charges,R-squared:,0.747
Model:,OLS,Adj. R-squared:,0.747
Method:,Least Squares,F-statistic:,1316.0
Date:,"Fri, 19 May 2023",Prob (F-statistic):,0.0
Time:,21:59:26,Log-Likelihood:,-13557.0
No. Observations:,1338,AIC:,27120.0
Df Residuals:,1334,BIC:,27140.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.168e+04,937.569,-12.454,0.000,-1.35e+04,-9837.561
age,259.5475,11.934,21.748,0.000,236.136,282.959
bmi,322.6151,27.487,11.737,0.000,268.692,376.538
smoker_number,2.382e+04,412.867,57.703,0.000,2.3e+04,2.46e+04

0,1,2,3
Omnibus:,299.709,Durbin-Watson:,2.077
Prob(Omnibus):,0.0,Jarque-Bera (JB):,710.137
Skew:,1.213,Prob(JB):,6.25e-155
Kurtosis:,5.618,Cond. No.,289.0


In [6]:
smoker = pandas.get_dummies(data["smoker"])
smoker

Unnamed: 0,no,yes
0,False,True
1,True,False
2,True,False
3,True,False
4,True,False
...,...,...
1333,True,False
1334,True,False
1335,True,False
1336,True,False


In [7]:
data = pandas.merge(data, smoker, left_index=True, right_index=True)
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,smoker_number,no,yes
0,19,0,27.900,0,yes,southwest,16884.92400,1,False,True
1,18,1,33.770,1,no,southeast,1725.55230,0,True,False
2,28,1,33.000,3,no,southeast,4449.46200,0,True,False
3,33,1,22.705,0,no,northwest,21984.47061,0,True,False
4,32,1,28.880,0,no,northwest,3866.85520,0,True,False
...,...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,no,northwest,10600.54830,0,True,False
1334,18,0,31.920,0,no,northeast,2205.98080,0,True,False
1335,18,0,36.850,0,no,southeast,1629.83350,0,True,False
1336,21,0,25.800,0,no,southwest,2007.94500,0,True,False


In [8]:
formula = "charges ~  age + bmi + yes"
mod = smf.ols(formula=formula, data=data)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,charges,R-squared:,0.747
Model:,OLS,Adj. R-squared:,0.747
Method:,Least Squares,F-statistic:,1316.0
Date:,"Fri, 19 May 2023",Prob (F-statistic):,0.0
Time:,21:59:26,Log-Likelihood:,-13557.0
No. Observations:,1338,AIC:,27120.0
Df Residuals:,1334,BIC:,27140.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.168e+04,937.569,-12.454,0.000,-1.35e+04,-9837.561
yes[T.True],2.382e+04,412.867,57.703,0.000,2.3e+04,2.46e+04
age,259.5475,11.934,21.748,0.000,236.136,282.959
bmi,322.6151,27.487,11.737,0.000,268.692,376.538

0,1,2,3
Omnibus:,299.709,Durbin-Watson:,2.077
Prob(Omnibus):,0.0,Jarque-Bera (JB):,710.137
Skew:,1.213,Prob(JB):,6.25e-155
Kurtosis:,5.618,Cond. No.,289.0
