In [2]:
import seaborn as sns
import pandas as pd

In [3]:
# Load the 'penguins' example dataset through seaborn.
p_df = sns.load_dataset(name='penguins')

In [4]:
# Preview the first rows to check the columns and spot missing values.
p_df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [5]:
# Keep only complete rows: any row with at least one missing value is removed.
p_df = p_df.dropna(axis=0, how='any')

## X-y split

In [6]:
# Single-feature design matrix and target, both kept as one-column DataFrames.
X = p_df.loc[:, ['flipper_length_mm']]
y = p_df.loc[:, ['body_mass_g']]

## Train-test split

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=100,
)

## Creating our model

In [9]:
from sklearn.linear_model import LinearRegression

In [10]:
# Unfitted linear regression estimator, created with its default settings.
lm = LinearRegression()

In [11]:
# Fit on the training split; the result of fit() is kept under the name `model`.
model = lm.fit(X=X_train, y=y_train)

In [12]:
# Fitted slope for flipper_length_mm (2-D because the target is a one-column DataFrame).
model.coef_

array([[48.46455556]])

In [13]:
# Fitted intercept of the regression line.
model.intercept_

array([-5538.16645549])

In [14]:
import numpy as np

## Predict the weight of a random penguin

In [15]:
# Describe the new penguin with the same column label used for training, so
# scikit-learn can match feature names at predict time (a bare NumPy array
# triggers an "X does not have valid feature names" warning).
random_peng = pd.DataFrame({'flipper_length_mm': [215]})

In [16]:
# Predicted body mass for the single penguin defined in `random_peng`.
model.predict(random_peng)



array([[4881.71298884]])

## Create "real vs predicted" DataFrame

In [17]:
# Predictions for the held-out rows, stored as a NumPy array.
y_pred = np.array(model.predict(X=X_test))

In [18]:
# Replace the shuffled row labels left by train_test_split with 0..n-1.
# drop=True discards the old labels instead of adding them as an 'index' column,
# and plain reassignment keeps the cell safe to re-run.
y_test = y_test.reset_index(drop=True)

In [19]:
# View the predictions as a single pandas column (column 0 of the wrapped array).
pd.DataFrame(y_pred)[0]

0     3524.705433
1     4397.067433
2     4881.712989
3     3670.099100
4     4590.925656
         ...     
79    4639.390211
80    3573.169989
81    3330.847211
82    3427.776322
83    3960.886433
Name: 0, Length: 84, dtype: float64

In [20]:
# Check the prediction array's dimensions: one row per test sample, one column.
y_pred.shape

(84, 1)

In [21]:
# Pair each held-out target with its prediction. Both columns are passed as plain
# arrays so rows are matched by position, not by index labels; np.ravel flattens
# the (n, 1) prediction array into one column.
real_vs_pred = pd.DataFrame({
    'y_test': y_test['body_mass_g'].to_numpy(),
    'y_pred': np.ravel(y_pred),
})

In [22]:
# Display actual vs. predicted body mass side by side.
real_vs_pred

Unnamed: 0,y_test,y_pred
0,3350.0,3524.705433
1,4550.0,4397.067433
2,4850.0,4881.712989
3,3450.0,3670.099100
4,4600.0,4590.925656
...,...,...
79,4550.0,4639.390211
80,3050.0,3573.169989
81,3075.0,3330.847211
82,3700.0,3427.776322


## Compute the MSE

In [23]:
from sklearn.metrics import mean_squared_error as mse

In [24]:
# Mean squared error between the held-out targets and the model's predictions.
mse(y_true=real_vs_pred['y_test'], y_pred=real_vs_pred['y_pred'])

165251.7722003463

## Using statsmodels

In [39]:
import statsmodels.api as sm

In [40]:
# Refit the same one-feature regression with statsmodels to get the full
# inference summary (standard errors, t-statistics, confidence intervals).
# NOTE: this cell rebinds `model` and `y_pred`, replacing the scikit-learn
# estimator and predictions created earlier in the notebook.

# Build both design matrices the same way, from the labelled DataFrames, so the
# summary reports the slope under the feature's column name instead of "x1".
# has_constant="add" always inserts the intercept column; the default "skip"
# leaves it out when an input column is already constant.
X_train_const = sm.add_constant(X_train, has_constant="add")

model = sm.OLS(y_train, X_train_const).fit()
predictions_train = model.predict(X_train_const)

X_test_const = sm.add_constant(X_test, has_constant="add")
y_pred = model.predict(X_test_const)
print_model = model.summary()

print_model

0,1,2,3
Dep. Variable:,body_mass_g,R-squared:,0.75
Model:,OLS,Adj. R-squared:,0.749
Method:,Least Squares,F-statistic:,741.0
Date:,"Wed, 08 Nov 2023",Prob (F-statistic):,2.5799999999999998e-76
Time:,15:29:47,Log-Likelihood:,-1837.7
No. Observations:,249,AIC:,3679.0
Df Residuals:,247,BIC:,3687.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-5538.1665,358.124,-15.464,0.000,-6243.532,-4832.801
x1,48.4646,1.780,27.221,0.000,44.958,51.971

0,1,2,3
Omnibus:,4.194,Durbin-Watson:,2.035
Prob(Omnibus):,0.123,Jarque-Bera (JB):,3.942
Skew:,0.304,Prob(JB):,0.139
Kurtosis:,3.107,Cond. No.,2920.0
