In [66]:
import random
from random import shuffle

import altair as alt
import polars as pl

### Prepare dataset


In [67]:
# Paired samples stored as (height, weight) tuples; the column names are
# assigned when the DataFrames are built below.
data = [
    (165, 60), (170, 65), (175, 75), (160, 50), (180, 80),
    (172, 68), (178, 72), (155, 48), (168, 58), (177, 70),
    (169, 64), (176, 78), (182, 85), (163, 55), (173, 69),
    (171, 66), (159, 52), (174, 74), (167, 62), (179, 78),
    (164, 57), (161, 53), (166, 59), (162, 54), (158, 50),
    (174, 77), (168, 63), (177, 73), (165, 61), (172, 70),
    (176, 76), (180, 82), (159, 49), (173, 67), (178, 74),
    (182, 88), (170, 66), (175, 79), (169, 65), (160, 54),
    (164, 56), (162, 52), (166, 60), (180, 83), (170, 68),
    (157, 45), (161, 58), (175, 75), (171, 64), (177, 72),
    (179, 81), (174, 78), (163, 59), (168, 62), (165, 66),
    (164, 61), (172, 69), (176, 76), (158, 48), (169, 64),
    (171, 70), (174, 74), (167, 65), (180, 82), (165, 63),
    (178, 79), (159, 55), (162, 57), (169, 67), (173, 71),
    (166, 60), (157, 50), (164, 62), (172, 74), (176, 77),
    (180, 81), (161, 56), (175, 72), (170, 69), (178, 78),
    (165, 64), (159, 52), (173, 70), (167, 63), (172, 67),
    (180, 86), (176, 74), (164, 58), (170, 68), (167, 66),
    (171, 65), (160, 53), (178, 82), (182, 85), (159, 51),
    (163, 55), (166, 61), (165, 59), (176, 72), (171, 73),
    # NOTE(review): the ten samples below are disabled; confirm whether they
    # should be restored or deleted.
    # (177, 77), (173, 68), (174, 76), (170, 66), (161, 58),
    # (162, 54), (168, 60), (164, 59), (177, 80), (175, 73)
]

# Seed the global RNG right before shuffling so the train/test split (and
# every number derived from it) is identical on each Restart & Run All.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
shuffle(data)  # in-place; `data` is rebuilt above, so re-running this cell is safe

# Fraction of the shuffled samples used for fitting; the rest is held out.
TRAIN_FRACTION = 0.75
train_data_length = round(len(data) * TRAIN_FRACTION)

train_data = data[:train_data_length]
test_data = data[train_data_length:]

train_df = pl.DataFrame(
    data=train_data,
    schema=['height', 'weight'],
    orient='row'
)

test_df = pl.DataFrame(
    data=test_data,
    schema=['height', 'weight'],
    orient='row'
)

display(train_df)
display(test_df)

height,weight
i64,i64
170,69
162,54
177,73
163,55
178,72
…,…
160,53
171,65
170,65
169,65


height,weight
i64,i64
178,74
176,74
172,67
182,85
170,66
…,…
173,71
175,75
171,70
176,72


### Calculate weight prediction


In [68]:
# Fit the line weight = slope * height + intercept on the training set using
# the closed-form least-squares estimates.
height_mean = train_df['height'].mean()
weight_mean = train_df['weight'].mean()

# Per-sample deviations from the column means, reused in both sums below.
height_dev = train_df['height'] - height_mean
weight_dev = train_df['weight'] - weight_mean

numerator = (height_dev * weight_dev).sum()  # sum of cross-deviations
denominator = (height_dev ** 2).sum()  # sum of squared height deviations

slope = numerator / denominator
intercept = weight_mean - slope * height_mean

# Evaluate the fitted line at every held-out height and store it next to the
# observed weight.
prediction_expr = (slope * pl.col('height') + intercept).alias('weight_prediction')
test_df = test_df.with_columns(prediction_expr)

test_df

height,weight,weight_prediction
i64,i64,f64
178,74,78.511687
176,74,75.640552
172,67,69.898283
182,85,84.253957
170,66,67.027148
…,…,…
173,71,71.33385
175,75,74.204985
171,70,68.462715
176,72,75.640552


### Visualize linear regression


In [69]:
# Both layers share one x and one y axis, so they must agree on title and
# scale: differing layer titles get merged into a combined axis title, and the
# default zero-based quantitative scale would squeeze the data into a corner.
non_zero_scale = alt.Scale(zero=False)
height_axis = alt.X('height', title='Height', scale=non_zero_scale)

# Observed held-out samples.
scatter = alt.Chart(test_df).mark_point(color='blue').encode(
    x=height_axis,
    y=alt.Y('weight', title='Weight', scale=non_zero_scale),
    tooltip=['height', 'weight'],
)

# Fitted regression line evaluated at the held-out heights.
line = alt.Chart(test_df).mark_line(color='red').encode(
    x=height_axis,
    y=alt.Y('weight_prediction', title='Weight', scale=non_zero_scale),
    tooltip=['height', 'weight_prediction'],
)

chart = (scatter + line).properties(
    width=500,
    title='Weight by Height - Linear Regression',
).interactive()
chart

### Test prediction


In [73]:
# Spot-check the fitted line at one hand-picked height.
height = 178
print(
    f'Height = {height}',
    f'Prediction = {slope * height + intercept}',
    sep='\n',
)
# Show the held-out frame again so the value can be compared with its rows.
test_df

Height = 178
Prediction = 78.511687028258


height,weight,weight_prediction
i64,i64,f64
178,74,78.511687
176,74,75.640552
172,67,69.898283
182,85,84.253957
170,66,67.027148
…,…,…
173,71,71.33385
175,75,74.204985
171,70,68.462715
176,72,75.640552
