In [1]:
import streamlit as st
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from mlxtend.evaluate import bias_variance_decomp
import matplotlib.pyplot as plt

In [2]:
# Diamonds dataset, read directly from the seaborn-data repository on GitHub.
diamonds_url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/diamonds.csv'
data = pd.read_csv(diamonds_url)

In [8]:
def calc_metrics(x, y, degree = 2, nruns = 50, test_size = 0.33, random_state = 123):
    """Estimate the bias and variance of a polynomial linear regression.

    The data is split into train and test parts, the features are expanded
    with ``PolynomialFeatures(degree)``, and ``bias_variance_decomp`` from
    mlxtend is run with ``loss='mse'`` on a ``LinearRegression`` estimator.

    Parameters
    ----------
    x : array-like
        Feature values. A flat vector (one feature) is turned into a single
        column; anything with as many elements as ``y`` is treated the same way.
    y : array-like
        Target values.
    degree : int, default 2
        Degree passed to ``PolynomialFeatures``.
    nruns : int, default 50
        Passed to ``bias_variance_decomp`` as ``num_rounds``.
    test_size : float, default 0.33
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int, default 123
        Seed used for both the train/test split and ``bias_variance_decomp``.

    Returns
    -------
    tuple
        ``(bias, var)`` -- the second and third values returned by
        ``bias_variance_decomp``; the first (the loss) is discarded.
    """
    x, y = np.array(x), np.array(y)
    # Reshaping if only one feature is given. Comparing the sizes as integers
    # avoids the float division (and ZeroDivisionError on empty y) of
    # `x.size / y.size == 1`.
    if x.size == y.size:
        x = x.reshape(-1, 1)
    # Train-test split
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    # Polynomial transformation: fit the expansion on the training part only,
    # then apply the same expansion to the test part.
    poly = PolynomialFeatures(degree)
    x_train_poly = poly.fit_transform(x_train)
    x_test_poly = poly.transform(x_test)
    # The first returned value (the loss) is not needed by callers.
    _, bias, var = bias_variance_decomp(
        LinearRegression(),
        x_train_poly,
        y_train,
        x_test_poly,
        y_test,
        loss='mse',
        num_rounds=nruns,
        random_seed=random_state,
    )
    return bias, var

In [4]:
# Display the loaded frame; the rendering below shows only the first and last
# rows, with the middle elided.
data

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [9]:
# Sweep polynomial degrees 1..20 over every non-empty subset of the three
# predictors, recording the bias/variance pair returned by calc_metrics.
predictors = ('carat', 'depth', 'table')
out_dict = {key: [] for key in ('degree',) + predictors + ('bias', 'variance')}

for degree in range(1, 21):
    # Masks 1..7 enumerate the non-empty subsets. 'carat' is the high bit, so
    # the visiting order matches nested 0/1 loops over carat, depth, table
    # with the all-zero combination skipped.
    for mask in range(1, 1 << len(predictors)):
        flags = [(mask >> shift) & 1 for shift in (2, 1, 0)]
        chosen = [name for name, flag in zip(predictors, flags) if flag == 1]
        bias, variance = calc_metrics(data[chosen].values, data['price'].values, degree)

        out_dict['degree'].append(degree)
        for name, flag in zip(predictors, flags):
            out_dict[name].append(flag)
        out_dict['bias'].append(bias)
        out_dict['variance'].append(variance)

# One row per (degree, feature subset); written without the index column.
df = pd.DataFrame(out_dict)
df.to_csv('simulated_data.csv', index=False)