In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import altair as alt
import os
import sys
import numpy as np
cur_dir = os.getcwd()
SRC_PATH = cur_dir[
    : cur_dir.index("arm_balance") + len("arm_balance")
]
if SRC_PATH not in sys.path:
    sys.path.append(SRC_PATH)
from src.utils.utils import *


In [9]:
DATA_DIR = "../../data/raw_data/"
files = os.listdir(DATA_DIR)
files = [os.path.join(DATA_DIR, f) for f in files]
np.random.seed(123)
np.random.shuffle(files)
TEST_SIZE = 0.3
sample_sizes = np.array([1, 100, 400, 900,1000, 1600, 2500, 5000,7500, 10000])
results = []

for size in sample_sizes:
    sample = get_data(files, size)
    train_df, test_df = train_test_split(sample,test_size = TEST_SIZE, shuffle = False)
    X_train, y_train = train_df['1'].to_numpy().reshape(-1,1) ,train_df['y']
    X_test, y_test = test_df['1'].to_numpy().reshape(-1,1) ,test_df['y']
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    results.append(np.average(np.abs(lr.predict(X_test) - y_test)))



In [10]:
accuracies_ratio = [1/(res/results[0]) for res in results]

size_improved = [(res/sample_sizes[0]) for res in sample_sizes]

res_df = pd.DataFrame({'error': results, 
                       'accuracy': accuracies_ratio, 
                       'sample size':sample_sizes,
                       'size improved': size_improved,
                       'fit' :0.1*sample_sizes**0.5,})

In [11]:
res_df

Unnamed: 0,error,accuracy,sample size,size improved,fit
0,5.125261,1.0,1,1.0,0.1
1,0.668193,7.670337,100,100.0,1.0
2,0.684692,7.485497,400,400.0,2.0
3,0.811041,6.319364,900,900.0,3.0
4,0.791277,6.477199,1000,1000.0,3.162278
5,0.619499,8.27324,1600,1600.0,4.0
6,0.498406,10.283311,2500,2500.0,5.0
7,0.555233,9.230832,5000,5000.0,7.071068
8,0.585287,8.756833,7500,7500.0,8.660254
9,0.599302,8.55205,10000,10000.0,10.0


In [12]:
base = alt.Chart(res_df, title = 'Model Performance As Sample Size Increases').encode(
    alt.X('sample size', axis=alt.Axis(title = 'Sample Size',))
)

area = base.mark_point(color='#57A44C').encode(
    alt.Y('error',
          axis=alt.Axis(title='Error (g)', titleColor='#57A44C')),
)

line = base.mark_point().encode(
    alt.Y('accuracy',
          axis=alt.Axis(title='Accuracy Ratio Improved', titleColor='#5276A7'))
)
area = area + area.transform_regression('sample size','error', method='pow').mark_line(color='#57A44C')
line = line + line.transform_regression('sample size','accuracy', method='pow').mark_line(stroke='#5276A7')

plot = alt.layer(area, line).resolve_scale(
    y = 'independent'
)
save_chart(plot, '../../report/assets/clt.png')
plot