# Real-world data experiment

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

from sklearn.linear_model import Lasso,LinearRegression

In [2]:
# Seed NumPy's legacy global random state so any NumPy-driven randomness repeats across runs.
np.random.seed(0)

# Loading data

In [3]:
# Experiment configuration.
# NOTE(review): n_averaging_locations is not referenced in the visible part of this notebook — confirm it is still needed.
n_averaging_locations = 2
# Bin count: selects which pre-binned LRCS file is loaded and is reused as n_ML_LRCS.
n_bins = 5

In [4]:
# Load the two stored datasets (presumably pandas DataFrames — they are indexed by column name later on).
# NOTE: joblib.load unpickles its input, so only load files that come from a trusted source.
# The LRCS file name depends on n_bins; the LRCS+WMG file name is fixed.
df_LRCS = joblib.load('data/batteries/dfs/df_LRCS_bins_'+str(n_bins)+'.pkl')
df_LRCSWMG = joblib.load('data/batteries/dfs/df_LRCSWMG_20221026_21457_17512.pkl')

In [5]:
# Number of distinct comma-gap values found in the LRCS data.
n_CG_LRCS = len(df_LRCS['Comma gap (µm)'].unique())
# Number of ML levels, set equal to the bin count.
n_ML_LRCS = n_bins
# Comma-gap values that the one-X-out loops iterate over.
# NOTE(review): hard-coded; presumably these are exactly the unique values counted by n_CG_LRCS — confirm.
dom_CG_LRCS = [75,100,200]
# Integer levels 0 .. n_ML_LRCS-1.
dom_ML_LRCS = np.arange(n_ML_LRCS)

# Learning before abstraction

In [6]:
def eval_metric(model,Xte,yte,Xout,roundpred=True):
    """Print and return the mean squared error of `model` on a held-out set.

    Parameters
    ----------
    model : object exposing ``predict(X)`` for a 2-D, single-column input.
    Xte : array-like of the held-out feature values; reshaped to one column.
    yte : array-like of the held-out targets.
    Xout : label of the held-out value, used only in the printed message.
    roundpred : if True, predictions are rounded with ``np.round`` before
        the errors are computed.

    Returns
    -------
    The mean of the per-sample squared errors.

    Two lines are printed: the raw mean and standard deviation of the squared
    errors, and a LaTeX-formatted ``$mean\\pm std$`` version with two decimals.
    """
    # Predict once; np.asarray lets non-ndarray inputs (lists, Series) through.
    preds = model.predict(np.asarray(Xte).reshape(-1,1))
    if roundpred:
        preds = np.round(preds)

    sq_errors = (preds - np.asarray(yte))**2
    mean_se,std_se = np.mean(sq_errors),np.std(sq_errors)

    print('MSE (with {0} out): {1} ({2})'.format(Xout,mean_se,std_se))
    # '\\pm' keeps the literal backslash without relying on an invalid escape sequence.
    print('${0:.2f}\\pm{1:.2f}$'.format(mean_se,std_se))

    return mean_se

## Learning on LRCS data: one-X out

In [7]:
def select_one_X_out(X,y,cond):
    """Split X and y into train and test arrays using a boolean mask.

    Entries where `cond` is true form the test set; all remaining entries
    form the training set. Returns ``(Xtr, ytr, Xte, yte)`` as NumPy arrays.
    """
    keep = np.logical_not(cond)

    Xtr,ytr = np.array(X[keep]),np.array(y[keep])
    Xte,yte = np.array(X[cond]),np.array(y[cond])

    return Xtr,ytr,Xte,yte

### Lasso

In [8]:
# One-X-out evaluation on LRCS data only: for each comma gap, train a Lasso
# (scikit-learn defaults) on the other comma gaps and test on the held-out one.
scores_a = []

for cg in dom_CG_LRCS:
    # Hold out every LRCS sample recorded at this comma gap.
    cond = df_LRCS['Comma gap (µm)']==cg
    
    Xtr,ytr,Xte,yte = select_one_X_out(df_LRCS['Comma gap (µm)'],df_LRCS['binned ML'],cond)
    
    model = Lasso().fit(Xtr.reshape(-1,1),ytr)
    # Predictions are rounded (roundpred=True) before scoring.
    score = eval_metric(model,Xte,yte,cg,True)
    scores_a.append(score)
    
# Mean and spread of the per-fold MSEs.
overall_mean,overall_std = np.mean(scores_a),np.std(scores_a)
print('Overall MSE: {0} ({1})'.format(overall_mean,overall_std))
# '\\pm' keeps the literal backslash without relying on an invalid escape sequence.
print('${0:.2f}\\pm{1:.2f}$'.format(overall_mean,overall_std))

MSE (with 75 out): 1.375 (0.9921567416492215)
$1.38\pm0.99$
MSE (with 100 out): 0.0 (0.0)
$0.00\pm0.00$
MSE (with 200 out): 4.208333333333333 (0.9991315673568165)
$4.21\pm1.00$
Overall MSE: 1.861111111111111 (1.7520931045220114)
$1.86\pm1.75$


# Learning after abstraction

## Learning on LRCS+WMG data: one-X out with WMG providing the missing support

### Lasso

In [9]:
# One-X-out evaluation on the combined LRCS+WMG data: only the LRCS samples at
# the held-out comma gap are removed, so the remaining rows stay in training.
scores_b = []

for cg in dom_CG_LRCS:
    cond0 = df_LRCS['Comma gap (µm)']==cg
    # Pad the LRCS mask with False up to the length of the combined frame.
    # NOTE(review): assumes the first len(df_LRCS) rows of df_LRCSWMG are the LRCS rows in the same order — confirm.
    cond1 = list(cond0) + [False]*(len(df_LRCSWMG)-len(cond0))
    
    Xtr,ytr,Xte,yte = select_one_X_out(df_LRCSWMG['Comma gap (µm)'],df_LRCSWMG['binned ML'],cond1)
    
    model = Lasso().fit(Xtr.reshape(-1,1),ytr)
    # NOTE(review): predictions are not rounded here (roundpred=False), unlike the LRCS-only run — confirm this is intended.
    score = eval_metric(model,Xte,yte,cg,False)
    scores_b.append(score)
    
# Mean and spread of the per-fold MSEs.
overall_mean,overall_std = np.mean(scores_b),np.std(scores_b)
print('Overall MSE: {0} ({1})'.format(overall_mean,overall_std))
# '\\pm' keeps the literal backslash without relying on an invalid escape sequence.
print('${0:.2f}\\pm{1:.2f}$'.format(overall_mean,overall_std))

MSE (with 75 out): 0.5781093959085035 (0.7008871711757073)
$0.58\pm0.70$
MSE (with 100 out): 0.0004050509189137569 (0.0)
$0.00\pm0.00$
MSE (with 200 out): 0.06668892415658287 (0.24854923900700404)
$0.07\pm0.25$
Overall MSE: 0.21506779032800005 (0.25813148264621427)
$0.22\pm0.26$


## Learning on LRCS+WMG data: one-X out with WMG not providing the missing support

### Lasso

In [10]:
# One-X-out evaluation on the combined LRCS+WMG data: every row at the held-out
# comma gap is removed from training and used for testing.
scores_c = []

for cg in dom_CG_LRCS:
    # The mask is built on the combined frame, so it covers all rows with this comma gap.
    cond = df_LRCSWMG['Comma gap (µm)']==cg
    
    Xtr,ytr,Xte,yte = select_one_X_out(df_LRCSWMG['Comma gap (µm)'],df_LRCSWMG['binned ML'],cond)
    
    model = Lasso().fit(Xtr.reshape(-1,1),ytr)
    # NOTE(review): predictions are not rounded here (roundpred=False), unlike the LRCS-only run — confirm this is intended.
    score = eval_metric(model,Xte,yte,cg,False)
    scores_c.append(score)
    
# Mean and spread of the per-fold MSEs.
overall_mean,overall_std = np.mean(scores_c),np.std(scores_c)
print('Overall MSE: {0} ({1})'.format(overall_mean,overall_std))
# '\\pm' keeps the literal backslash without relying on an invalid escape sequence.
print('${0:.2f}\\pm{1:.2f}$'.format(overall_mean,overall_std))

MSE (with 75 out): 0.5607826658850772 (0.38584886995658146)
$0.56\pm0.39$
MSE (with 100 out): 0.5512486399325527 (1.405299836257161)
$0.55\pm1.41$
MSE (with 200 out): 2.5626978817343646 (0.6969472297309359)
$2.56\pm0.70$
Overall MSE: 1.224909729183998 (0.9459670819645568)
$1.22\pm0.95$
