## Cross Validation for Bias + RMSE Estimates

In [1]:
import numpy as np
import pandas as pd 
import random

from sklearn.metrics import root_mean_squared_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from itertools import zip_longest

In [2]:
#import dataset
ta_ed = pd.read_csv("~/hollings/data/8clovlap.csv")
ta_ed2 = pd.read_csv("~/hollings/data/8clovlap2.csv")

## Overall Bias Estimates

#### Determining how many cruises need to be in each CV group

In [27]:
cr_lst = ta_ed2.Cruise.unique().tolist()

In [52]:
361/5, 72*5

(72.2, 360)

In [26]:
random.seed(6)
tst = random.sample(cr_lst, len(cr_lst))
grps = zip(*(iter(tst),) * 72)
grp_lst = list(grps)

In [5]:
#making the different cv groups
grp_dict = {}
for idx, grp in enumerate(grp_lst):
    jdx = list(grp_lst[idx])
    grp_dict[idx] = jdx

grp_dict[4].extend([1076.0])

grp_data_dict = {}
for idx in grp_dict.keys():
    j = ta_ed[ta_ed["Cruise"].isin(grp_dict[idx])]
    grp_data_dict[idx] = j

cv_splitdt = {}
for idx in grp_data_dict.keys():
    test_set =  grp_data_dict[idx]
    train_set = ta_ed[~ta_ed.index.isin(test_set.index)]
    cv_splitdt[idx] = [train_set.index, test_set.index]

cv_datadt = {}
for idx in cv_splitdt.keys():
    ts = ta_ed.iloc[cv_splitdt[idx][1]]
    tr = ta_ed.iloc[cv_splitdt[idx][0]]
    cv_datadt[idx] = [tr,ts]

cv_tot = {}
for idx in cv_datadt.keys():
    cv_cldt = {}
    for i in range(8):
        match_tr = []
        match_ts = []
        match_tr = cv_datadt[idx][0][cv_datadt[idx][0]['Cluster'] == i]
        match_ts = cv_datadt[idx][1][cv_datadt[idx][1]['Cluster'] == i]
        cv_cldt[i] = [match_tr, match_ts]
    cv_tot[idx] = cv_cldt

In [None]:
clst = []
for i in grp_dict.keys():
    clst.extend(grp_dict[i])

len(clst)
clst.sort()
clst

In [134]:
ts, tr

(        Cruise  region  Latitude  Longitude  Bottom Depth   Depth       T  \
 468      272.0     8.0    32.749     133.11         144.0     8.0  23.233   
 469      272.0     8.0    32.749     133.11         144.0    23.0  22.534   
 470      272.0     8.0    32.749     133.11         144.0    52.0  20.864   
 471      272.0     8.0    32.749     133.11         144.0    77.0  19.765   
 472      272.0     8.0    32.749     133.11         144.0   102.0  17.808   
 ...        ...     ...       ...        ...           ...     ...     ...   
 225187   348.0     8.0   -58.000    -170.00        2038.0   805.0   2.562   
 225188   348.0     8.0   -58.000    -170.00        2038.0  1007.0   2.444   
 225189   348.0     8.0   -58.000    -170.00        2038.0  1256.0   2.320   
 225190   348.0     8.0   -58.000    -170.00        2038.0  1506.0   2.198   
 225191   348.0     8.0   -58.000    -170.00        2038.0  2000.0   1.883   
 
              S  Oxygen  Nitrate  ...     Cluster 1      Clust

In [135]:
cv_splitdt

{0: [Index([    57,     58,     59,     60,     61,     62,     63,     64,     65,
             66,
         ...
         233425, 233426, 233427, 233428, 233429, 233430, 233431, 233432, 233433,
         233434],
        dtype='int64', length=180789),
  Index([     0,      1,      2,      3,      4,      5,      6,      7,      8,
              9,
         ...
         226936, 226937, 226938, 226939, 226940, 226941, 226942, 226943, 226944,
         226945],
        dtype='int64', length=52646)],
 1: [Index([     0,      1,      2,      3,      4,      5,      6,      7,      8,
              9,
         ...
         233425, 233426, 233427, 233428, 233429, 233430, 233431, 233432, 233433,
         233434],
        dtype='int64', length=187177),
  Index([  3867,   3868,   3869,   3870,   3871,   3872,   3873,   3874,   3875,
           3876,
         ...
         232536, 232537, 232538, 232539, 232540, 232541, 232542, 232543, 232544,
         232545],
        dtype='int64', length=46258)]

#### Create the CV folds 

In [1]:
#list(ta_ed2.columns)

In [28]:
## 8cl split
clust1 = ta_ed2.loc[ta_ed2["Cluster"] ==0]
clust2 = ta_ed2.loc[ta_ed2["Cluster"] ==1]
clust3 = ta_ed2.loc[ta_ed2["Cluster"] ==2]
clust4 = ta_ed2.loc[ta_ed2["Cluster"] ==3]
clust5 = ta_ed2.loc[ta_ed2["Cluster"] ==4]
clust6 = ta_ed2.loc[ta_ed2["Cluster"] ==5]
clust7 = ta_ed2.loc[ta_ed2["Cluster"] ==6]
clust8 = ta_ed2.loc[ta_ed2["Cluster"] ==7]

cluster_lst = [clust1, clust2, clust3, clust4, clust5, clust6, clust7, clust8]

In [2]:
#Checking to see if this was done correctly 
#clust1.Cluster
#len(cv_tot[0][0][0]), len(cv_tot[0][0][1]), len(cv_tot[0][0][0]) + len(cv_tot[0][0][1])
#b[0][0].index

In [29]:
def cv_split(cr_lst, df): ##this output is equivalent to splitted data
    random.seed(6)
    tst = random.sample(cr_lst, len(cr_lst))
    grps = zip(*(iter(tst),) * 72)
    grp_lst = list(grps)

#making the different cv groups
    grp_dict = {}
    for idx, grp in enumerate(grp_lst):
        jdx = list(grp_lst[idx])
        grp_dict[idx] = jdx
    grp_dict[4].extend([1076.0])
        
#turn the list of groups --> dataframes
    grp_data_dict = {}
    for idx in grp_dict.keys():
        j = df[df["Cruise"].isin(grp_dict[idx])]
        grp_data_dict[idx] = j

#make train and test sets for each fold 
    cv_splitdt = {}
    for idx in grp_data_dict.keys():
        test_set =  grp_data_dict[idx]
        train_set = df[~df.index.isin(test_set.index)]
        cv_splitdt[idx] = [train_set.index, test_set.index]

#need to sep by cluster here 
    cv_datadt = {}
    for idx in cv_splitdt.keys():
        ts = df.iloc[cv_splitdt[idx][1]]
        tr = df.iloc[cv_splitdt[idx][0]]
        cv_datadt[idx] = [tr,ts]

    cv_tot = {}
    for idx in cv_datadt.keys():
        cv_cldt = {}
        for i in range(8):
            match_tr = []
            match_ts = []
            match_tr = cv_datadt[idx][0][cv_datadt[idx][0]['Cluster'] == i]
            match_ts = cv_datadt[idx][1][cv_datadt[idx][1]['Cluster'] == i]
            cv_cldt[i] = [match_tr.index, match_ts.index]
        cv_tot[idx] = cv_cldt
    return cv_tot

In [6]:
#Testing to see if the function worked
#cv_tot[0][0][0].TA

In [32]:
ex = cv_split(cr_lst, ta_ed2)

In [5]:
#Applying the splitting function
#ex[0]

In [None]:
## checking that the train and test indices really are split properly 
list(cv_train_dt[0].columns)
qa = list(cv_train_dt[0].index)
qa1 = list(cv_test_dt[0].index)

sm = []
for i in range(len(qa)):
    k = (qa[i] == qa1[i])
    sm.append(k)
print(np.unique(sm))

In [34]:
def root_mean_squared_error(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))
    
def make_features(c_lst):
    ft_dict = {}
    for idx, cluster in enumerate(c_lst):
        X1 = cluster["S"] #0
        X2 = cluster["PT"] #1
        X3 = cluster["Nitrate"] #2
        X4 = cluster["AOU"] #3
        X5 = cluster["Silicate"] #4
        #lat = cluster["Latitude"] #5
        lon20= np.cos(np.deg2rad(cluster["Longitude"] - 20))#6
        lon110= np.cos(np.deg2rad(cluster["Longitude"] - 110)) #7
        depth = cluster["Depth"] #8
        order = cluster["order"] #9
        y = cluster["TA"] #10
        cruise = cluster["Cruise"] #11
        ft_dict[idx]=[X1, X2, X3, X4, X5, lon20, lon110, depth, order, y, cruise]
    return ft_dict

##this output is equivalent to splitted data
def cv_split(cr_lst):
    random.seed(6)
    tst = random.sample(cr_lst, len(cr_lst))
    grps = zip(*(iter(tst),) * 72)
    grp_lst = list(grps)

#making the different cv groups
    grp_dict = {}
    for idx, grp in enumerate(grp_lst):
        jdx = list(grp_lst[idx])
        grp_dict[idx] = jdx
    grp_dict[4].extend([1076.0])

#turn the list of groups --> dataframes
    grp_data_dict = {}
    for idx in grp_dict.keys():
        j = ta_ed[ta_ed["Cruise"].isin(grp_dict[idx])]
        grp_data_dict[idx] = j

#make train and test sets for each fold 
    cv_splitdt = {}
    for idx in grp_data_dict.keys():
        test_set =  grp_data_dict[idx]
        train_set = ta_ed[~ta_ed.index.isin(test_set.index)]
        cv_splitdt[idx] = [train_set.index, test_set.index]

#need to sep by cluster here 
    cv_datadt = {}
    for idx in cv_splitdt.keys():
        ts = ta_ed.iloc[cv_splitdt[idx][1]]
        tr = ta_ed.iloc[cv_splitdt[idx][0]]
        cv_datadt[idx] = [tr,ts]

    cv_tot = {}
    for idx in cv_datadt.keys():
        cv_cldt = {}
        for i in range(8):
            match_tr = []
            match_ts = []
            match_tr = cv_datadt[idx][0][cv_datadt[idx][0]['Cluster'] == i]
            match_ts = cv_datadt[idx][1][cv_datadt[idx][1]['Cluster'] == i]
            cv_cldt[i] = [match_tr.index, match_ts.index]
        cv_tot[idx] = cv_cldt
    return cv_tot
    
#drop y and cruise col here (need y not to be shaped this way and dont need cruise col anymore)
def rearrange(splitted_data, ft_dict):
    reshaped_data = {}
    for idx in splitted_data.keys(): #going into each of the keys 
        train_index, test_index = splitted_data[idx]
        feat_values = []
        for i in range(len(ft_dict[idx])-3): ##added a "-3" to get rid of last three elements but idk that will acutally work
            re_values = []
            train_arr = ft_dict[idx][i][train_index]
            train=np.array(train_arr)
            train.reshape(-1,1)
        
            test_arr = ft_dict[idx][i][test_index]
            test = np.array(test_arr)
            test.reshape(-1,1)
            
            re_values.append(train)
            re_values.append(test)
            feat_values.append(re_values)
        reshaped_data[idx] = feat_values
    return reshaped_data

def y_capture(splitted_data, ft_dict):
    y_dict = {}
    for idx in splitted_data.keys(): #going into each of the keys 
        train_index, test_index = splitted_data[idx]
        y_train = ft_dict[idx][9][train_index]
        y_test = ft_dict[idx][9][test_index]
        y_dict[idx] = (y_train, y_test)
    return y_dict

def order_capture(splitted_data, ft_dict):
    order_dict = {}
    for idx in splitted_data.keys(): #going into each of the keys 
        train_index, test_index = splitted_data[idx]
        order_train = ft_dict[idx][8][train_index]
        order_test = ft_dict[idx][8][test_index]
        order_dict[idx] = (order_train, order_test)
    return order_dict
    
def make_clust_eq(reshaped_dict):
    tpose_dt = {}
    for idx in reshaped_dict.keys(): #going into each cluster
        final_set = []
        train_eq_clust = []
        test_eq_clust = []
        for k in range(len(reshaped_dict[idx])):
            train_eq_clust.append(reshaped_dict[idx][k][0]) #this is where the problem is 
            train_clust_trans = np.transpose(train_eq_clust)
            test_eq_clust.append(reshaped_dict[idx][k][1]) #this is where the problem is 
            test_clust_trans = np.transpose(test_eq_clust)
        final_set.append(train_clust_trans)
        final_set.append(test_clust_trans)
        tpose_dt[idx] = final_set
    return tpose_dt

def rfr(tpose_dt, y_dict):
    train_rmse_lst = []
    tst_rmse_lst = []
    for idx in tpose_dt.keys():
        rf = RandomForestRegressor()
        rf.fit(tpose_dt[idx][0], y_dict[idx][0])
        
        train_pred = rf.predict(tpose_dt[idx][0])
        train_rmse = root_mean_squared_error(y_dict[idx][0], train_pred)
        train_rmse_lst.append(train_rmse)
        
        test_pred = rf.predict(tpose_dt[idx][1])
        tst_rmse = root_mean_squared_error(y_dict[idx][1], test_pred)
        tst_rmse_lst.append(tst_rmse)
    return train_rmse_lst, tst_rmse_lst
    
def preds(tpose_dt, y_dict):
    train_preds = {}
    test_preds = {}
    for idx in tpose_dt.keys():
        rf = RandomForestRegressor()
        rf.fit(tpose_dt[idx][0], y_dict[idx][0])
        train_pred = rf.predict(tpose_dt[idx][0])
        train_preds[idx] = train_pred
        test_pred = rf.predict(tpose_dt[idx][1])
        test_preds[idx] = test_pred
    return train_preds, test_preds

def comb_ords_preds(test_preds, order_dict, y_dict):
    clnum = []
    ord_tst = []
    pred_tst = []
    actual_y = []
    for idx in range(len(order_dict)):
        cl_len = [idx]*len(order_dict[idx][1])
        ord_tst.extend(order_dict[idx][1])
        clnum.extend(cl_len)
    for idx in range(len(y_dict)):
        actual_y.extend(y_dict[idx][1])
    for i in range(len(test_preds)):
        pred_tst.extend(test_preds[i])
    comb_df = pd.DataFrame( {"Order_Num": ord_tst,
                             "RFR1": pred_tst,
                             "Y_test": actual_y,
                             "Cluster": clnum})
    return comb_df

### CV 1

In [35]:
b = make_features(cluster_lst)

In [36]:
d = rearrange(ex[0],b)
e = y_capture(ex[0],b)
order = order_capture(ex[0],b)
f = make_clust_eq(d)
g = rfr(f,e)
h = preds(f,e)

In [39]:
on = g

In [47]:
one = comb_ords_preds(h[1], order, e)

In [48]:
one["cv"] = 1

In [49]:
one

Unnamed: 0,Order_Num,RFR1,Y_test,Cluster,cv
0,116908.0,2299.279000,2299.0,0,1
1,116909.0,2299.275000,2301.1,0,1
2,116910.0,2300.042000,2301.5,0,1
3,116911.0,2302.530000,2304.8,0,1
4,116912.0,2303.789000,2301.8,0,1
...,...,...,...,...,...
54515,162954.0,2394.927000,2390.2,7,1
54516,162955.0,2392.673000,2391.0,7,1
54517,162956.0,2390.647000,2387.4,7,1
54518,162957.0,2384.599000,2383.7,7,1


## CV 2

In [50]:
b1 = make_features(cluster_lst)
d1 = rearrange(ex[1],b1)
e1 = y_capture(ex[1],b1)
order1 = order_capture(ex[1],b1)
f1 = make_clust_eq(d1)
g1 = rfr(f1,e1)
h1 = preds(f1,e1)

In [51]:
tw=g1

In [52]:
tw

([1.2444097867701127,
  4.894137446674277,
  1.6570476113777006,
  1.5283991383298567,
  2.26651264037831,
  2.1082002427512663,
  1.3178106853314722,
  1.410460132528376],
 [4.780764256259536,
  36.13580470772114,
  6.007562121409623,
  5.318623435594565,
  13.661241889708052,
  7.76857351563578,
  4.158811970509429,
  6.505131040062277])

In [53]:
two = comb_ords_preds(h1[1], order1, e1)

In [54]:
two["cv"] = 2

## CV 3

In [55]:
b2 = make_features(cluster_lst)
d2 = rearrange(ex[2],b2)
e2 = y_capture(ex[2],b2)
order2 = order_capture(ex[2],b2)
f2 = make_clust_eq(d2)
g2 = rfr(f2,e2)
h2 = preds(f2,e2)

In [56]:
th=g2

In [57]:
th

([1.1600648216467442,
  5.801173161743722,
  1.6858923900879323,
  1.5166855623135271,
  2.37289015000814,
  2.03525638376589,
  1.255990183746004,
  1.5308190628072926],
 [5.606010799447317,
  20.53914721301718,
  5.98892396925996,
  5.399105016719821,
  7.638586811845952,
  7.731757388082069,
  4.679138101586994,
  5.947183005159413])

In [58]:
three = comb_ords_preds(h2[1], order2, e2)

In [59]:
three["cv"] = 3

## CV 4

In [60]:
b3 = make_features(cluster_lst)
d3 = rearrange(ex[3],b3)
e3 = y_capture(ex[3],b3)
order3 = order_capture(ex[3],b3)
f3 = make_clust_eq(d3)
g3 = rfr(f3,e3)
h3 = preds(f3,e3)

In [61]:
fo = g3

In [62]:
four = comb_ords_preds(h3[1], order3, e3)

In [63]:
four["cv"] = 4

## CV 5

In [64]:
b4 = make_features(cluster_lst)
d4 = rearrange(ex[4],b4)
e4 = y_capture(ex[4],b4)
order4 = order_capture(ex[4],b4)
f4 = make_clust_eq(d4)
g4 = rfr(f4,e4)
h4 = preds(f4,e4)

In [65]:
fi = g4

In [66]:
five = comb_ords_preds(h4[1], order4, e4)

In [67]:
five["cv"] = 5

## Adding a Bias Column to the CV Table

In [68]:
all_cv = pd.concat([one, two, three, four, five])

In [69]:
all_cv["bias"] = all_cv["RFR1"] - all_cv["Y_test"]
all_cv.groupby("Cluster").mean()

Unnamed: 0_level_0,Order_Num,RFR1,Y_test,cv,bias
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,148090.747783,2325.090627,2325.10407,2.885986,-0.013443
1,100976.51634,2169.117336,2171.102326,2.191176,-1.98499
2,83886.128873,2345.459833,2345.462606,2.906644,-0.002773
3,71203.811346,2342.293086,2342.247686,2.887654,0.045401
4,79208.325758,2262.865344,2263.811,2.287584,-0.945656
5,67549.166037,2321.103969,2321.083901,2.850985,0.020068
6,48981.864166,2349.278112,2349.285691,3.228815,-0.00758
7,150036.290943,2337.547288,2337.378366,2.575523,0.168923


## RMSE averages

In [83]:
rmls = [on[1], tw[1], th[1], fo[1], fi[1]]

In [86]:
rmse_df = pd.DataFrame(on[1])

In [93]:
rmse_df["1"] = tw[1]
rmse_df["2"] = th[1]
rmse_df["3"] = fo[1]
rmse_df["4"] = fi[1]

In [95]:
rmse_df["avg"] = np.mean(rmse_df, axis =1)

In [96]:
rmse_df

Unnamed: 0,0,1,2,3,4,avg
0,4.099864,4.780764,5.606011,3.742733,3.916765,4.429228
1,22.234496,36.135805,20.539147,18.847833,14.553259,22.462108
2,5.269306,6.007562,5.988924,5.442247,4.590867,5.459781
3,5.114138,5.318623,5.399105,4.716471,5.797535,5.269174
4,8.395583,13.661242,7.638587,10.470499,5.554151,9.144012
5,6.278544,7.768574,7.731757,8.024167,4.718473,6.904303
6,4.026153,4.158812,4.679138,3.772253,3.714001,4.070072
7,4.840192,6.505131,5.947183,4.366396,9.384188,6.208618


### Editing Functions

#### Fixing the Y capture function

In [115]:
y_dict = {}
for idx in ex[0].keys(): #going into each of the keys 
    train_index, test_index = ex[0][idx]
    y_train = b[idx][9][train_index]
    y_test = b[idx][9][test_index]
    y_dict[idx] = (y_train, y_test)

#### Fixing Rearrange Function

In [None]:
## fixing rearrange function
reshaped_data = {}
for idx in ex[0].keys(): #going into each of the keys 
    train_index, test_index = ex[0][idx]
    feat_values = []
    for i in range(len(b[idx])-3): ##added a "-3" to get rid of last three elements but idk that will acutally work
        re_values = []    
        
        train_arr = b[idx][i][train_index]
        train=np.array(train_arr)
        train.reshape(-1,1)
        
        test_arr = b[idx][i][test_index]
        test = np.array(test_arr)
        test.reshape(-1,1)
        
        re_values.append(train)
        re_values.append(test)
        feat_values.append(re_values)
        
    reshaped_data[idx] = feat_values

In [7]:
#b

In [156]:
cv_split(cr_lst)[0]

{0: [Index([    57,     58,     59,     60,     61,     62,     63,     64,     65,
             66,
         ...
         208807, 208808, 208809, 208810, 208811, 208812, 208813, 208814, 208815,
         208816],
        dtype='int64', length=39074),
  Index([  5851,   5852,   5853,   5854,   5855,   5856,   5857,   5858,   5859,
           5860,
         ...
         209472, 209473, 209474, 209475, 209476, 209477, 209478, 209479, 209480,
         209481],
        dtype='int64', length=7891)],
 1: [Index([   228,    229,    230,    231,    232,    233,    234,    235,    236,
            237,
         ...
         211678, 211679, 211680, 211681, 211682, 211683, 211684, 211685, 211686,
         211687],
        dtype='int64', length=29255),
  Index([  6576,   6577,   6578,   6579,   6580,   6581,   6582,   6583,   6584,
           6585,
         ...
         205690, 205691, 205692, 205693, 205694, 205695, 205696, 205697, 205698,
         205699],
        dtype='int64', length=8143)],
 2

In [153]:
b[0][0]

115954    33.8840
115955    33.8810
115956    33.9180
115957    34.0530
115958    34.4340
           ...   
232418    34.6728
232419    34.6777
232420    34.6830
232421    34.6822
232422    34.6826
Name: S, Length: 24804, dtype: float64

In [98]:
len(reshaped_data[0][0][0]), len(reshaped_data[0][0][1])

(39074, 7891)

In [69]:
train_index

Index([    57,     58,     59,     60,     61,     62,     63,     64,     65,
           66,
       ...
       208807, 208808, 208809, 208810, 208811, 208812, 208813, 208814, 208815,
       208816],
      dtype='int64', length=39074)

In [74]:
test_index

Index([  5851,   5852,   5853,   5854,   5855,   5856,   5857,   5858,   5859,
         5860,
       ...
       209472, 209473, 209474, 209475, 209476, 209477, 209478, 209479, 209480,
       209481],
      dtype='int64', length=7891)

In [73]:
b[0][0].index

Index([    57,     58,     59,     60,     61,     62,     63,     64,     65,
           66,
       ...
       209472, 209473, 209474, 209475, 209476, 209477, 209478, 209479, 209480,
       209481],
      dtype='int64', length=46965)

In [75]:
b[0][1].index

Index([    57,     58,     59,     60,     61,     62,     63,     64,     65,
           66,
       ...
       209472, 209473, 209474, 209475, 209476, 209477, 209478, 209479, 209480,
       209481],
      dtype='int64', length=46965)