# Creating Overlapping Clusters 

In [None]:
# import libraries 
import pandas as pd
import numpy as np
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt
from scipy.stats import randint 

from sklearn.mixture import GaussianMixture 
from sklearn.model_selection import RandomizedSearchCV, train_test_split, GroupShuffleSplit, cross_val_score,KFold, GridSearchCV

from sklearn.metrics import root_mean_squared_error, mean_squared_error, silhouette_score, accuracy_score
from sklearn.ensemble import RandomForestRegressor

In [None]:
# load datasets
# ta_ed: main observation table; later cells read (among others) its Latitude, SST, SSS,
# "Bottom Depth", Longitude, Cluster, order, Cruise and TA columns.
ta_ed = pd.read_csv("~/hollings/data/ta_orderconv.csv")
# tst1: only used by the small overlap map further down, which reads its Longitude,
# Latitude and overlap_cl columns.
# NOTE(review): presumably an earlier export of the overlap table — confirm.
tst1 = pd.read_csv("~/hollings/data/8clovlap.csv")

In [None]:
#OPTIMAL CLUSTER ALG: 8 CLUSTERS,NO LON, DEPTH as an additional predictor
gmm_feat = ["Latitude", "SST", "SSS", "Bottom Depth"]
gmm = GaussianMixture(n_components = 8, covariance_type = "full", random_state = 42)
gmm.fit(ta_ed[gmm_feat])

# Hard component label and per-component membership probability for every row of ta_ed.
components = gmm.predict(ta_ed[gmm_feat])
prob = gmm.predict_proba(ta_ed[gmm_feat])

# Everything below selects on ta_ed["Cluster"]. If the CSV already carries that column it is
# left untouched; otherwise fall back to the labels just predicted instead of a KeyError.
if "Cluster" not in ta_ed.columns:
    ta_ed["Cluster"] = components

# One DataFrame per cluster label 0..7 (position k in the list holds label k).
cluster_lst = [ta_ed.loc[ta_ed["Cluster"] == label] for label in range(8)]

# Keep the individual names available for any cell that refers to them directly.
clust1, clust2, clust3, clust4, clust5, clust6, clust7, clust8 = cluster_lst

### Functions

In [None]:
def root_mean_squared_error(y_true, y_pred):
    """Return the root of ``mean_squared_error(y_true, y_pred)``.

    This definition replaces the same-named function imported from
    ``sklearn.metrics`` at the top of the file.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)
    
def make_features(c_lst):
    """Collect the model inputs of every cluster frame into positional lists.

    Parameters
    ----------
    c_lst : iterable of DataFrame-like objects, one per cluster.

    Returns
    -------
    dict mapping the cluster's position in ``c_lst`` to an 11-element list:
        0 S, 1 PT, 2 Nitrate, 3 AOU, 4 Silicate,
        5 cos(Longitude - 20 deg), 6 cos(Longitude - 110 deg),
        7 Depth, 8 order, 9 TA (target), 10 Cruise.
    """
    plain_columns = ["S", "PT", "Nitrate", "AOU", "Silicate"]
    ft_dict = {}
    for position, frame in enumerate(c_lst):
        entries = [frame[name] for name in plain_columns]
        # Latitude is deliberately not part of the feature list.
        for offset_deg in (20, 110):
            shifted = frame["Longitude"] - offset_deg
            entries.append(np.cos(np.deg2rad(shifted)))
        for name in ("Depth", "order", "TA", "Cruise"):
            entries.append(frame[name])
        ft_dict[position] = entries
    return ft_dict

#TRAIN-TEST SPLIT: split the data for each cluster
def split_data(ft_dict):
    """Create one grouped 80/20 train/test split per cluster.

    Parameters
    ----------
    ft_dict : dict as returned by ``make_features``; each value is the
        11-element list whose position 9 is TA (target) and position 10 is
        Cruise.

    Returns
    -------
    dict mapping each cluster key to ``[train_positions, test_positions]``
    (the index arrays yielded by ``GroupShuffleSplit.split``).
    """
    split_dict = {}
    for idx, features in ft_dict.items(): #this is going thru each cluster
        # Positions follow make_features: 8 = order, 9 = TA, 10 = Cruise.
        # The groups must be the cruise labels so that no cruise ends up on
        # both sides of the split (previously TA values were passed as groups).
        X, y, cruise = features[0], features[9], features[10]
        clustersp = GroupShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 40)
        train_ind, test_ind = next(clustersp.split(X, y, groups=cruise))
        split_dict[idx] = [train_ind, test_ind]
    return split_dict

#drop order, y and cruise here (they are the last three entries and are not predictors)
def rearrange(splitted_data, ft_dict, n_drop=3):
    """Slice every predictor of every cluster into its train and test part.

    Parameters
    ----------
    splitted_data : dict mapping cluster key -> ``[train_positions, test_positions]``.
    ft_dict : dict mapping cluster key -> list of per-feature sequences.
    n_drop : number of trailing entries of each feature list to ignore
        (default 3, i.e. the order / TA / Cruise entries appended by
        ``make_features``).

    Returns
    -------
    dict mapping cluster key -> list with one ``[train_array, test_array]``
    pair (1-D numpy arrays) per kept feature, in feature order.
    """
    reshaped_data = {}
    for idx, (train_index, test_index) in splitted_data.items():
        columns = ft_dict[idx]
        # Guard against a negative count so short lists yield no features
        # instead of an accidental negative slice.
        n_keep = max(len(columns) - n_drop, 0)
        feat_values = []
        for column in columns[:n_keep]:
            values = np.array(column)
            feat_values.append([values[train_index], values[test_index]])
        reshaped_data[idx] = feat_values
    return reshaped_data

def y_capture(splitted_data, ft_dict):
    """Return the train/test slices of entry 9 (TA) for every cluster.

    ``splitted_data`` maps cluster key -> ``[train_positions, test_positions]``;
    the result maps the same keys to ``(y_train, y_test)`` obtained with
    positional ``.iloc`` indexing.
    """
    y_dict = {}
    for idx, (train_index, test_index) in splitted_data.items():
        target = ft_dict[idx][9]
        y_dict[idx] = (target.iloc[train_index], target.iloc[test_index])
    return y_dict

def order_capture(splitted_data, ft_dict):
    """Return the train/test slices of entry 8 (order) for every cluster.

    ``splitted_data`` maps cluster key -> ``[train_positions, test_positions]``;
    the result maps the same keys to ``(order_train, order_test)`` obtained
    with positional ``.iloc`` indexing.
    """
    order_dict = {}
    for idx, (train_index, test_index) in splitted_data.items():
        order_col = ft_dict[idx][8]
        order_dict[idx] = (order_col.iloc[train_index], order_col.iloc[test_index])
    return order_dict
    
def make_clust_eq(reshaped_dict):
    """Stack the per-feature train/test arrays of each cluster into matrices.

    Parameters
    ----------
    reshaped_dict : dict mapping cluster key -> list of
        ``[train_array, test_array]`` pairs, one pair per feature.

    Returns
    -------
    dict mapping cluster key -> ``[train_matrix, test_matrix]`` where each
    matrix is ``np.transpose`` of the list of feature arrays (one column per
    feature). A cluster without features yields two empty arrays.
    """
    tpose_dt = {}
    for idx, pairs in reshaped_dict.items(): #going into each cluster
        train_cols = [pair[0] for pair in pairs]
        test_cols = [pair[1] for pair in pairs]
        # Transpose once per cluster, from this cluster's own arrays only, so
        # an empty cluster can never pick up the previous cluster's matrices.
        tpose_dt[idx] = [np.transpose(train_cols), np.transpose(test_cols)]
    return tpose_dt

def rfr_testset(tpose_dt, y_dict):
    """Fit a default random forest per cluster and return the test-set RMSEs.

    Parameters
    ----------
    tpose_dt : dict mapping cluster key -> ``[X_train, X_test]``.
    y_dict : dict mapping cluster key -> ``(y_train, y_test)``.

    Returns
    -------
    list of RMSE values, one per cluster in ``tpose_dt`` iteration order.
    """
    rmse_lst = []
    for idx in tpose_dt:
        X_train, X_test = tpose_dt[idx][0], tpose_dt[idx][1]
        y_train, y_test = y_dict[idx][0], y_dict[idx][1]
        rf = RandomForestRegressor()
        rf.fit(X_train, y_train)
        # Predict on the held-out features so the predictions line up with
        # y_test (previously the training features were used here).
        y_pred = rf.predict(X_test)
        rmse_lst.append(root_mean_squared_error(y_test, y_pred))
    return rmse_lst

def rfr_rflst(tpose_dt, y_dict):
    """Fit a default random forest per cluster; keep models and test predictions.

    Parameters
    ----------
    tpose_dt : dict mapping cluster key -> ``[X_train, X_test]``.
    y_dict : dict mapping cluster key -> ``(y_train, y_test)``.

    Returns
    -------
    (rmse_lst, rf_lst, norm_preds) where ``rmse_lst`` holds the test RMSE per
    cluster, ``rf_lst`` the fitted regressors in the same order, and
    ``norm_preds`` maps cluster key -> test-set predictions.
    """
    rmse_lst = []
    rf_lst = []
    norm_preds = {}
    for idx in tpose_dt:
        X_train, X_test = tpose_dt[idx][0], tpose_dt[idx][1]
        y_train, y_test = y_dict[idx][0], y_dict[idx][1]
        rf = RandomForestRegressor()
        rf.fit(X_train, y_train)
        rf_lst.append(rf)
        # Predict on the held-out features so the predictions line up with
        # y_test (previously the training features were used here).
        y_pred = rf.predict(X_test)
        rmse_lst.append(root_mean_squared_error(y_test, y_pred))
        norm_preds[idx] = y_pred
    return rmse_lst, rf_lst, norm_preds

def rfr(tpose_dt, y_dict):
    """Fit a default random forest per cluster and report train and test RMSE.

    ``tpose_dt`` maps cluster key -> ``[X_train, X_test]`` and ``y_dict`` maps
    the same keys -> ``(y_train, y_test)``. Returns two lists (train RMSEs,
    test RMSEs) in ``tpose_dt`` iteration order.
    """
    train_rmse_lst, tst_rmse_lst = [], []
    for idx in tpose_dt:
        matrices, targets = tpose_dt[idx], y_dict[idx]
        model = RandomForestRegressor()
        model.fit(matrices[0], targets[0])

        fitted_on_train = model.predict(matrices[0])
        train_rmse_lst.append(root_mean_squared_error(targets[0], fitted_on_train))

        fitted_on_test = model.predict(matrices[1])
        tst_rmse_lst.append(root_mean_squared_error(targets[1], fitted_on_test))
    return train_rmse_lst, tst_rmse_lst

def rfr_cv(tpose_dt, y_dict):
    """Fit a default random forest per cluster; return RMSEs and the models.

    ``tpose_dt`` maps cluster key -> ``[X_train, X_test]`` and ``y_dict`` maps
    the same keys -> ``(y_train, y_test)``. Returns
    ``(train_rmse_lst, tst_rmse_lst, rf_lst)``, all in ``tpose_dt`` iteration
    order; ``rf_lst`` holds the fitted regressors.
    """
    train_rmse_lst, tst_rmse_lst, rf_lst = [], [], []
    for idx in tpose_dt:
        matrices, targets = tpose_dt[idx], y_dict[idx]
        model = RandomForestRegressor()
        # The list stores a reference, so the entry is the fitted model once
        # fit() below has run.
        rf_lst.append(model)
        model.fit(matrices[0], targets[0])

        fitted_on_train = model.predict(matrices[0])
        train_rmse_lst.append(root_mean_squared_error(targets[0], fitted_on_train))

        fitted_on_test = model.predict(matrices[1])
        tst_rmse_lst.append(root_mean_squared_error(targets[1], fitted_on_test))
    return train_rmse_lst, tst_rmse_lst, rf_lst


def rfr_params(tpose_dt, y_dict):
    """Fit a default random forest per cluster; return RMSEs and fit() results.

    ``tpose_dt`` maps cluster key -> ``[X_train, X_test]`` and ``y_dict`` maps
    the same keys -> ``(y_train, y_test)``. Returns
    ``(train_rmse_lst, tst_rmse_lst, rf_lst)`` in ``tpose_dt`` iteration
    order, where ``rf_lst`` collects whatever ``fit`` returned per cluster.
    """
    train_rmse_lst, tst_rmse_lst, rf_lst = [], [], []
    for idx in tpose_dt:
        matrices, targets = tpose_dt[idx], y_dict[idx]
        model = RandomForestRegressor()
        fit_result = model.fit(matrices[0], targets[0])
        rf_lst.append(fit_result)

        fitted_on_train = model.predict(matrices[0])
        train_rmse_lst.append(root_mean_squared_error(targets[0], fitted_on_train))

        fitted_on_test = model.predict(matrices[1])
        tst_rmse_lst.append(root_mean_squared_error(targets[1], fitted_on_test))
    return train_rmse_lst, tst_rmse_lst, rf_lst

#### Full Covariance for GMM used

In [None]:
# Baseline run on the non-overlapping clusters in cluster_lst.
# b: per-cluster feature lists; c: per-cluster [train, test] positions;
# d: per-feature train/test arrays; e: per-cluster (y_train, y_test);
# f: per-cluster [X_train, X_test] matrices; full: (train RMSEs, test RMSEs).
b = make_features(cluster_lst)
c = split_data(b)
d = rearrange(c,b)
e = y_capture(c,b)
f = make_clust_eq(d)
full = rfr(f,e)

In [None]:
# Fixed colour per cluster label so every map uses the same palette.
cluster_colors = {
    0: 'indigo',
    1: 'red',
    2: 'blue',
    3: 'gold',
    4: 'mediumorchid',
    5: 'teal',
    6: 'pink',
    7: 'brown',
}

colors = ta_ed["Cluster"].map(cluster_colors).tolist()

# Create only the figure here. plt.subplots() would also create a plain Axes, and the
# plt.axes(projection=...) call below would then add a second Axes to the same figure,
# leaving a stray blank one in the saved image.
fig = plt.figure(figsize=(14, 14))

ax = plt.axes(projection=ccrs.PlateCarree(central_longitude=180))
ax.coastlines(resolution='110m', color='black', linewidth=1)
ax.add_feature(cfeature.LAND, facecolor = "gainsboro")
ax.add_feature(cfeature.OCEAN)
ax.set_extent([120,-70,-60,60])
ax.scatter(ta_ed["Longitude"], ta_ed["Latitude"], c=colors, s=5, transform=ccrs.PlateCarree())

#ax.scatter(ta_ed["Longitude"], ta_ed["Latitude"],2, c = colors, transform=ccrs.PlateCarree())
ax.set_title("Pacific Ocean Clusters")

# Proxy artists: the scatter is drawn in one call, so the legend entries are built by hand.
handles = [plt.Line2D([0], [0], marker='o', color=color, markersize=5, label=f'Cluster {cluster}') for cluster, color in cluster_colors.items()]
ax.legend(handles=handles, loc='upper right', title='Clusters', framealpha =1, fontsize='15', title_fontsize='15')
#plt.show()
plt.savefig("tst1.png", dpi = 400)

### Making overlapping clusters

In [None]:
# Membership probabilities from the fitted GMM, one column per component, aligned with the
# rows of ta_ed. prob_df was not defined anywhere before this point.
# NOTE(review): columns are the integer positions 0..7 — rename here if the exported CSV
# is expected to carry different probability column names.
prob_df = pd.DataFrame(prob, index=ta_ed.index)
join8=ta_ed.join(prob_df) #og df without any overlaps, size = 179894 rows

##build the overlap: a row is copied into every OTHER cluster whose probability is > 10%
overlap_mask = prob_df.to_numpy() > .10
own_cluster = join8["Cluster"].to_numpy()
# Blank out each row's own cluster column so a point is never duplicated into itself.
overlap_mask &= (np.arange(overlap_mask.shape[1]) != own_cluster[:, None])
# np.nonzero walks the mask row by row, i.e. the same (row, cluster) order as a nested loop.
row_pos, extra_cluster = np.nonzero(overlap_mask)
new_rows = join8.iloc[row_pos].copy()
new_rows["Cluster"] = extra_cluster
join_new = pd.concat([join8, new_rows], ignore_index=True) #new df with overlaps (using the code above) and the size is 233435 rows

In [None]:
# How many rows of join_new share each "order" value (more than 1 => the point was duplicated).
ords_counts = join_new["order"].value_counts()
# Name both columns explicitly ("order", "count") instead of relying on the names that
# value_counts() happens to produce, which differ between pandas versions.
ords_cts = ords_counts.rename_axis("order").reset_index(name="count")
ords_cts = ords_cts.sort_values(by = "order")

In [None]:
# Attach the per-order row count from ords_cts to every row of join_new
# (merge on the shared "order" column, default join type).
tst2 = join_new.merge(ords_cts, on = "order")
#len(tst1[tst1["count"] == 2])+len(tst1[tst1["count"] == 3])+ len(tst1[tst1["count"] == 4])+ len(tst1[tst1["count"] == 1])
#tst2 = ords_cts["count"]

In [None]:
##give the data points that are in several clusters a different color
op = 8  # extra label used for every point that appears in more than one cluster

# count == 1 -> the point lives in a single cluster and keeps that cluster's label;
# anything else is marked with the overlap label. The labels are read from tst2 itself so
# they are guaranteed to line up row-for-row with tst2["count"].
color_assign = np.where(tst2["count"] == 1, tst2["Cluster"], op).tolist()

In [None]:
# Store the per-row colour label (own cluster, or the overlap label) as an integer column.
overlap_labels = pd.Series(color_assign, index=tst2.index)
tst2["overlap_cl"] = overlap_labels.astype("int")

In [None]:
#Map to see the overlapping clusters
# NOTE(review): this map reads tst1 (the CSV loaded at the top of the file), not the tst2
# table built in the cells above — confirm that is intended.
ax = plt.axes(projection=ccrs.PlateCarree(central_longitude=180))
ax.coastlines()
ax.set_extent([120,-70,-60,60])
# Third positional argument (.5) is the marker size; colour comes from the overlap_cl label.
ax.scatter(tst1["Longitude"], tst1["Latitude"],.5, c = tst1["overlap_cl"], transform=ccrs.PlateCarree())
ax.set_title("Overlapping Clusters for 8 Clusters")
#plt.savefig(f'{jdx+1}Clusters.png')
# NOTE(review): set_label sets the Axes object's own label; presumably it does not add any
# visible axis text — confirm whether an axis label or legend was intended.
ax.set_label("Label via method")
#plt.legend()
#plt.show()
plt.savefig("full8cl.png")

#Larger map 
# Same palette as the non-overlapping map, plus label 8 for points in several clusters.
cluster_colors = {
    0: 'indigo',
    1: 'red',
    2: 'blue',
    3: 'gold',
    4: 'mediumorchid',
    5: 'teal',
    6: 'pink',
    7: 'brown',
    8: 'papayawhip',
}

colors = tst2["overlap_cl"].map(cluster_colors).tolist()

# Create only the figure here. plt.subplots() would also create a plain Axes, and the
# plt.axes(projection=...) call below would then add a second Axes to the same figure,
# leaving a stray blank one in the saved image.
fig = plt.figure(figsize=(14, 14))

ax = plt.axes(projection=ccrs.PlateCarree(central_longitude=180))
ax.coastlines(resolution='110m', color='black', linewidth=1)
ax.add_feature(cfeature.LAND, facecolor = "gainsboro")
ax.add_feature(cfeature.OCEAN)
ax.set_extent([120,-70,-60,60])
ax.scatter(tst2["Longitude"], tst2["Latitude"], c=colors, s=5, transform=ccrs.PlateCarree())

#ax.scatter(ta_ed["Longitude"], ta_ed["Latitude"],2, c = colors, transform=ccrs.PlateCarree())
ax.set_title("Overlapping Pacific Ocean Clusters")

# Proxy artists: the scatter is drawn in one call, so the legend entries are built by hand.
handles = [plt.Line2D([0], [0], marker='o', color=color, markersize=5, label=f'Cluster {cluster}') for cluster, color in cluster_colors.items()]
ax.legend(handles=handles, loc='upper right', title='Clusters', framealpha =1, fontsize='15', title_fontsize='15')
#plt.show()
plt.savefig("overlap.png", dpi = 400)

In [None]:
#Downloading a new csv with the updated cluster values
# Written to the current working directory; the DataFrame index is not exported.
tst2.to_csv("8clovlap2.csv", index = False)

### CV Testing 

In [None]:
#Separating the clusters out
# One frame per cluster label 0..7, selected from the overlap-expanded table tst2.
cluster_lsta = [tst2.loc[tst2["Cluster"] == label] for label in range(8)]

# Individual names kept for cells that refer to a single cluster directly.
(clust1a, clust2a, clust3a, clust4a,
 clust5a, clust6a, clust7a, clust8a) = cluster_lsta

In [None]:
### Map of Overlapping Clusters Separated Out
# FULL COV 
# One map per cluster frame; points are coloured by their overlap_cl label.
for idx, data in enumerate(cluster_lsta):
    ax = plt.axes(projection=ccrs.PlateCarree(central_longitude=180))
    ax.coastlines()
    ax.set_extent([120,-70,-60,60])
    ax.scatter(
        data["Longitude"],
        data["Latitude"],
        s=.5,
        c=data["overlap_cl"],
        transform=ccrs.PlateCarree(),
    )
    ax.set_title(f'Overlapped Clustering for Cluster #{idx+1}')
    plt.show()

In [None]:
# Same pipeline as the baseline, now on the overlapping clusters in cluster_lsta.
# ba: per-cluster feature lists; ca: per-cluster [train, test] positions;
# da: per-feature train/test arrays; ea: per-cluster (y_train, y_test);
# orders: per-cluster (order_train, order_test); fa: per-cluster [X_train, X_test].
ba = make_features(cluster_lsta)
ca = split_data(ba)
da = rearrange(ca,ba)
ea = y_capture(ca,ba)
orders = order_capture(ca,ba)
fa = make_clust_eq(da)
# q = (train RMSEs, test RMSEs, fitted regressors); q[2] is used by the CV cells below.
q = rfr_cv(fa,ea)

# par has the same three-part layout as q, from a separate set of fitted forests.
par = rfr_params(fa,ea)

#### Cross Validation for Each Cluster

In [None]:
# for full covariance
# 5-fold cross validation on each cluster's training data, using the regressors in q[2].
for i, cluster_rf in enumerate(q[2]):
    kf = KFold(n_splits = 5)
    X, y = fa[i][0], ea[i][0]
    scores = cross_val_score(cluster_rf, X, y, cv=kf)
    print("Cross Validation Scores: ", scores)
    print("Average CV Score: ", scores.mean())
    print("Number of CV Scores used in Average: ", len(scores))

In [None]:
#for test set full covariance
# Score each cluster's fitted regressor on that cluster's held-out data.
for i, cluster_rf in enumerate(q[2]):
    print(cluster_rf.score(fa[i][1], ea[i][1]))

### Hyperparameter Tuning

In [None]:
def hyp_tr(tpose_dt, y_dict):
    """Search random-forest hyperparameters separately for every cluster.

    Parameters
    ----------
    tpose_dt : dict mapping cluster key -> ``[X_train, X_test]``.
    y_dict : dict mapping cluster key -> ``(y_train, y_test)``.

    Returns
    -------
    dict mapping cluster key -> ``[max_depth, n_estimators, max_features]``,
    i.e. the positional layout that ``best_model`` reads.
    """
    # HalvingGridSearchCV is experimental in scikit-learn: the enabling import
    # has to run before the class can be imported. Neither import exists at
    # the top of the file, so both are done locally.
    from sklearn.experimental import enable_halving_search_cv  # noqa: F401
    from sklearn.model_selection import HalvingGridSearchCV

    # NOTE(review): this grid has 450 * 19 * 8 candidates per cluster — confirm
    # that an exhaustive grid of this size is really intended.
    p_dist = {"n_estimators": range(50,500),
          "max_depth": range(1,20),
          "max_features": range(1,9)}
    hyp_dict = {}
    for idx in tpose_dt:
        rf = RandomForestRegressor()
        rs = HalvingGridSearchCV(rf, param_grid = p_dist, cv=5)
        res = rs.fit(tpose_dt[idx][0], y_dict[idx][0])
        best = res.best_params_
        # Look the values up by name: the key order of best_params_ is not the
        # order in which p_dist was written, so positional access would hand
        # best_model the wrong value for n_estimators / max_features.
        hyp_dict[idx] = [best["max_depth"], best["n_estimators"], best["max_features"]]
    return hyp_dict

In [None]:
# Rebuild the pipeline on the overlapping clusters; this overwrites the b..f values that
# the baseline cell computed from cluster_lst.
b = make_features(cluster_lsta)
c = split_data(b)
d = rearrange(c,b)
e = y_capture(c,b)
f = make_clust_eq(d)
g = rfr(f,e) #takes up a lot of RAM

In [None]:
# h: per-cluster hyperparameter lists from hyp_tr, consumed by best_model below.
h = hyp_tr(f,e)

In [None]:
def best_model(tpose_dt, y_dict, hyp_dict):
    """Refit each cluster with its tuned hyperparameters and score the test set.

    ``hyp_dict[idx]`` is read positionally as
    ``[max_depth, n_estimators, max_features]``. Returns
    ``(best_rmse, best_dict)``: test RMSE per cluster key and the test-set
    predictions per cluster key.
    """
    best_rmse, best_dict = {}, {}
    for idx in tpose_dt:
        chosen = hyp_dict[idx]
        matrices, targets = tpose_dt[idx], y_dict[idx]
        best_rf = RandomForestRegressor(
            max_depth=chosen[0],
            n_estimators=chosen[1],
            max_features=chosen[2],
        )
        best_rf.fit(matrices[0], targets[0])
        best_preds = best_rf.predict(matrices[1])
        best_rmse[idx] = root_mean_squared_error(targets[1], best_preds)
        best_dict[idx] = best_preds
    return best_rmse, best_dict

In [None]:
# i = (test RMSE per cluster, test predictions per cluster) for the tuned forests.
# NOTE(review): `i` is also used as a loop variable in other cells, which overwrites this result.
i = best_model(f,e,h)

In [None]:
### Getting TA Predictions

In [None]:
def preds(tpose_dt, y_dict):
    """Fit a default random forest per cluster and return its predictions.

    ``tpose_dt`` maps cluster key -> ``[X_train, X_test]`` and ``y_dict`` maps
    the same keys -> ``(y_train, y_test)``. Returns
    ``(train_preds, test_preds)``, two dicts keyed like ``tpose_dt``.
    """
    train_preds, test_preds = {}, {}
    for idx in tpose_dt:
        matrices = tpose_dt[idx]
        model = RandomForestRegressor()
        model.fit(matrices[0], y_dict[idx][0])
        train_preds[idx] = model.predict(matrices[0])
        test_preds[idx] = model.predict(matrices[1])
    return train_preds, test_preds

In [None]:
def comb_ords_preds(test_preds, order_dict, y_dict):
    """Combine test-set order numbers, predictions and targets into one table.

    Parameters
    ----------
    test_preds : mapping cluster key -> test-set predictions.
    order_dict : mapping cluster key -> ``(order_train, order_test)``.
    y_dict : mapping cluster key -> ``(y_train, y_test)``.

    Returns
    -------
    DataFrame with columns ``Order_Num``, ``RFR1``, ``Y_test`` and
    ``Cluster``; clusters are concatenated in ``order_dict`` iteration order.
    """
    clnum = []
    ord_tst = []
    pred_tst = []
    actual_y = []
    # Walk the clusters once, using the order_dict argument (not a notebook
    # global), so all four columns are built from the same key in lockstep.
    for idx in order_dict:
        test_orders = order_dict[idx][1]
        ord_tst.extend(test_orders)
        clnum.extend([idx] * len(test_orders))
        actual_y.extend(y_dict[idx][1])
        pred_tst.extend(test_preds[idx])
    comb_df = pd.DataFrame( {"Order_Num": ord_tst,
                             "RFR1": pred_tst,
                             "Y_test": actual_y,
                             "Cluster": clnum})
    return comb_df

In [None]:
# Testing the functions
#first trying on overlapped 8cl, diag cov, no hypertuning
# NOTE(review): the GMM in this file is fitted with covariance_type = "full"; the "diag"
# wording here (and in the export filename below) may be stale — confirm.
trn, tsty = preds(fa,ea)

#checking lengths to make sure there are no issues with test preds

# Test-set size per cluster. A comprehension keeps its loop variable local, so the
# module-level `i` that holds the best_model() result is not overwritten here.
ordlen = [len(orders[key][1]) for key in range(len(orders))]
sum(ordlen)

#for i in range(len(samplepred1[1])):
    #print(len(samplepred1[1][i]))

In [None]:
#Saving the predictions to a new csv (uncomment the next line to do so)
#comb_ords_preds(tsty, orders, ea).to_csv("rfrpredsdiag.csv", index = False)