In [2]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [18]:
ds = pd.read_csv('formatted_data - Sheet1.csv')
ds.drop(labels=['Sample Name','Sample Number','File Name'], inplace=True, axis=1)
ds.round(0)
print(len(ds))
ds.dropna(inplace=True)
print(ds.head())
print(len(ds))
print(ds.iloc[0])

3208
   True R  True G  True B  Observed R  Observed G  Observed B  Red R  Red G  \
0     250     226     216       211.0       185.0       173.0  191.0   48.0   
1     250     226     216       185.0       174.0       172.0  169.0   37.0   
2     249     231     215       254.0       254.0       239.0  176.0   53.0   
3     249     231     215       234.0       231.0       222.0  181.0   72.0   
4     249     231     215       214.0       211.0       202.0  164.0   60.0   

   Red B  Green R  Green G  Green B  Blue R  Blue G  Blue B  
0   47.0      6.0     76.0     35.0    51.0    60.0   114.0  
1   40.0      4.0     65.0     30.0    36.0    48.0   101.0  
2   48.0     50.0    106.0     69.0    29.0    52.0   123.0  
3   73.0     63.0    116.0     92.0    57.0    78.0   148.0  
4   62.0     51.0    104.0     80.0    45.0    66.0   133.0  
3174
True R        250.0
True G        226.0
True B        216.0
Observed R    211.0
Observed G    185.0
Observed B    173.0
Red R         191.0
Red

In [19]:
X = ds.iloc[:, 3:].values
y = ds.iloc[:, 0:3].values

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train

array([[239., 182., 164., ..., 131., 143., 192.],
       [204., 169., 157., ...,  15.,  30.,  85.],
       [197., 144., 165., ...,  34.,  63.,  95.],
       ...,
       [186., 170., 191., ...,  32.,  52., 114.],
       [198., 156., 144., ...,   3.,  24.,  90.],
       [195., 162., 158., ...,  90., 108., 150.]])

In [3]:
# round all values to nearest whole number
# exact accuracy
# within 5 accuracy
def accuracy(preds, y_test):
  total_correct = 0
  total_within_5 = 0
  ind_correct = 0
  ind_within_5 = 0
  for i, pred in enumerate(preds):
      pred = np.round(pred)
      actual = y_test[i]
      if np.array_equal(pred, actual):
          total_correct += 1
      if np.all(np.abs(pred - actual) <= 5):
          total_within_5 += 1
      ind_results = np.abs(pred - actual)
      # print(ind_results)
      ind_correct += np.sum(ind_results == 0)
      ind_within_5 += np.sum(ind_results <= 5)

  print(f'Exact accuracy: {total_correct / len(preds)}')
  print(f'Within 5 accuracy: {total_within_5 / len(preds)}')
  print(f'Individual exact accuracy: {ind_correct / len(preds) / 3}')

  return total_correct / len(preds), total_within_5 / len(preds), ind_correct / len(preds) / 3

# MLP

In [21]:
mlpr = MLPRegressor(
    hidden_layer_sizes=(200, 200, 200, 200),
    activation='relu',
    solver='adam',
    max_iter=5000,
    random_state=42)


In [22]:
mlpr.fit(X_train, y_train)

In [23]:
r2 = mlpr.score(X_test, y_test) # this gives r2_score, coefficient of determination, not accuracy
mlp_preds = mlpr.predict(X_test)
print(r2)

0.6516752220222485


In [24]:
mse = mean_squared_error(y_test, mlp_preds)
print(mse)
rmse = np.sqrt(mse)
print(rmse)

330.33608785079383
18.175150284132283


In [25]:
accuracy(mlp_preds, y_test)
# getting 11.02% within-5 acc on all data w/ (4x200, relu, adam, 42), 0 exact, 3.5% individual exact accuracy

Exact accuracy: 0.0
Within 5 accuracy: 0.11023622047244094
Individual exact accuracy: 0.03517060367454068


(0.0, 0.11023622047244094, np.float64(0.03517060367454068))

# Random Forest

In [26]:
from sklearn.ensemble import RandomForestRegressor


In [27]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [28]:
rf_preds = rf.predict(X_test)
r2 = rf.score(X_test, y_test)
print(r2)

0.7862679764793489


In [29]:
mse = mean_squared_error(y_test, rf_preds)
print(mse)
rmse = np.sqrt(mse)
print(rmse)

211.58806005249346
14.546066824145058


In [30]:
accuracy(rf_preds, y_test)
# got 36.7% within 5, 3.5% exact on (100, 42), and only barely better on (200, 42), and 13.3% individual exact

Exact accuracy: 0.03464566929133858
Within 5 accuracy: 0.3637795275590551
Individual exact accuracy: 0.13280839895013122


(0.03464566929133858, 0.3637795275590551, np.float64(0.13280839895013122))

# XGBoost

In [31]:
from xgboost import XGBRegressor, XGBClassifier

In [32]:
xgb = XGBRegressor(n_estimators=100, random_state=42)
xgb.fit(X_train, y_train)

In [33]:
xgb_preds = xgb.predict(X_test)
r2 = xgb.score(X_test, y_test)
print(r2)

0.7653927206993103


In [34]:
mse = mean_squared_error(y_test, xgb_preds)
print(mse)
rmse = np.sqrt(mse)
print(rmse)

228.7204132080078
15.12350532145269


In [35]:
accuracy(xgb_preds, y_test)
# 35% within-5 on defaults, 1.7% exact accuracy, 11.3% individual exact accuracy

Exact accuracy: 0.01732283464566929
Within 5 accuracy: 0.34960629921259845
Individual exact accuracy: 0.11286089238845144


(0.01732283464566929, 0.34960629921259845, np.float64(0.11286089238845144))

# CV Tests

In [10]:
test_data = pd.read_csv('formatted_data - Sheet1.csv')
test_data.round(0)
test_data.drop(labels=['Sample Name','File Name'], inplace=True, axis=1)

print(len(test_data))
test_data.dropna(inplace=True)
print(test_data.head())
print(len(test_data))
print(test_data.iloc[0])

cvX = test_data.iloc[:, 6:].values
cvy = test_data.iloc[:, 3:6].values
cvX_train, cvX_test, cvy_train, cvy_test = train_test_split(cvX, cvy, test_size=0.2, random_state=42)

3208
  Sample Number  True R  True G  True B  Observed R  Observed G  Observed B  \
0       2001-2C     250     226     216       211.0       185.0       173.0   
1       2001-2C     250     226     216       185.0       174.0       172.0   
2       7002_17     249     231     215       254.0       254.0       239.0   
3       7002_17     249     231     215       234.0       231.0       222.0   
4       7002_17     249     231     215       214.0       211.0       202.0   

   Red R  Red G  Red B  Green R  Green G  Green B  Blue R  Blue G  Blue B  
0  191.0   48.0   47.0      6.0     76.0     35.0    51.0    60.0   114.0  
1  169.0   37.0   40.0      4.0     65.0     30.0    36.0    48.0   101.0  
2  176.0   53.0   48.0     50.0    106.0     69.0    29.0    52.0   123.0  
3  181.0   72.0   73.0     63.0    116.0     92.0    57.0    78.0   148.0  
4  164.0   60.0   62.0     51.0    104.0     80.0    45.0    66.0   133.0  
3174
Sample Number    2001-2C
True R               250
True G   

In [11]:
print(cvX.shape)
print(cvy.shape)
test_data['Sample Number']

(3174, 10)
(3174, 3)


Unnamed: 0,Sample Number
0,2001-2C
1,2001-2C
2,7002_17
3,7002_17
4,7002_17
...,...
3203,P480-5
3204,P460-6
3205,P460-6
3206,P490-7


In [53]:
# xgboost with cross-validation
from sklearn.model_selection import GroupKFold, KFold
from sklearn.model_selection import cross_val_score, cross_validate

results = []

gkf = GroupKFold(n_splits=5)
kf = KFold(n_splits=5)
for train, test in gkf.split(cvX, cvy, groups=test_data['Sample Number']):

# for train, test in kf.split(cvX, cvy):
    cX_train, cX_test = cvX[train], cvX[test]
    cy_train, cy_test = cvy[train], cvy[test]
    # print(cX_train)
    # print(cy_train)
    # print(cX_test)
    # print(cy_test)
    xgb = XGBRegressor(n_estimators=100, random_state=42)
    xgb.fit(cX_train, cy_train)
    xgb_preds = xgb.predict(cX_test)
    r2 = xgb.score(cX_test, cy_test)
    mse = mean_squared_error(cy_test, xgb_preds)
    rmse = np.sqrt(mse)
    acc, w5_acc, ind_acc = accuracy(xgb_preds, cy_test)
    results.append([r2, mse, rmse, acc, w5_acc, ind_acc])
    print(results[-1])




Exact accuracy: 0.0
Within 5 accuracy: 0.09606299212598425
Individual exact accuracy: 0.042519685039370085
[0.8107564058340585, 234.3741750083946, np.float64(15.309283948258148), 0.0, 0.09606299212598425, np.float64(0.042519685039370085)]
Exact accuracy: 0.0
Within 5 accuracy: 0.07086614173228346
Individual exact accuracy: 0.03727034120734908
[0.7623060149656479, 343.3597905800027, np.float64(18.529970064196075), 0.0, 0.07086614173228346, np.float64(0.03727034120734908)]
Exact accuracy: 0.0
Within 5 accuracy: 0.048818897637795275
Individual exact accuracy: 0.03727034120734908
[0.7570266647574635, 476.73378793967066, np.float64(21.834234310817283), 0.0, 0.048818897637795275, np.float64(0.03727034120734908)]
Exact accuracy: 0.0
Within 5 accuracy: 0.12755905511811025
Individual exact accuracy: 0.044619422572178484
[0.7295216561043837, 278.7153533186129, np.float64(16.69477023856911), 0.0, 0.12755905511811025, np.float64(0.044619422572178484)]
Exact accuracy: 0.0
Within 5 accuracy: 0.04258

In [57]:
scores = cross_validate(xgb, cvX, cvy, scoring=('r2','neg_mean_squared_error', 'neg_root_mean_squared_error', ), cv=gkf, groups=test_data['Sample Number'])

print(scores)
print(np.mean(scores['test_neg_mean_squared_error']))
print(np.mean(scores['test_neg_root_mean_squared_error']))
print(np.mean(scores['test_r2']))

{'fit_time': array([2.04668593, 0.63718152, 0.59016442, 0.61888218, 0.60968184]), 'score_time': array([0.00825191, 0.00836301, 0.00837493, 0.00827932, 0.00809789]), 'test_r2': array([0.81075641, 0.76230601, 0.75702666, 0.72952166, 0.79290702]), 'test_neg_mean_squared_error': array([-234.37417501, -343.35979058, -476.73378794, -278.71535332,
       -315.93964033]), 'test_neg_root_mean_squared_error': array([-15.19234527, -18.14375656, -21.64042901, -16.47348626,
       -17.69174154])}
-329.8245494344691
-17.828351727514878
0.7705035526079259


In [17]:
xgbrf = XGBRFRegressor(n_estimators=100, random_state=42)
gbrf_cv = cross_validate(xgbrf, cvX, cvy, scoring=('r2','neg_mean_squared_error', 'neg_root_mean_squared_error'), cv=gkf, groups=test_data['Sample Number'])
print(xgbrf_cv)

{'fit_time': array([3.39627743, 0.65032458, 0.69937468, 0.69822311, 0.64615321]), 'score_time': array([0.00890732, 0.00844216, 0.00907373, 0.01010561, 0.00954247]), 'test_r2': array([0.74673749, 0.70238458, 0.67595235, 0.66636765, 0.74531443]), 'test_neg_mean_squared_error': array([-314.96745021, -430.70929046, -637.06406747, -355.21274286,
       -386.66577863]), 'test_neg_root_mean_squared_error': array([-17.69611675, -20.50066827, -24.98343708, -18.69034085,
       -19.57561832])}


In [58]:
# RF with cross-validation
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import cross_val_score, cross_validate

results = []

gkf = GroupKFold(n_splits=5)
for train, test in gkf.split(cvX, cvy, groups=test_data['Sample Number']):
    cX_train, cX_test = cvX[train], cvX[test]
    cy_train, cy_test = cvy[train], cvy[test]
    # print(cX_train)
    # print(cy_train)
    # print(cX_test)
    # print(cy_test)
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(cX_train, cy_train)
    rf_preds = rf.predict(cX_test)
    r2 = rf.score(cX_test, cy_test)
    mse = mean_squared_error(cy_test, rf_preds)
    rmse = np.sqrt(mse)
    acc, w5_acc, ind_acc = accuracy(rf_preds, cy_test)
    results.append([r2, mse, rmse, acc, w5_acc, ind_acc])
    print(results[-1])

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf_cv = cross_validate(rf, cvX, cvy, scoring=('r2','neg_mean_squared_error', 'neg_root_mean_squared_error'), cv=gkf, groups=test_data['Sample Number'])
print(rf_cv)


Exact accuracy: 0.0
Within 5 accuracy: 0.09763779527559055
Individual exact accuracy: 0.04146981627296588
[0.7967891037556342, 253.2612569398427, np.float64(15.91418414307949), 0.0, 0.09763779527559055, np.float64(0.04146981627296588)]
Exact accuracy: 0.0
Within 5 accuracy: 0.07086614173228346
Individual exact accuracy: 0.03307086614173228
[0.7724293913710779, 325.0477112441163, np.float64(18.02907960058184), 0.0, 0.07086614173228346, np.float64(0.03307086614173228)]
Exact accuracy: 0.0
Within 5 accuracy: 0.03464566929133858
Individual exact accuracy: 0.03412073490813648
[0.7038476669206738, 579.7620513027831, np.float64(24.078248509864316), 0.0, 0.03464566929133858, np.float64(0.03412073490813648)]
Exact accuracy: 0.0
Within 5 accuracy: 0.09763779527559055
Individual exact accuracy: 0.04199475065616798
[0.7156997800867694, 305.6025610288424, np.float64(17.481491956604916), 0.0, 0.09763779527559055, np.float64(0.04199475065616798)]
Exact accuracy: 0.0
Within 5 accuracy: 0.0993690851735

In [59]:

print(np.mean(rf_cv['test_neg_mean_squared_error']))
print(np.mean(rf_cv['test_neg_root_mean_squared_error']))
print(np.mean(rf_cv['test_r2']))

-360.3350540035375
-18.645057626970424
0.7534131370433655


# XGBRF

In [16]:
from xgboost import XGBRFRegressor

In [45]:
xgbrf = XGBRFRegressor(n_estimators=100, random_state=42)
xgbrf.fit(X_train, y_train)

In [46]:
xgbrf_preds = xgbrf.predict(X_test)
r2 = xgbrf.score(X_test, y_test)
print(r2)

0.6838158965110779


In [47]:
mse = mean_squared_error(y_test, xgbrf_preds)
print(mse)
rmse = np.sqrt(mse)
print(rmse)

320.5004577636719
17.902526574862886


In [48]:
accuracy(xgbrf_preds, y_test)

Exact accuracy: 0.0015748031496062992
Within 5 accuracy: 0.05511811023622047
Individual exact accuracy: 0.03727034120734908


(0.0015748031496062992, 0.05511811023622047, np.float64(0.03727034120734908))

# Linear Regression

In [40]:
from sklearn.linear_model import LinearRegression

In [41]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [42]:
lr_preds = lr.predict(X_test)
r2 = lr.score(X_test, y_test)
print(r2)

0.5694827475010307


In [43]:
mse = mean_squared_error(y_test, lr_preds)
print(mse)
rmse = np.sqrt(mse)
print(rmse)

426.0740589650895
20.64156144687435


In [44]:
accuracy(lr_preds, y_test)

Exact accuracy: 0.0
Within 5 accuracy: 0.04566929133858268
Individual exact accuracy: 0.01889763779527559


(0.0, 0.04566929133858268, np.float64(0.01889763779527559))

# DL?