In [None]:
!pip install -U jupyterlab==3.0.16
!pip install ipywidgets # --user
!pip install sklearn

In [None]:
import sys
# Add the directory two levels above the working directory to the module search
# path (presumably so the project-local `metrics` module can be imported — confirm).
sys.path.append('../../')

In [None]:
# imports
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import math
import numpy as np
import pandas as pd
import metrics
from ast import literal_eval

In [None]:
# define a function to get arrays from dataframe for sklearn training
def get_data(df, cols, target_col='distances'):
    """Split a dataframe into a feature array and a target array.

    Args:
        df: pandas DataFrame containing both the feature columns and the
            target column.
        cols: list of column names to use as features.
        target_col: name of the column holding the regression target.
            Defaults to 'distances', so existing two-argument calls behave
            exactly as before.

    Returns:
        A tuple ``(x, y)`` of numpy arrays, where ``x`` is ``df[cols]``
        converted with ``to_numpy()`` and ``y`` is ``df[target_col]``
        converted the same way.

    Raises:
        KeyError: if a requested feature column or the target column is not
            present in ``df``.
    """
    return df[cols].to_numpy(), df[target_col].to_numpy()

## Try a Random Forest Regressor Without the Visual Features

In [None]:
# baseline experiment: use only the box-coordinate and class columns as features
feature_cols = ['boxes_xmin', 'boxes_ymin', 'boxes_xmax', 'boxes_ymax', 'classes']

# read the train split, then the test split
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

# turn each dataframe into a (features, targets) pair of arrays
(train_x, train_y), (test_x, test_y) = (
    get_data(split_df, feature_cols) for split_df in (train_data, test_data)
)

In [None]:
# try a random forest regressor; random_state pins the bootstrap samples and
# feature sub-sampling so the metrics reported below are the same on every run
regr = RandomForestRegressor(n_estimators=100, random_state=42)
regr.fit(train_x, train_y)

# get predictions for both the train and the test data
train_preds = regr.predict(train_x)
test_preds = regr.predict(test_x)

In [None]:
# calculate and print the same four metrics for the train split, then the test split
for split_idx, (split_title, split_preds, split_targets) in enumerate([
        ('Train Set Metrics', train_preds, train_y),
        ('Test Set Metrics', test_preds, test_y),
]):
    # abs_relative_distance is averaged here; the other metrics are printed as returned
    abs_rel_dist = np.mean(metrics.abs_relative_distance(split_preds, split_targets))
    sq_rel_dist = metrics.sq_relative_distance(split_preds, split_targets)
    rmse = metrics.rmse(split_preds, split_targets)
    log_rmse = metrics.log_rmse(split_preds, split_targets)

    # blank line between the two reports, but not before the first one
    if split_idx > 0:
        print()
    print(split_title)
    print('----------------')
    print('Abs Rel:{:.3f}'.format(abs_rel_dist))
    print('Sq Rel: {:.3f}'.format(sq_rel_dist))
    print('RMSE: {:.3f}'.format(rmse))
    print('RMSE log: {:.3f}'.format(log_rmse))

In [None]:
# plot a histogram of the per-sample test errors
# the relative errors are multiplied in place by the targets before plotting
# (presumably converting them back to metres — confirm against the metrics module)
abs_rel_errors = metrics.abs_relative_distance(test_preds, test_y)
abs_rel_errors *= test_y

fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(abs_rel_errors, bins=100, color='r')
ax.set(title='Error Histogram', ylabel='Frequency', xlabel='Error (m)')
plt.show()

## Let's Try it With the Visual Info
### Reduce the Features with PCA First

In [None]:
# parse the string-encoded feature vectors of each split with literal_eval and
# stack them row-wise into one array per split (train first, then test)
train_fvs, test_fvs = (
    np.stack(split_df['feature_vector'].apply(literal_eval), axis=0)
    for split_df in (train_data, test_data)
)

In [None]:
# perform PCA reduction: fit the projection on the train vectors only, then
# apply that same projection to the test vectors.
# random_state makes the components reproducible whenever sklearn uses a
# randomized or arpack SVD solver.
pca = PCA(n_components=12, random_state=42)
train_fvs_reduced = pca.fit_transform(train_fvs)
test_fvs_reduced = pca.transform(test_fvs)

In [None]:
# name one new column per reduced component and write the components into both dataframes
n_reduced = train_fvs_reduced.shape[1]
new_cols = ['feature_vector_{}'.format(i) for i in range(n_reduced)]

for split_df, reduced in ((train_data, train_fvs_reduced), (test_data, test_fvs_reduced)):
    split_df[new_cols] = reduced

In [None]:
# feature set for the second experiment: the original columns followed by the PCA columns
new_feature_cols = [*feature_cols, *new_cols]

# rebuild the (features, targets) arrays for train and then test;
# this overwrites the arrays used by the first experiment
(train_x, train_y), (test_x, test_y) = (
    get_data(split_df, new_feature_cols) for split_df in (train_data, test_data)
)

In [None]:
# try a random forest regressor on the extended feature set; random_state pins
# the bootstrap samples and feature sub-sampling so results are repeatable
regr = RandomForestRegressor(n_estimators=100, random_state=42)
regr.fit(train_x, train_y)

# get predictions for both the train and the test data
train_preds = regr.predict(train_x)
test_preds = regr.predict(test_x)

In [None]:
# calculate and print the four metrics for each split, train first and test second
for split_idx, (split_title, split_preds, split_targets) in enumerate([
        ('Train Set Metrics', train_preds, train_y),
        ('Test Set Metrics', test_preds, test_y),
]):
    # only abs_relative_distance is averaged with np.mean before printing
    abs_rel_dist = np.mean(metrics.abs_relative_distance(split_preds, split_targets))
    sq_rel_dist = metrics.sq_relative_distance(split_preds, split_targets)
    rmse = metrics.rmse(split_preds, split_targets)
    log_rmse = metrics.log_rmse(split_preds, split_targets)

    # separate the second report from the first with an empty line
    if split_idx > 0:
        print()
    print(split_title)
    print('----------------')
    print('Abs Rel:{:.3f}'.format(abs_rel_dist))
    print('Sq Rel: {:.3f}'.format(sq_rel_dist))
    print('RMSE: {:.3f}'.format(rmse))
    print('RMSE log: {:.3f}'.format(log_rmse))

In [None]:
# plot a histogram of the per-sample test errors for the model with visual features
# the relative errors are multiplied in place by the targets before plotting
# (presumably converting them back to metres — confirm against the metrics module)
abs_rel_errors = metrics.abs_relative_distance(test_preds, test_y)
abs_rel_errors *= test_y

fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(abs_rel_errors, bins=100, color='r')
ax.set(title='Error Histogram', ylabel='Frequency', xlabel='Error (m)')
# fixed axis ranges for this figure
ax.set(xlim=(0, 45), ylim=(0, 165))
plt.show()