In [1]:
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, LassoCV, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler

In [2]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings raised by
# the libraries used below; consider narrowing the filter to the specific
# category that was noisy.
warnings.filterwarnings('ignore')

In [3]:
# Fraction of the samples held out as the test set; results() passes it as
# `test_size` to train_test_split.
div_train_test = 0.3

def print_evaluate(true, predicted):
    """Print MAE, RMSE and R2 for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions, aligned with ``true``.

    Returns
    -------
    None
        The three scores are printed, each rounded to 4 decimals.
    """
    mae = metrics.mean_absolute_error(true, predicted)
    # The MSE is only needed as an intermediate for the RMSE, so compute it once.
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_square = metrics.r2_score(true, predicted)

    print('MAE:', round(mae, 4))
    print('RMSE:', round(rmse, 4))
    print('R2 Square', round(r2_square, 4))
    
def run_all_regressions(X_train, X_test, Y_train, Y_test):
    """Fit each baseline regressor and print its train and test metrics.

    Every estimator is fitted on ``(X_train, Y_train)`` and then scored with
    ``print_evaluate`` on the training data first and the test data second.
    """
    estimators = [
        ("LinearRegression", LinearRegression()),
        # Ridge(.5) was also tried here and is currently disabled.
        ('RandomForest', RandomForestRegressor(random_state=0, n_jobs=-1, max_depth=3)),
    ]
    # (banner, features, targets) for the two evaluation passes, in print order.
    passes = (
        ('[Train] -------------', X_train, Y_train),
        ('[Test] --------------', X_test, Y_test),
    )

    for label, estimator in estimators:
        estimator.fit(X_train, Y_train)

        print(f'\n-----{label}------')
        for banner, inputs, targets in passes:
            print(banner)
            print_evaluate(targets, estimator.predict(inputs))
        
        
def results(X=None, y=None):
    """Report hold-out and cross-validated scores for the baseline regressors.

    Two evaluations are printed:

    1. A single train/test split (``div_train_test`` is the test fraction,
       ``random_state=0``). The features are standardized with statistics
       taken from the training rows only and handed to
       ``run_all_regressions``.
    2. 3-fold cross-validation over all rows for a linear regression and a
       depth-3 random forest, printing the mean and standard deviation of
       R2, MSE and RMSE across the folds.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; its ``.values`` array is what the models see.
    y : pandas.Series
        Target values aligned row-by-row with ``X``.

    Raises
    ------
    ValueError
        If ``X`` or ``y`` is not provided.
    """
    if X is None or y is None:
        raise ValueError('results() needs both X (features) and y (target)')

    folds = 3
    all_metrics = [
        'r2',
        'neg_mean_squared_error',
        'neg_root_mean_squared_error'
    ]

    def render_metrics(scores):
        """Print mean and std of every metric in the cross_validate result dict."""
        for met in all_metrics:
            fold_scores = scores['test_' + met]
            # Error metrics come back negated ("neg_*" scorers); flip the
            # sign of the mean so the printed value is the error itself.
            sign = -1 if 'neg_' in met else 1
            print("%s =\t%0.4f \nstd =\t%0.4f" % (met, sign * fold_scores.mean(), fold_scores.std()))
            print('-' * 30)

    # ---- hold-out evaluation -------------------------------------------
    X_train, X_test, y_train, y_test = train_test_split(X.values,
                                                        y.values,
                                                        test_size=div_train_test,
                                                        random_state=0)

    print('qtd images train: ', len(X_train))
    print('qtd images test: ', len(X_test))

    # Fit the scaler on the training rows only, then reuse it for the test rows.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    run_all_regressions(X_train_scaled, X_test_scaled, y_train, y_test)

    # ---- cross-validation ----------------------------------------------
    # Report the real number of rows instead of a hard-coded count.
    print(f'\n\n - RESULTADO CROSS-VALIDATION COM {len(X)} IMAGENS-')

    cv_models = {
        'linear regression': LinearRegression(),
        # random_state pins the forest so repeated runs print the same scores.
        'random forest': RandomForestRegressor(max_depth=3, n_estimators=100, random_state=0),
    }

    for title, model in cv_models.items():
        print('\n\n' + title)
        # The scaler lives inside the pipeline so it is re-fitted on the
        # training part of every fold; fitting it once on all rows would let
        # the validation fold influence the preprocessing.
        # NOTE(review): an integer `cv` gives consecutive, unshuffled folds,
        # so the row order of X affects these scores.
        render_metrics(cross_validate(make_pipeline(StandardScaler(), model),
                                      X.values,
                                      y.values,
                                      cv=folds,
                                      scoring=all_metrics))
    
def apply_pca_reduce(data, n=None, variance=0.90):
    """Standardize ``data`` and project it onto its leading principal components.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric feature table (``.values`` is used).
    n : int, optional
        Number of components to keep. When omitted (or 0), the smallest
        number of components whose cumulative explained variance reaches
        ``variance`` is used.
    variance : float, default 0.90
        Cumulative explained-variance ratio used to pick the number of
        components when ``n`` is not given.

    Returns
    -------
    numpy.ndarray
        The standardized rows of ``data`` expressed in the kept components.

    Notes
    -----
    Both the scaler and the PCA are fitted on every row of ``data``. If the
    result is split into train/test afterwards, the test rows have already
    influenced the projection.
    """
    X = StandardScaler().fit_transform(data.values)

    # Leaving n_components unset keeps min(n_samples, n_features) components,
    # so this also works when there are fewer rows than columns.
    pca = PCA().fit(X)

    cumsum = np.cumsum(pca.explained_variance_ratio_)
    # First position where the cumulative ratio reaches the threshold. The
    # clamp covers thresholds the rounded cumulative sum never quite reaches.
    dim = min(int(np.searchsorted(cumsum, variance)) + 1, len(cumsum))

    print(f'Para preservar {variance * 100:g}% da variance é necessário {dim}')

    if n:
        dim = n

    pca_refit = PCA(n_components=dim).fit(X)
    print('Ratio: ', pca_refit.explained_variance_ratio_)

    return pca_refit.transform(X)


def thresh_holding_features_corr(data, lim=0.5, target='weight'):
    """List the columns whose correlation with ``target`` is at least ``lim``.

    The ``index`` column (if present) and any non-numeric column are left
    out before the correlation matrix is computed. Two views are printed:
    the columns whose correlation with ``target`` is <= 0, and those whose
    correlation is >= ``lim``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing ``target`` and the candidate feature columns.
    lim : float, default 0.5
        Minimum correlation with ``target`` for a column to be returned.
    target : str, default 'weight'
        Name of the column the other columns are correlated against.

    Returns
    -------
    list
        Column names with correlation >= ``lim``, in column order. The
        target itself is included (its self-correlation is 1).

    Raises
    ------
    KeyError
        If ``target`` is not a numeric column of ``data``.
    """
    # errors='ignore' lets the function accept frames that were never reset_index()-ed.
    numeric = data.drop(columns=['index'], errors='ignore').select_dtypes(include='number')
    target_corr = numeric.corr()[target]

    selected = target_corr[target_corr >= lim]

    print("< 0", target_corr[target_corr <= 0])
    print(" > " + str(lim), selected)

    return selected.index.tolist()

In [4]:
# Annotation table; its `img` column is used as the row key in the next cell.
weights = pd.read_csv("data/coletas/combined/annotations.csv")
# Feature table; its `label` column is used as the row key in the next cell.
features = pd.read_csv("features.csv")

In [5]:
# Key both tables by their identifier column (moving it out of the columns)
# so they can be aligned row-by-row when concatenated.
weights = weights.set_index('img')
features = features.set_index('label')

In [6]:
# Align the two tables on their index and place their columns side by side.
# verify_integrity raises if the concatenated (column) axis ends up with
# duplicate labels. reset_index() then turns the shared index back into a
# regular column.
data = pd.concat([weights, features], axis=1, verify_integrity=True).reset_index()

# Replace the `specie` values with integer codes so the column can be used as
# a numeric feature.
data['specie'] = LabelEncoder().fit_transform(data['specie'].values)

In [7]:
# Display the merged table as a sanity check.
data

Unnamed: 0,index,specie,weight,width,length,area,bbox-0,bbox-1,bbox-2,bbox-3,...,moments_central-2-2,moments_central-2-3,moments_central-3-0,moments_central-3-1,moments_central-3-2,moments_central-3-3,orientation,perimeter,feret_diameter_max,solidity
0,IMG_20220110_150135.jpg,3,220,9.0,22,6265,80,29,150,162,...,9.451928e+08,-7.003478e+09,5.423848e+04,2.388364e+07,-5.789975e+07,-1.071610e+10,-1.555946,349.462987,134.495353,0.928148
1,IMG_20220110_150324.jpg,4,545,11.0,26,10600,70,19,164,187,...,4.282950e+09,-1.349467e+10,-3.121288e+06,8.530533e+08,-1.098072e+09,7.798581e+11,1.526699,444.374675,168.428026,0.949566
2,IMG_20220110_150406.jpg,0,1025,12.0,31,14170,57,5,164,203,...,1.093375e+10,2.184438e+10,3.606262e+07,-1.405601e+09,-3.543888e+10,-5.172718e+12,-1.491373,506.357431,199.007538,0.962898
3,IMG_20220110_150506.jpg,2,740,10.0,37,9042,87,6,157,200,...,2.650567e+09,-4.582602e+09,-2.538047e+06,3.087611e+06,4.523251e+09,7.371602e+10,1.566756,467.847763,194.311605,0.929960
4,IMG_20220110_150552.jpg,4,475,10.0,24,8624,73,16,156,165,...,2.356460e+09,-4.228981e+09,-1.981609e+06,4.478496e+08,9.458410e+08,4.923908e+11,1.506794,389.019336,149.482440,0.954299
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,IMG_20221122_175210.jpg,3,275,10.0,20,6538,88,47,164,174,...,9.920246e+08,-5.562059e+09,-2.208582e+06,1.258152e+08,1.062625e+09,8.083806e+10,1.539318,337.948268,127.882759,0.950844
96,IMG_20221122_175233.jpg,3,330,10.0,23,7432,77,36,157,174,...,1.465378e+09,-9.037299e+09,-1.930007e+06,2.380751e+08,6.246181e+08,2.294049e+11,1.524349,360.676190,138.812824,0.947717
97,IMG_20221122_175259.jpg,3,205,8.0,20,5541,95,61,161,187,...,6.497784e+08,-6.172970e+09,-5.676452e+05,-2.866279e+07,7.879283e+08,-6.469197e+10,-1.532136,329.806133,127.577427,0.931730
98,IMG_20221122_175337.jpg,3,373,11.0,24,8166,56,36,143,180,...,2.296737e+09,2.566274e+09,2.315086e+06,3.943457e+08,3.220822e+09,9.275901e+11,1.432452,387.989899,147.091808,0.938405


In [9]:
# Regression target used by every experiment below.
y = data['weight']

### 'width', 'length', 'specie'

In [10]:
# Experiment 1: `width`, `length` and the encoded `specie` as features.
results(data[['width', 'length', 'specie']], y)

qtd images train:  70
qtd images test:  30

-----LinearRegression------
[Train] -------------
MAE: 91.0353
RMSE: 134.274
R2 Square 0.8354
[Test] --------------
MAE: 122.0516
RMSE: 165.2773
R2 Square 0.7697

-----RandomForest------
[Train] -------------
MAE: 51.6992
RMSE: 74.4987
R2 Square 0.9493
[Test] --------------
MAE: 77.3457
RMSE: 113.0843
R2 Square 0.8922


 - RESULTADO CROSS-VALIDATION COM 27 IMAGENS-


linear regression
r2 =	-1.3246 
std =	2.9631
------------------------------
neg_mean_squared_error =	32438.0157 
std =	22763.2718
------------------------------
neg_root_mean_squared_error =	168.1389 
std =	64.5549
------------------------------


random forest
r2 =	-0.0285 
std =	1.2168
------------------------------
neg_mean_squared_error =	20819.8134 
std =	16848.7481
------------------------------
neg_root_mean_squared_error =	133.1370 
std =	55.6269
------------------------------


### 'width', 'length'

In [11]:
# Experiment 2: same as above without `specie`.
results(data[['width', 'length']], y)

qtd images train:  70
qtd images test:  30

-----LinearRegression------
[Train] -------------
MAE: 97.57
RMSE: 152.3935
R2 Square 0.7879
[Test] --------------
MAE: 114.77
RMSE: 168.9376
R2 Square 0.7594

-----RandomForest------
[Train] -------------
MAE: 56.7282
RMSE: 78.5078
R2 Square 0.9437
[Test] --------------
MAE: 73.8537
RMSE: 100.9705
R2 Square 0.914


 - RESULTADO CROSS-VALIDATION COM 27 IMAGENS-


linear regression
r2 =	-1.0491 
std =	2.5371
------------------------------
neg_mean_squared_error =	34326.9824 
std =	26996.2206
------------------------------
neg_root_mean_squared_error =	170.7762 
std =	71.8504
------------------------------


random forest
r2 =	0.0509 
std =	1.1210
------------------------------
neg_mean_squared_error =	17905.2039 
std =	11688.9712
------------------------------
neg_root_mean_squared_error =	127.3096 
std =	41.2003
------------------------------


### Com todas as features com correlação positiva com o peso acima de 50% (lim=0.5) e PCA com C.P=2

In [12]:
# Columns whose correlation with `weight` is >= 0.5.
# NOTE(review): despite the name, the cut-off is `lim` (0.5), not zero, and
# the list still contains `weight` itself (removed in the next cell).
all_features_over_zero = thresh_holding_features_corr(data, lim=0.5)

< 0 specie                -0.584234
bbox-0                -0.385783
bbox-1                -0.653520
moments_central-1-0   -0.088860
moments_central-1-1   -0.286772
moments_central-1-2   -0.405073
moments_central-1-3   -0.465445
moments_central-2-1   -0.547876
moments_central-2-3   -0.027984
moments_central-3-1   -0.125227
moments_central-3-3   -0.377389
orientation           -0.162763
Name: weight, dtype: float64
 > 0.5 weight                 1.000000
width                  0.783989
length                 0.857266
area                   0.952293
bbox-3                 0.663412
bbox_area              0.951110
convex_area            0.950719
equivalent_diameter    0.918583
filled_area            0.952293
major_axis_length      0.885855
minor_axis_length      0.818146
moments-0-0            0.952293
moments-0-1            0.963938
moments-0-2            0.953174
moments-0-3            0.935909
moments-1-0            0.933610
moments-1-1            0.943536
moments-1-2            0.935145


In [13]:
# Drop the target from the selected columns so only predictors remain.
features_filter = [name for name in all_features_over_zero if name != 'weight']

In [14]:
# Display the selected predictor names.
features_filter

['width',
 'length',
 'area',
 'bbox-3',
 'bbox_area',
 'convex_area',
 'equivalent_diameter',
 'filled_area',
 'major_axis_length',
 'minor_axis_length',
 'moments-0-0',
 'moments-0-1',
 'moments-0-2',
 'moments-0-3',
 'moments-1-0',
 'moments-1-1',
 'moments-1-2',
 'moments-1-3',
 'moments-2-0',
 'moments-2-1',
 'moments-2-2',
 'moments-2-3',
 'moments-3-0',
 'moments-3-1',
 'moments-3-2',
 'moments-3-3',
 'moments_central-0-0',
 'moments_central-0-2',
 'moments_central-2-0',
 'moments_central-2-2',
 'perimeter',
 'feret_diameter_max']

In [15]:
# Experiment 3: PCA over the correlation-selected features, then the same
# evaluation as before on the resulting components.
# NOTE(review): apply_pca_reduce fits its scaler and PCA on every row, so the
# hold-out and validation rows used inside results() have already influenced
# the projection; results() then standardizes the components a second time.
X_pca = apply_pca_reduce(data[features_filter])

results(pd.DataFrame(X_pca), y)

Para preservar 90% da variance é necessário 2
Ratio:  [0.88359937 0.06775153]
qtd images train:  70
qtd images test:  30

-----LinearRegression------
[Train] -------------
MAE: 48.5799
RMSE: 97.2005
R2 Square 0.9137
[Test] --------------
MAE: 80.3896
RMSE: 158.6735
R2 Square 0.7877

-----RandomForest------
[Train] -------------
MAE: 32.7942
RMSE: 51.8418
R2 Square 0.9755
[Test] --------------
MAE: 68.6358
RMSE: 109.3039
R2 Square 0.8993


 - RESULTADO CROSS-VALIDATION COM 27 IMAGENS-


linear regression
r2 =	0.5536 
std =	0.3854
------------------------------
neg_mean_squared_error =	22507.3776 
std =	26746.1613
------------------------------
neg_root_mean_squared_error =	121.8391 
std =	87.5363
------------------------------


random forest
r2 =	0.6911 
std =	0.2349
------------------------------
neg_mean_squared_error =	14948.1838 
std =	14233.9275
------------------------------
neg_root_mean_squared_error =	107.6922 
std =	57.8841
------------------------------


In [16]:
# Experiment 4: PCA over a hand-picked subset of columns (including the
# encoded `specie`), evaluated the same way.
# NOTE(review): as in the previous experiment, the scaler and PCA inside
# apply_pca_reduce are fitted on every row before results() makes its split.
X_pca = apply_pca_reduce(data[['width',
                               'bbox_area', 
                               'equivalent_diameter', 
                               'convex_area',
                               'area',
                               'perimeter',
                               'specie']])

results(pd.DataFrame(X_pca), y)

Para preservar 90% da variance é necessário 2
Ratio:  [0.84949277 0.11192837]
qtd images train:  70
qtd images test:  30

-----LinearRegression------
[Train] -------------
MAE: 58.6853
RMSE: 97.9352
R2 Square 0.9124
[Test] --------------
MAE: 74.7888
RMSE: 105.4789
R2 Square 0.9062

-----RandomForest------
[Train] -------------
MAE: 37.2228
RMSE: 61.614
R2 Square 0.9653
[Test] --------------
MAE: 69.9849
RMSE: 113.8222
R2 Square 0.8908


 - RESULTADO CROSS-VALIDATION COM 27 IMAGENS-


linear regression
r2 =	0.3525 
std =	0.7631
------------------------------
neg_mean_squared_error =	13235.6159 
std =	10626.3091
------------------------------
neg_root_mean_squared_error =	106.3414 
std =	43.8991
------------------------------


random forest
r2 =	0.6348 
std =	0.3425
------------------------------
neg_mean_squared_error =	13821.9268 
std =	13426.0767
------------------------------
neg_root_mean_squared_error =	103.9637 
std =	54.8951
------------------------------
