In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

import fastai.tabular.all as ft

import random
from math import sin, cos
from agregar_coordenadas import *

## crear datos artificiales (ignorar al final)

n = 2000
A = np.random.randn(n)*4
B = [random.choice(["sin","cos"]) for _ in range(n)]
x = [(sin(a) if b == 'sin' else cos(a))+0.2*random.gauss(0,1) for a,b in zip(A,B)]
y = [(a if b == 'sin' else a+1)+0.2*random.gauss(0,1) for a,b in zip(A,B)]
biome = [random.choice(["frio","caliente", "templado"]) for i in range(n)]
dumb = [random.choice(["ss","q"]) for i in range(n)]
for (i,bio),a, in zip(enumerate(biome),A):
    if a > 0:
        biome[i] = random.choice(["frio","frio",biome[i]])
    else:
        biome[i] = random.choice(["caliente","caliente",biome[i]])
        
DF = {'A': A, 'B': B, 'x': x, 'y': y, 'biome': biome, 'dumb':dumb}


df = pd.DataFrame(DF)

# Cargar datos

Hay que escribir columnas categóricas, continuas, y-categóricas y y-contínuas.

In [2]:
from data import *
from loss_func import *

In [25]:
#df = pd.read_csv("data/EMPcomplete_environmental.csv")
df = pd.read_csv("data/merged_complete_environmental.csv")

In [26]:
df.columns

Index(['X', 'Latitude', 'Longitude', 'k__Bacteria', 'k__Archaea',
       'p__Verrucomicrobia', 'p__Proteobacteria', 'p__Bacteroidetes',
       'p__Firmicutes', 'p__Actinobacteria',
       ...
       'BIO12', 'BIO13', 'BIO14', 'BIO15', 'BIO16', 'BIO17', 'BIO18', 'BIO19',
       'BIOME', 'ECONAME'],
      dtype='object', length=929)

In [27]:
df.dropna(inplace=True)

In [28]:
df.reset_index(inplace=True)

In [29]:
start_env = list(df.columns).index("Elevation"); start_env

908

In [30]:
env_cols = [df.columns[i] for i in range(start_env,len(df.columns))]

In [31]:
xyz = esfericas2cartesianas(df.Longitude,df.Latitude)

In [32]:
df = pd.concat([df,xyz],axis=1).copy()

In [33]:
cat_names = []
cont_names = list(df.columns[3:start_env])
y_cont_names = ["x","y","z"] + env_cols[:-2]
y_cat_names = env_cols[-2:]

In [60]:
y_cont_names

['x',
 'y',
 'z',
 'Elevation',
 'BIO1',
 'BIO2',
 'BIO3',
 'BIO4',
 'BIO5',
 'BIO6',
 'BIO7',
 'BIO8',
 'BIO9',
 'BIO10',
 'BIO11',
 'BIO12',
 'BIO13',
 'BIO14',
 'BIO15',
 'BIO16',
 'BIO17',
 'BIO18',
 'BIO19']

# Preparar datos para mugre fastai

In [34]:
Yproc = Preprocess(df,y_cat_names,y_cont_names)

In [35]:
loss_func = LossFunc(df,y_cat_names, y_cont_names,Yproc)
#metrics = ([loss_func.accuracy(i) for i,_ in enumerate(y_cat_names)] + 
#                         [loss_func.mse, loss_func.l1, loss_func.catloss, loss_func.contloss] +
#                         [loss_func.distance_in_km])
metrics = ([loss_func.mse, loss_func.l1, loss_func.catloss, loss_func.contloss] +
                         [loss_func.distance_in_km])

learn = ft.tabular_learner(dls,
           layers = [],
           opt_func = ft.ranger,
           cbs=[ft.GradientClip],
           loss_func = loss_func,
           metrics= ([loss_func.accuracy(i) for i,_ in enumerate(y_cat_names)] + 
                     [loss_func.mse, loss_func.l1, loss_func.catloss, loss_func.contloss] +
                     [loss_func.distance_in_km]
                    )
                           
    )

nin = learn.model.layers[-1][0].in_features
nout = Yproc.y_len()+len(y_cont_names)

learn.model.layers[-1] = nn.Linear(nin, nout)

In [36]:
from layers import *
from model import *

In [53]:
LEARNERS=[]
def objective(trial: optuna.trial):
    
    print("\n\nStarting trial with the following properties: ")
    
    D = trial.user_attrs
    
    for key, val in D.items():
        print(f"{key}: {val}")
    print("\t\t******** STARTING ")
    values = D['vals'] if 'vals' in D else []
    
    NUM_TRIES = 6
    while len(values) < NUM_TRIES:
        seed = random.randint(0,1000000)
        
        src = ft.TabularPandas(df.copy(),
                       procs=[ft.Normalize],
                       cat_names = cat_names,
                       cont_names = cont_names,
                       y_names = y_cat_names + y_cont_names,
                       splits = ft.RandomSplitter(0.05,seed=seed)(ft.range_of(df))
                       )
        
        dls = src.dataloaders(bs=trial.suggest_categorical('bs',[256,384,512]))
        torch.manual_seed(seed)
        learn = ft.tabular_learner(dls,
               layers = [],
               opt_func = ft.ranger,
               #cbs=[ft.GradientClip],
               loss_func = loss_func,
               metrics= metrics
        )
        
        nin = learn.model.layers[-1][0].in_features
        nout = Yproc.y_len()+len(y_cont_names)

        learn.model.layers[-1][0] = nn.Linear(nin, nout)
        
        improve_model_with_trial(learn.model,trial)
        LEARNERS.append(learn)
        with learn.no_logging():
            learn.fit_one_cycle(trial.suggest_int('epochs',50,250),
                                trial.suggest_float('lr',1e-3,1e-1,log=True),
                                div=trial.suggest_float('div',0.7,1.3),
                                div_final=trial.suggest_float('div_final',10,100000,log=True),
                                pct_start=trial.suggest_float('pct_start',0.1,0.8)
                               );
        idx = learn.recorder.metric_names.index('distance_in_km')
        values.append(learn.recorder.log[idx])
        
        bold_open = "\033[1;3m"
        bold_close = "\033[0m"
        print(f"\tFinished attempt with value: {bold_open}{values[-1]}{bold_close}")
        learn.export(f"models/learn_{values[-1]:.3f}_{seed}.pkg")
        
        
    trial.set_user_attr('vals',values)
    name = trial.user_attrs['name'] if 'name' in trial.user_attrs.keys() else 'unnamed'
    
    return mean(values)

In [54]:
def get_params(t):
    if len(t.params) == 0: return t.system_attrs['fixed_params']
    return t.params
    #return {k:rround(v) for k,v in t.params.items()}

In [55]:
study = torch.load("studies/r.pth")

In [56]:
def reset_study(study):
    r = optuna.create_study()
    for t in sorted(study.trials,key=lambda x: mean(x.user_attrs['vals'])):
        r.enqueue_trial(params=get_params(t),user_attrs=t.user_attrs)
    return r

In [57]:
r = reset_study(study)

[32m[I 2022-11-20 15:35:36,354][0m A new study created in memory with name: no-name-a2f31a3c-1810-4613-b215-12f0656d96ae[0m


r = optuna.create_study()
r.enqueue_trial(params={'epochs':1})

In [58]:
r.optimize(objective,n_trials=7)



Starting trial with the following properties: 
vals: [369.06334146560755, 287.7297956525434, 64.44503614967432, 305.5906897337076]
		******** STARTING 


	Finished attempt with value: [1;3m489.15351651665753[0m


[32m[I 2022-11-20 15:38:15,436][0m Trial 0 finished with value: 387.24599221384415 and parameters: {'bs': 256, 'use_cont_dropout': False, 'noiser_value': 0.10653428695810746, 'blocks': 3, 'f0': 128, 'f1': 128, 'f2': 896, 'num_res_f0': 2, 'num_res_f1': 2, 'num_res_f2': 2, 'epochs': 231, 'lr': 0.008545396307486939, 'div': 1.1763653047785234, 'div_final': 49.86013167685375, 'pct_start': 0.6127014781795439}. Best is trial 0 with value: 387.24599221384415.[0m


	Finished attempt with value: [1;3m807.4935737648746[0m


Starting trial with the following properties: 
vals: [280.86869186822923, 368.5705447692901, 306.8337928236496, 370.77837587477916]
		******** STARTING 


	Finished attempt with value: [1;3m853.3374031212026[0m


[32m[I 2022-11-20 15:40:01,267][0m Trial 1 finished with value: 405.4168345603568 and parameters: {'bs': 256, 'use_cont_dropout': False, 'noiser_value': 0.23898215786338062, 'blocks': 2, 'f0': 768, 'f1': 896, 'num_res_f0': 1, 'num_res_f1': 3, 'epochs': 226, 'lr': 0.010720037656745826, 'div': 1.0887009940285164, 'div_final': 550.3769279008768, 'pct_start': 0.6127695886966332}. Best is trial 0 with value: 387.24599221384415.[0m


	Finished attempt with value: [1;3m252.1121989049898[0m


Starting trial with the following properties: 
vals: [296.24580899369494, 245.75167353204913, 380.03283769463195, 467.5398331151896]
		******** STARTING 


[33m[W 2022-11-20 15:40:23,559][0m Trial 2 failed because of the following error: KeyboardInterrupt()[0m
Traceback (most recent call last):
  File "/home/neuralito/miniconda3/envs/fastai/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_998026/1024862909.py", line 43, in objective
    learn.fit_one_cycle(trial.suggest_int('epochs',50,250),
  File "/home/neuralito/miniconda3/envs/fastai/lib/python3.10/site-packages/fastai/callback/schedule.py", line 119, in fit_one_cycle
    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)
  File "/home/neuralito/miniconda3/envs/fastai/lib/python3.10/site-packages/fastai/learner.py", line 256, in fit
    self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
  File "/home/neuralito/miniconda3/envs/fastai/lib/python3.10/site-packages/fastai/learner.py", line 193, in _with_events
    try

KeyboardInterrupt: 

In [171]:
learn = ft.load_learner("models/learn_252.112_219982.pkg")

In [172]:
src = ft.TabularPandas(df.copy(),
                       procs=[ft.Normalize],
                       cat_names = cat_names,
                       cont_names = cont_names,
                       y_names = y_cat_names + y_cont_names,
                       splits = ft.RandomSplitter(0.05,seed=219982)(ft.range_of(df))
                       )
        
dls = src.dataloaders(bs=512)

In [173]:
vdf = df.iloc[dls.valid.items.index].copy()

In [174]:
def interpret_preds(learn:ft.Learner, dls:ft.DataLoaders, df:pd.DataFrame, Yproc:Preprocess):
    df = df.copy().reset_index(drop=True)
    test_dl = dls.test_dl(df)
    yp,_ = learn.get_preds(dl=test_dl)
    
    i = 0
    for c,n in zip(Yproc.y_cat,Yproc.numericalizers):
        j = i+n.num_cats()
        p = yp[:,i:j].argmax(dim=1)
        df[f'{c}_p'] = p
        df[f'{c}_p'] = df[f'{c}_p'].apply(lambda x: n.idx2cat[x])
        i = j
        
    for c,(m,s) in zip(Yproc.y_cont,Yproc.mu_sd):
        df[f'{c}_p'] = yp[:,i]*s+m
        
        i += 1
        
    return Yproc.unapply(df)
    

In [175]:
Q = interpret_preds(learn,dls,vdf,Yproc)

In [176]:
EARTH_RADIUS=6371

In [177]:
for c in ['x','y','z','x_p','y_p','z_p']:
    Q[c] = EARTH_RADIUS*Q[c]

In [178]:
Q.drop(['index','X'],inplace=True,axis=1)

In [179]:
good_cols = ['Latitude','Longitude']
good_cols += y_cont_names + y_cat_names
good_cols += [c for c in Q.columns if '_p' in c]

In [180]:
Q = Q[good_cols]

In [181]:
ll = cartesianas2esfericas(Q['x_p'],Q['y_p'],Q['z_p'])
ll.rename(columns={'Latitude':'Latitude_p', 'Longitude':'Longitude_p'},inplace=True)

In [182]:
Q = pd.concat([Q,ll],axis=1)

In [183]:
malos = (Q['Latitude']/Q['Latitude_p'] < -0.9)
Q.loc[malos,'Longitude_p'] = -Q['Longitude_p']
Q.loc[malos,'Latitude_p'] = -Q['Latitude_p']

In [184]:
Q.to_csv("data/prediccion_final_alpha.csv",index=False)

In [185]:
Q[['Latitude','Latitude_p','Longitude','Longitude_p']]

Unnamed: 0,Latitude,Latitude_p,Longitude,Longitude_p
0,41.015340,40.893646,-72.528080,-72.583992
1,64.490000,63.929321,-157.750000,-157.404282
2,41.841118,41.767914,-88.230357,-88.422165
3,64.490000,63.396515,-157.750000,-157.519135
4,41.840733,42.317719,-88.230357,-88.249924
...,...,...,...,...
156,35.117000,35.257767,138.937000,139.382278
157,20.086000,20.981232,-155.829000,-156.431061
158,68.596883,67.927139,-149.601250,-149.016159
159,9.160930,9.523247,-79.742700,-75.729225


In [188]:
Q[Q['ECONAME'] != Q['ECONAME_p']][['ECONAME','ECONAME_p']]

Unnamed: 0,ECONAME,ECONAME_p
10,Central and Southern Cascades forests,British Columbia mainland coastal forests
15,Southeast Australia temperate savanna,Southeast Australia temperate forests
16,Murray-Darling woodlands and mallee,Southeast Australia temperate savanna
40,Sierra Nevada forests,Colorado Rockies forests
76,Eastern Australian temperate forests,Southeast Australia temperate forests
98,Mid-Continental Canadian forests,Low Arctic tundra
101,Montana Valley and Foothill grasslands,Colorado Rockies forests
145,Low Arctic tundra,Interior Alaska-Yukon lowland taiga
153,Central and Southern mixed grasslands,Central tall grasslands


In [191]:
Q[Q['BIOME'] != Q['BIOME_p']][['BIOME','BIOME_p','ECONAME','ECONAME_p']]

Unnamed: 0,BIOME,BIOME_p,ECONAME,ECONAME_p
15,8.0,4.0,Southeast Australia temperate savanna,Southeast Australia temperate forests
16,12.0,8.0,Murray-Darling woodlands and mallee,Southeast Australia temperate savanna
98,6.0,11.0,Mid-Continental Canadian forests,Low Arctic tundra
101,8.0,5.0,Montana Valley and Foothill grasslands,Colorado Rockies forests
145,11.0,6.0,Low Arctic tundra,Interior Alaska-Yukon lowland taiga


In [192]:
Q[['BIOME','BIOME_p','ECONAME','ECONAME_p']]

Unnamed: 0,BIOME,BIOME_p,ECONAME,ECONAME_p
0,4.0,4.0,Northeastern coastal forests,Northeastern coastal forests
1,6.0,6.0,Interior Alaska-Yukon lowland taiga,Interior Alaska-Yukon lowland taiga
2,8.0,8.0,Central forest-grasslands transition,Central forest-grasslands transition
3,6.0,6.0,Interior Alaska-Yukon lowland taiga,Interior Alaska-Yukon lowland taiga
4,8.0,8.0,Central forest-grasslands transition,Central forest-grasslands transition
...,...,...,...,...
156,4.0,4.0,Taiheiyo evergreen forests,Taiheiyo evergreen forests
157,7.0,7.0,Hawaii tropical low shrublands,Hawaii tropical low shrublands
158,11.0,11.0,Brooks-British Range tundra,Brooks-British Range tundra
159,1.0,1.0,Isthmian-Atlantic moist forests,Isthmian-Atlantic moist forests


In [189]:
10/161

0.062111801242236024

In [193]:
learn.model

TabularModel(
  (embeds): ModuleList()
  (emb_drop): Dropout(p=0.0, inplace=False)
  (bn_cont): Sequential(
    (0): Noiser()
    (1): BatchNorm1d(905, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (layers): Sequential(
    (0): Linear(in_features=905, out_features=768, bias=False)
    (1): BatchNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): CELU(alpha=1.0)
    (3): ResLinear(
      (res): Sequential(
        (0): Linear(in_features=768, out_features=384, bias=True)
        (1): LeakyReLU(negative_slope=0.01)
        (2): Linear(in_features=384, out_features=384, bias=True)
        (3): BatchNorm1d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (4): LeakyReLU(negative_slope=0.01)
        (5): Linear(in_features=384, out_features=768, bias=True)
        (6): BatchNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (4): BatchNorm1d(768, eps=1e-05, momentum