In [1]:
import sys
import os
# Make the parent of the current working directory importable so that the
# `src` package can be resolved from this notebook.
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, '../'))
if project_root not in sys.path:  # re-running the cell must not stack duplicate entries
    sys.path.append(project_root)



import src.NN_pipeline as NN_pipeline
import pandas as pd
import numpy as np
import os 
from tqdm import tqdm   



In [5]:

# Helpers that compute eggs_per_day and bin it into a 3-level class label
def calculate_3_class(df):
    """Add `eggs_per_day` and the `3_class` label to `df`, using the real exposure time.

    The exposure is the whole number of days between `dtinstal` and `dtcol`;
    `eggs_per_day` is `novos` divided by that exposure. A rate that comes out
    as NaN (e.g. an unparseable date, a missing count, or 0 eggs over 0 days)
    is replaced by 0. A positive count over a zero-day exposure yields an
    infinite rate, which falls in class 2.

    Classes (intervals are closed on the right, the `pd.cut` default):
        0: eggs_per_day <= 2.65
        1: 2.65 < eggs_per_day <= 5
        2: eggs_per_day > 5

    Args:
        df: DataFrame with `novos`, `dtcol` and `dtinstal` columns.

    Returns:
        The same DataFrame object. It is modified in place: both date columns
        are converted to datetimes (unparseable values become NaT) and the
        `eggs_per_day` and `3_class` columns are added or overwritten.
    """
    # Collection date first, then installation date.
    for date_col in ('dtcol', 'dtinstal'):
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

    exposure_days = (df['dtcol'] - df['dtinstal']).dt.days
    daily_rate = df['novos'] / exposure_days
    df['eggs_per_day'] = daily_rate.replace(np.nan, 0)

    class_edges = [-float('inf'), 2.65, 5, float('inf')]
    binned = pd.cut(df['eggs_per_day'], bins=class_edges, labels=[0, 1, 2])
    df['3_class'] = binned.astype(int)

    return df
    

def calculate_3_class_7_days(df):
    """Add the `3_class_7_days` label to `df`, assuming a fixed 7-day exposure.

    `eggs_per_day` is set to `novos / 7` (NaN replaced by 0) and then binned
    with right-closed intervals (the `pd.cut` default):
        0: eggs_per_day <= 2.65
        1: 2.65 < eggs_per_day <= 5
        2: eggs_per_day > 5

    Note that this OVERWRITES any existing `eggs_per_day` column, including
    the one written by `calculate_3_class`.

    Args:
        df: DataFrame with `novos`, `dtcol` and `dtinstal` columns. The date
            columns are not used for the rate, but they are still parsed.

    Returns:
        The same DataFrame object. It is modified in place: both date columns
        are converted to datetimes (unparseable values become NaT) and the
        `eggs_per_day` and `3_class_7_days` columns are added or overwritten.
    """
    # Collection date first, then installation date.
    for date_col in ('dtcol', 'dtinstal'):
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

    fixed_window_rate = df['novos'] / 7
    df['eggs_per_day'] = fixed_window_rate.replace(np.nan, 0)

    class_edges = [-float('inf'), 2.65, 5, float('inf')]
    binned = pd.cut(df['eggs_per_day'], bins=class_edges, labels=[0, 1, 2])
    df['3_class_7_days'] = binned.astype(int)

    return df


# Source table for the derived features.
info_df = pd.read_csv('../data/final_data.csv')

# The dataset file name encodes the number of lags and the number of traps.
parameters = {'lags': 5, 'ntraps': 10}
data_path = f"../results/final_dfs/final_df_lag{parameters['lags']}_ntraps{parameters['ntraps']}.parquet"
NN_data = pd.read_parquet(data_path)

# Discard every column whose name contains 'Unnamed'.
# TODO create function to load data
is_unnamed = ['Unnamed' in col_name for col_name in NN_data.columns]
unnamed_cols = NN_data.columns[is_unnamed]
NN_data = NN_data.drop(columns=unnamed_cols)


In [12]:
# Exposure time in (fractional) days between installation and collection.
info_df['temp_expo'] = (pd.to_datetime(info_df['dtcol']) - pd.to_datetime(info_df['dtinstal'])) / pd.Timedelta(days=1)
info_df['semepi2'] = info_df['semepi'] ** 2

# |sin| encodings of the epidemiological week and month, shifted by 30 and 2
# respectively and scaled by the largest value present in the data.
# Series.max() is used instead of the builtin max(): it is vectorised and
# skips NaN, whereas builtin max() gives order-dependent results with NaN.
semepi_max = info_df['semepi'].max()
mesepid_max = info_df['mesepid'].max()
info_df['sin_semepi'] = np.abs(np.sin(np.pi * (info_df['semepi'] - 30) / semepi_max))
info_df['sin_mesepi'] = np.abs(np.sin(np.pi * (info_df['mesepid'] - 2) / mesepid_max))

# Class labels from the real exposure time and from a fixed 7-day window.
# After the second call `eggs_per_day` holds the 7-day rate.
info_df = calculate_3_class(info_df)
info_df = calculate_3_class_7_days(info_df)
# NOTE: the printed value is a fraction in [0, 1], not a percentage.
print('Percentage of mistakes if we use the 7 days average:',(info_df['3_class'] != info_df['3_class_7_days']).sum()/len(info_df))

# Columns that every final dataset is expected to contain.
features = ['mesepid', 'semepi', 'zero_perc','anoepid', 'temp_expo', 'semepi2', 'sin_semepi', 
            'sin_mesepi', 'Temperatura_previsao', 'Precipitacao_previsao','Umidade_previsao', 
            'Temperatura_week_bfr_mean', 'Precipitacao_week_bfr_mean', 'Umidade_week_bfr_mean',
            '3_class']

root = "../results/final_dfs/" # root of the final dfs

Percentage of mistakes if we use the 7 days average: 0.0026767246623676847


In [62]:
# Add the features missing from each parquet file found directly under `root`.
# Results go to `root/new/` so the original files are never overwritten.
new_dir = os.path.join(root, 'new')
os.makedirs(new_dir, exist_ok=True)  # the skip check and to_parquet both need this folder

for file in tqdm(os.listdir(root)):
    if not file.endswith('.parquet'):
        continue

    new_path = os.path.join(new_dir, file)
    if os.path.isfile(new_path):
        continue  # already produced by a previous run

    df = pd.read_parquet(os.path.join(root, file), engine="pyarrow", use_threads=True)

    missing_features = [name for name in features if name not in df.columns]
    if not missing_features:
        continue  # nothing to add, so no copy is written (as before)

    # One join for all missing columns, then a single write per file.
    # NOTE(review): assumes `nplaca` is unique in info_df; duplicated keys
    # would multiply rows in a left join — confirm.
    df = df.merge(info_df[['nplaca'] + missing_features], on='nplaca', how='left')
    df.to_parquet(new_path, compression='snappy', engine="pyarrow", index=False)


100%|██████████| 3/3 [00:13<00:00,  4.39s/it]


In [63]:
# Sanity check: preview the current, new and old versions of one dataset.

file = 'final_df_lag5_ntraps10.parquet'


def _load_and_preview(path, failure_message):
    """Read a parquet file and display its first rows.

    Args:
        path: Location of the parquet file.
        failure_message: Text printed (followed by the error) if reading fails.

    Returns:
        The loaded DataFrame, or None when the file could not be read.
    """
    try:
        frame = pd.read_parquet(path, engine="pyarrow", use_threads=True)
    except Exception as err:  # best effort, but report the cause and let KeyboardInterrupt through
        print(f'{failure_message}: {err!r}')
        return None
    display(frame.head())
    return frame


df = _load_and_preview(os.path.join(root, file), 'Error loading the file')
df_new = _load_and_preview(os.path.join(root, 'new/' + file), 'Error loading new file')
df_old = _load_and_preview(os.path.join(root, 'old/' + file), 'Error loading old file')

Unnamed: 0,nplaca,novos,trap0_lag1,trap0_lag2,trap0_lag3,trap0_lag4,trap0_lag5,latitude0,longitude0,days0_lag1,...,semepi2,sin_semepi,sin_mesepi,Temperatura_previsao,Precipitacao_previsao,Umidade_previsao,Temperatura_week_bfr_mean,Precipitacao_week_bfr_mean,Umidade_week_bfr_mean,3_class
0,90341574,39.0,0.0,0.0,0.0,0.0,0.0,1.574066,-0.003048,12.0,...,2704,0.904827,0.866025,19.933944,4.979698,80.976452,21.207705,5.701359,76.480622,2
1,90340009,72.0,107.0,0.0,0.0,0.0,0.0,1.396189,-0.006688,12.0,...,2704,0.904827,0.866025,19.959918,5.28855,81.157873,21.244516,5.918992,76.557302,2
2,90341056,0.0,0.0,31.0,0.0,0.0,0.0,1.516355,-0.003025,12.0,...,2704,0.904827,0.866025,19.941126,5.064695,81.029901,21.21811,5.762025,76.504071,0
3,90340818,6.0,0.0,81.0,0.0,0.0,0.0,1.573803,-0.00715,12.0,...,2704,0.904827,0.866025,19.93315,4.980522,80.984231,21.206438,5.69675,76.488967,0
4,90340060,141.0,0.0,0.0,0.0,0.0,0.0,1.515419,-0.006561,12.0,...,2704,0.904827,0.866025,19.940756,5.069298,81.038894,21.217451,5.760677,76.51229,2


Unnamed: 0,nplaca,novos,trap0_lag1,trap0_lag2,trap0_lag3,trap0_lag4,trap0_lag5,latitude0,longitude0,days0_lag1,...,semepi2,sin_semepi,sin_mesepi,3_class,Temperatura_previsao,Precipitacao_previsao,Umidade_previsao,Temperatura_week_bfr_mean,Precipitacao_week_bfr_mean,Umidade_week_bfr_mean
0,90341574,39.0,0.0,0.0,0.0,0.0,0.0,1.574066,-0.003048,12.0,...,2704,0.904827,0.866025,2,19.933944,4.979698,80.976452,21.207705,5.701359,76.480622
1,90340009,72.0,107.0,0.0,0.0,0.0,0.0,1.396189,-0.006688,12.0,...,2704,0.904827,0.866025,2,19.959918,5.28855,81.157873,21.244516,5.918992,76.557302
2,90341056,0.0,0.0,31.0,0.0,0.0,0.0,1.516355,-0.003025,12.0,...,2704,0.904827,0.866025,0,19.941126,5.064695,81.029901,21.21811,5.762025,76.504071
3,90340818,6.0,0.0,81.0,0.0,0.0,0.0,1.573803,-0.00715,12.0,...,2704,0.904827,0.866025,0,19.93315,4.980522,80.984231,21.206438,5.69675,76.488967
4,90340060,141.0,0.0,0.0,0.0,0.0,0.0,1.515419,-0.006561,12.0,...,2704,0.904827,0.866025,2,19.940756,5.069298,81.038894,21.217451,5.760677,76.51229


Unnamed: 0,nplaca,novos,trap0_lag1,trap0_lag2,trap0_lag3,trap0_lag4,trap0_lag5,latitude0,longitude0,days0_lag1,...,temp_expo,semepi2,sin_semepi,sin_mesepi,Temperatura_previsao,Precipitacao_previsao,Umidade_previsao,Temperatura_week_bfr_mean,Precipitacao_week_bfr_mean,Umidade_week_bfr_mean
0,90341574,39.0,0.0,0.0,0.0,0.0,0.0,1.574066,-0.003048,12.0,...,7.0,2704,0.904827,0.866025,19.933944,4.979698,80.976452,21.207705,5.701359,76.480622
1,90340009,72.0,107.0,0.0,0.0,0.0,0.0,1.396189,-0.006688,12.0,...,7.0,2704,0.904827,0.866025,19.959918,5.28855,81.157873,21.244516,5.918992,76.557302
2,90341056,0.0,0.0,31.0,0.0,0.0,0.0,1.516355,-0.003025,12.0,...,7.0,2704,0.904827,0.866025,19.941126,5.064695,81.029901,21.21811,5.762025,76.504071
3,90340818,6.0,0.0,81.0,0.0,0.0,0.0,1.573803,-0.00715,12.0,...,7.0,2704,0.904827,0.866025,19.93315,4.980522,80.984231,21.206438,5.69675,76.488967
4,90340060,141.0,0.0,0.0,0.0,0.0,0.0,1.515419,-0.006561,12.0,...,7.0,2704,0.904827,0.866025,19.940756,5.069298,81.038894,21.217451,5.760677,76.51229
