In [1]:
import pandas as pd 
from datetime import datetime
import matplotlib.pyplot as plt
import utils
import numpy as np
import importlib 
from tqdm import tqdm

importlib.reload(utils)


pygame 2.6.0 (SDL 2.28.4, Python 3.12.4)
Hello from the pygame community. https://www.pygame.org/contribute.html


<module 'utils' from 'c:\\Users\\UFMG\\Periodos\\Periodo_XIV\\TCC\\Projeto\\Códigos\\tcc_ovitraps\\utils.py'>

In [2]:
data = pd.read_csv('./data/final_data.csv',parse_dates=['dtcol'])
valid_samples = data[['nplaca','novos','latitude','longitude','narmad','ano','semepi','dtcol']].drop_duplicates().dropna().reset_index(drop=True)


In [3]:
lags = 3 # number of lags to consider of all traps
n_traps = 3 # including the current one

### Treatment 
TODO: move to file

In [4]:
groups = valid_samples.groupby(['latitude','longitude'])
rows_to_update = []

for name, group in groups:
    if group['narmad'].nunique() > 1:  # More than one unique value in C
        rows_to_update.extend(group.index.tolist())  # Add row indices to the list
bad_data = data.loc[rows_to_update]

print('Traps with the same lat_long ',bad_data['narmad'].value_counts())
print('\nDates with traps with the same lat_long ',bad_data.groupby(['dtcol'])['novos'].nunique().sort_values(ascending=False).head(10))

Traps with the same lat_long  narmad
902016    305
902017    291
908062    288
902022    283
907076    260
907080    258
907111    221
902167    217
907106    191
902169    160
908095    128
907105    109
908094    100
907110     79
902168     78
907081     42
907077     39
902021     16
908105     15
908103     15
902015     10
908063      2
Name: count, dtype: int64

Dates with traps with the same lat_long  dtcol
2020-04-06    8
2015-12-07    7
2018-11-19    7
2018-10-08    7
2022-02-14    6
2023-12-04    6
2023-01-16    6
2019-12-02    6
2019-01-14    6
2019-04-08    6
Name: novos, dtype: int64


In [5]:
# This cell creates a new dataframe changing the number of the traps that are in the same position
# TODO move to data exploration of preprocessing
same_position_armad = valid_samples.groupby(['latitude','longitude'])['narmad'].nunique()
print(same_position_armad.value_counts()) #number of traps in the same position

#data['narmad'] = data.groupby(['latitude','longitude'])['narmad'].transform('first') #group by position and take the first trap number

narmad
1    1760
2       5
3       1
Name: count, dtype: int64


In [6]:
# introduce a small value on this traps to differentiate them
for trap in bad_data['narmad'].unique():
    valid_samples.loc[valid_samples['narmad'] == trap, 'latitude'] += np.random.rand()*0.00000001
    valid_samples.loc[valid_samples['narmad'] == trap, 'longitude'] += np.random.rand()*0.00000001

### Distance Matrix

In [7]:
position_matrix = valid_samples[['latitude','longitude','narmad']].drop_duplicates().dropna().reset_index(drop=True)

try: 
    distance_matrix = pd.read_csv('./results/distance_matrix.csv',index_col=0)
    distance_matrix.columns = distance_matrix.columns.astype(float)
    distance_matrix.set_index(distance_matrix.columns,inplace=True)


except:
    distance_matrix = utils.create_distance_matrix(position_matrix)
    distance_matrix.to_csv('./results/distance_matrix.csv')

distance_matrix_np = distance_matrix.to_numpy()


1570878it [06:45, 3876.51it/s]


### Week trap matrix

In [8]:
week_trap_df = valid_samples.pivot(index=['ano','semepi'],columns='narmad',values='novos')
new_index = pd.MultiIndex.from_product([week_trap_df.index.levels[0], range(101,153)]) # introduce weeks 51 and 52
new_index = new_index[(new_index <= week_trap_df.iloc[-1].name) & (new_index >= week_trap_df.iloc[0].name)] #remove indexes that are greater than the last sample or smaller than the first one
week_trap_df = week_trap_df.reindex(new_index) # [week,trap] - > novos


### Lagged Matrix

In [9]:
list_lagged_df = []
for i in range(1,2*lags+1):
    list_lagged_df.append(week_trap_df.shift(i).values)
lagged_matrix = np.stack(list_lagged_df, axis=0) # numpy array with the lagged values. [lag x week x trap] -> novos. Obs.: consider 2*lags due to the biweekly sampling rate


### NaN Count Matrix

In [10]:
nan_count_matrix = np.sum(np.isnan(lagged_matrix), axis=0) # [week x trap] -> number of nans in the lagged matrix

for i in range(2*lags):
    print("Number of samples with",i+1,"valid values: ",np.sum(nan_count_matrix==i+1))

Number of samples with 1 valid values:  0
Number of samples with 2 valid values:  0
Number of samples with 3 valid values:  742782
Number of samples with 4 valid values:  266579
Number of samples with 5 valid values:  28622
Number of samples with 6 valid values:  157019


### Info Matrix

In [11]:
info_df = valid_samples[['ano','semepi','nplaca','dtcol','novos','narmad','latitude','longitude']]

### Lagged days matrix

In [12]:
day_df = valid_samples.pivot(index=['ano','semepi'],columns='narmad',values='dtcol')
new_index = pd.MultiIndex.from_product([day_df.index.levels[0], range(101,153)]) # introduce weeks 51 and 52
new_index = new_index[(new_index <= day_df.iloc[-1].name) & (new_index >= day_df.iloc[0].name)] #remove indexes that are greater than the last sample or smaller than the first one
day_df = day_df.reindex(new_index) # [week,trap] - > dtcol
day_df = day_df.map(lambda x: x.toordinal() if pd.notnull(x) else np.nan) # convert to ordinal so we can calculate the difference between two dates
day_df_np = day_df.to_numpy()
list_lagged_days = []

for i in range(1,2*lags+1):
    list_lagged_days.append(day_df.shift(i).values)
lagged_days = np.stack(list_lagged_days, axis=0) # numpy array with the lagged values. [lag x week x trap] -> ordinal daus. Obs.: consider 2*lags due to the biweekly sampling rate




### Useful dicts

In [13]:
trap_index_dict = {trap: index for index,trap in enumerate(distance_matrix.columns)}                                           # trap: index 
yearweek_index_dict = {(year,week): index for index,(year,week) in enumerate(week_trap_df.index)}                           # (year,week): index
nplaca_week_dict = {nplaca: (year, week) for nplaca,week,year in zip(info_df['nplaca'],info_df['semepi'],info_df['ano'])}   # nplaca: (year,week)
nplaca_index_dict = {nplaca: yearweek_index_dict[(year, week)] for nplaca,week,year in zip(info_df['nplaca'],info_df['semepi'],info_df['ano'])}   # nplaca: week index 


### Final Dataframe

In [14]:
list_placas = []
list_final_samples = []
for original_trap in distance_matrix.columns:
    matching_placas = info_df[info_df['narmad'] == original_trap]['nplaca']
    original_trap_index = trap_index_dict[original_trap]
    sorted_distance_indexes = np.argsort(distance_matrix_np[original_trap_index])
    #assert sorted_distance_indexes[0] == trap_index, 'trap is not the closest to itself'
    for placa in matching_placas: 
        add_row = [placa]
        order_index = 0
        list_placas.append(placa)
        week_index = nplaca_index_dict[placa]

        # remove samples of original traps that doesn't have enough autoregressive samples 
        if nan_count_matrix[week_index,original_trap_index] > lags: 
            continue

        for trap in range(n_traps): #loop to deal with the n_traps closest to the original trap
            trap_index = sorted_distance_indexes[order_index]

            while nan_count_matrix[week_index,trap_index] > lags: #  avoid traps that doesn't have enough autoregressive samples
                order_index += 1
                trap_index = sorted_distance_indexes[order_index]
                #if order_index > 50: #avoid arbitrarily distant traps
                    #break
            
            lagged_samples = lagged_matrix[:,week_index,trap_index] # [lag x week x trap] -> novos

            [add_row.append(i) for i in lagged_samples[~np.isnan(lagged_samples)]] # add lagged novos
            add_row.append(distance_matrix_np[original_trap_index,trap_index]) # add distance
            
            #subtract lagged days from orignal sample day [lag x week x trap] -> ordinal days
            lagged_samples_days = lagged_days[:,week_index,trap_index]
            days_diff = day_df_np[week_index,original_trap_index] - lagged_samples_days[~np.isnan(lagged_samples_days)] 
            [add_row.append(i) for i in days_diff] # add lagged days
            if np.isnan(days_diff).any():
                print('nan')
       
            order_index += 1
        list_final_samples.append(add_row)


assert len(list_placas) == valid_samples.shape[0], 'invalid number of placas'
assert len(list_placas) == len(set(list_placas)), 'duplicated placas'



In [15]:
final_df = pd.DataFrame(list_final_samples)

columns_names = ['nplaca']
for j in range(n_traps): 
    for i in range(1,lags+1):
        columns_names.extend(['trap'+str(j)+'_lag'+str(i)])
    columns_names.extend(['distance'+str(j)])
    for i in range(1,lags+1):
        columns_names.extend(['days'+str(j)+'_lag'+str(i)])



final_df.columns = columns_names  
final_df = pd.merge(data[['nplaca','novos']],final_df,how='inner',on='nplaca')
final_df.set_index('nplaca',inplace=True)
final_df.to_csv(f'./results/final_df_lag{lags}_ntraps{n_traps}.csv')
