# Data augmentation
## Inteligencia Computacional 2021-2, Grupo 8a
Nicolás Canales, Matías Vergara

Este notebook tiene por objetivo crear entradas falsas para balancear en parte las clases de objetos periódicos presentes en el dataset.

Recordemos que los objetos periódicos son aquellos clasificados por ALeRCE como: "LPV", "Periodic-Other", "RRL", "CEP", "E" o "DSCT".  

En esta ocasión, las entradas se crearán en base a las curvas de luz con features ya calculadas.

In [None]:
# imports necesarios
import pandas as pd
import random

In [None]:
# download the light curves together with their precomputed features
!gdown --id 19uB-u0gYCGKFlKFXCKIwsK5G4_MCFvV1
# also download labels_set.csv, which holds the classifications
!gdown --id  1LU1sCIVXO8BQRMeKCCqu1vZGnceV6P5c

Downloading...
From: https://drive.google.com/uc?id=19uB-u0gYCGKFlKFXCKIwsK5G4_MCFvV1
To: /content/lc_features.csv
100% 143M/143M [00:00<00:00, 148MB/s]
Downloading...
From: https://drive.google.com/uc?id=1LU1sCIVXO8BQRMeKCCqu1vZGnceV6P5c
To: /content/labels_set.csv
100% 10.2M/10.2M [00:00<00:00, 153MB/s]


In [None]:
# Read both CSV files into pandas DataFrames indexed by object id ("oid").
# The features file carries a leftover "Unnamed: 0" column that is dropped.
lc = (
    pd.read_csv("lc_features.csv", index_col="oid")
    .drop(columns="Unnamed: 0")
)
target = pd.read_csv("labels_set.csv", index_col="oid")

In [None]:
# Count how many distinct oids among the filtered alerts belong to each
# periodic class according to the labels.
count = dict.fromkeys(["LPV", "Periodic-Other", "RRL", "CEP", "E", "DSCT"], 0)
for oid in lc.index.unique():
    clss = target.loc[oid, "classALeRCE"]
    count[clss] = count[clss] + 1
count

# there are 87015 entries in total
# NOTE(review): the per-class counts displayed below add up to 87003 -- confirm
# which figure is the intended one.

{'CEP': 618,
 'DSCT': 732,
 'E': 37899,
 'LPV': 14039,
 'Periodic-Other': 1256,
 'RRL': 32459}

Notamos un importante desbalance de clases, que deja muy infrarrepresentadas las clases CEP, DSCT y Periodic-Other. Crearemos curvas de luz sintéticas para estas clases, basándonos en las que ya existen y cambiando levemente sus características.

In [None]:
# Simplest approach: take the oids of the CEP, DSCT and Periodic-Other
# objects directly from labels_set, preserving the order of the labels file.
class_labels = target['classALeRCE']
CEPS = class_labels.index[class_labels == 'CEP'].tolist()
DSCTS = class_labels.index[class_labels == 'DSCT'].tolist()
POS = class_labels.index[class_labels == 'Periodic-Other'].tolist()

# Report how many base objects were found for each class.
print(len(CEPS))
print(len(DSCTS))
print(len(POS))

618
732
1256


In [None]:
def createSintetic(base_objects, dataname, features=None, n_copies=1, seed=None):
  """Create slightly perturbed synthetic copies of existing objects.

  Every feature of a base object is multiplied by ``1 + s * p / 100``, where
  ``p`` is drawn uniformly from [0, 1] and ``s`` is -1 or 1, so each value
  moves by at most 1% of itself. A fresh ``p`` and ``s`` are drawn for every
  feature of every copy.

  Args:
    base_objects: iterable of index labels (oids) to look up in ``features``.
    dataname: tag embedded in the synthetic ids, which have the form
      ``'sintetic<dataname><i>'`` with ``i`` counting from 0.
    features: DataFrame indexed by oid holding one row of features per
      object. Defaults to the notebook-level ``lc`` frame.
    n_copies: number of synthetic copies generated per base object.
    seed: optional seed for a private random generator, which makes the
      output reproducible. When omitted, the global ``random`` state is used.

  Returns:
    dict mapping each synthetic id to a pandas Series with the perturbed
    features (the Series ``name`` is the synthetic id), in generation order.

  Raises:
    KeyError: if an entry of ``base_objects`` is missing from
      ``features.index``.
  """
  if features is None:
    features = lc  # fall back to the frame loaded earlier in the notebook
  # The module itself exposes uniform/choice, so it doubles as the default rng.
  rng = random if seed is None else random.Random(seed)

  sintetics = {}
  i = 0
  for obj in base_objects:
    base_row = features.loc[obj]
    for _ in range(n_copies):
      obj_df = base_row.copy()
      # Series.items() replaces Series.iteritems(), removed in pandas 2.0.
      for name, val in base_row.items():
        porcentaje = rng.uniform(0, 1)
        signo = rng.choice([-1, 1])
        obj_df[name] = val * (1 + signo * (porcentaje / 100))
      sintetic_id = 'sintetic{}{}'.format(dataname, i)
      obj_df.name = sintetic_id
      sintetics[sintetic_id] = obj_df
      i += 1
  return sintetics

# Generate the synthetic objects of each under-represented class and turn
# every result into a DataFrame. createSintetic returns a dict keyed by
# synthetic oid, so orient='index' makes each entry one row.
sintetics_CEPS = pd.DataFrame.from_dict(createSintetic(CEPS, "CEPS"), orient='index')
sintetics_DSCTS = pd.DataFrame.from_dict(createSintetic(DSCTS, "DSCTS"), orient='index')
sintetics_POS = pd.DataFrame.from_dict(createSintetic(POS, "PeriodicOther"), orient='index')

# Save one CSV per class, labelling the index column "oid".
sintetics_CEPS.to_csv("sintetics_CEPS.csv", index_label="oid")
sintetics_DSCTS.to_csv("sintetics_DSCTS.csv", index_label="oid")
sintetics_POS.to_csv("sintetics_POS.csv", index_label="oid")

# Stack the three classes and save the combined augmented dataset.
sintetics_all = pd.concat([sintetics_CEPS, sintetics_DSCTS, sintetics_POS])
sintetics_all.to_csv("data_augmented_v2.csv", index_label="oid")


In [None]:
# Display the combined table of synthetic objects (all three classes).
sintetics_all

Unnamed: 0,Multiband_period,PPE,Period_band_g,delta_period_g,Period_band_r,delta_period_r,GP_DRW_sigma_g,GP_DRW_tau_g,GP_DRW_sigma_r,GP_DRW_tau_r,Psi_CS_g,Psi_eta_g,Psi_CS_r,Psi_eta_r,Harmonics_mag_1_g,Harmonics_mag_2_g,Harmonics_mag_3_g,Harmonics_mag_4_g,Harmonics_mag_5_g,Harmonics_mag_6_g,Harmonics_mag_7_g,Harmonics_phase_2_g,Harmonics_phase_3_g,Harmonics_phase_4_g,Harmonics_phase_5_g,Harmonics_phase_6_g,Harmonics_phase_7_g,Harmonics_mse_g,Harmonics_mag_1_r,Harmonics_mag_2_r,Harmonics_mag_3_r,Harmonics_mag_4_r,Harmonics_mag_5_r,Harmonics_mag_6_r,Harmonics_mag_7_r,Harmonics_phase_2_r,Harmonics_phase_3_r,Harmonics_phase_4_r,Harmonics_phase_5_r,Harmonics_phase_6_r,...,PairSlopeTrend_g,PercentAmplitude_g,Q31_g,Rcs_g,Skew_g,SmallKurtosis_g,Std_g,StetsonK_g,Pvar_g,ExcessVar_g,SF_ML_amplitude_g,SF_ML_gamma_g,IAR_phi_g,LinearTrend_g,Amplitude_r,AndersonDarling_r,Autocor_length_r,Beyond1Std_r,Con_r,Eta_e_r,Gskew_r,MaxSlope_r,Mean_r,Meanvariance_r,MedianAbsDev_r,MedianBRP_r,PairSlopeTrend_r,PercentAmplitude_r,Q31_r,Rcs_r,Skew_r,SmallKurtosis_r,Std_r,StetsonK_r,Pvar_r,ExcessVar_r,SF_ML_amplitude_r,SF_ML_gamma_r,IAR_phi_r,LinearTrend_r
sinteticCEPS0,0.099187,0.088481,0.098674,0.000000,,,7.165726e-02,0.159470,0.029663,0.365631,0.296575,1.561043,0.469516,2.078758,4.038066,4.045700,2.784466,6.087268,1.480128,0.671312,0.979821,6.165687,5.899208,2.667672,5.414470,2.387264,5.400029,4.071913e-27,4.109930,3.045456,1.517077,0.309993,2.031911,3.391093,4.115313,0.003546,0.028100,2.823447,3.069383,3.029716,...,0.000000,0.031537,0.554065,0.340101,-0.650526,-0.922147,0.272646,0.985866,1.001389,1.983644e-04,0.553932,0.049184,0.000005,0.000371,,,,,,,,,,,,,,,,,,,,,,,,,,
sinteticCEPS1,2.106717,0.058322,2.096756,0.000000,2.108029,0.000000,1.209850e-01,0.279326,0.054191,0.650660,0.262566,0.096629,0.261773,0.189105,0.390708,0.195778,0.121155,0.074522,0.035153,0.019093,0.016228,2.416170,4.372908,0.225614,2.590908,4.476237,0.554853,6.082769e-04,0.258632,0.128185,0.082658,0.051257,0.023398,0.013375,0.007108,2.198563,3.996049,6.062797,1.975719,3.788590,...,-0.033260,0.066036,0.575429,0.075078,-0.803520,-0.763837,0.359000,0.935619,1.006811,6.293318e-04,0.450989,-0.053168,0.029676,0.000042,0.386374,1.001177,1.007468,0.343900,0.0,,-0.433652,inf,13.572902,0.016794,0.098370,0.442592,-0.033293,0.045035,0.354639,0.113321,-0.866796,-0.398910,0.227684,0.844295,0.999651,0.000284,0.316429,-0.024310,0.218254,-0.000182
sinteticCEPS2,0.531230,0.013617,0.525232,0.000000,0.084959,0.444272,2.508825e-01,2.060785,0.111279,0.175659,0.288849,0.570872,0.247879,0.938581,242.450291,199.035758,154.798271,113.139740,57.522434,36.799125,12.829013,3.248034,0.037960,3.325928,0.167236,3.217711,0.595842,3.283664e-03,1011.896876,848.505429,613.881273,384.495969,200.094949,79.272405,17.976791,0.009516,0.031648,0.060136,0.119563,0.249467,...,-0.033159,0.045039,1.110844,0.323919,0.364589,-1.369639,0.511282,0.874900,0.995084,8.103005e-04,0.984755,0.133802,0.613533,0.000345,0.452504,0.998714,0.994400,0.534841,0.0,2.260565,-0.644071,0.206967,17.926531,0.019071,0.098227,0.465064,0.000000,0.042849,0.617811,0.217005,-0.457505,-1.487761,0.340508,0.949422,0.990531,0.000346,0.469241,-0.027751,0.000008,0.000180
sinteticCEPS3,0.168559,0.054469,,,0.170488,0.000000,6.806622e-03,0.996628,0.090265,0.234906,0.458118,4.306927,0.459989,0.633681,2.101684,3.833268,2.339024,2.859249,0.577355,5.610047,3.213452,1.706990,0.689581,4.838154,0.412107,0.411540,0.279784,3.753431e-29,1.413349,3.971024,5.882909,0.902381,2.676728,3.884172,2.625564,2.498980,1.406504,4.239519,4.899649,1.408832,...,,,,,,,,,,,,,,,0.402799,0.911925,1.998387,0.331733,0.0,1.171655,-0.475334,0.158567,18.121212,0.016797,0.121960,0.499179,0.100515,0.035032,0.441045,0.455208,-0.656733,0.782390,0.306483,0.986483,1.005000,0.000274,1.003504,0.246522,0.860119,0.001120
sinteticCEPS4,16.380032,0.194198,16.179698,0.000000,16.317619,0.000000,2.131286e-01,3.160565,0.038689,1.228486,0.286196,0.262969,0.246591,0.405186,0.739807,0.217985,0.215008,0.238702,0.116721,0.062723,0.044006,0.935604,2.534351,3.158696,3.569305,4.742751,5.150683,2.166263e-03,14.357063,16.451170,14.756737,10.329684,5.449268,2.033406,0.425985,0.306104,0.745595,1.256149,1.819237,2.384873,...,0.100130,0.060922,0.961589,0.181896,0.720609,-1.176326,0.480246,0.837229,0.993837,6.522558e-04,0.650946,0.022899,0.738720,0.000468,0.339879,1.001432,1.001702,0.440552,0.0,0.090561,-0.398117,3.625145,18.363526,0.010403,0.093325,0.523909,-0.033646,0.042283,0.209156,0.116617,-1.327228,2.563716,0.190722,0.726876,1.006054,0.000091,0.206342,-0.098643,0.460593,-0.000038
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sinteticPeriodicOther1251,7.154912,0.018768,7.154693,0.000000,,,2.494882e-02,0.182165,,,0.293752,0.591055,,,0.224773,0.030529,0.090396,0.102735,0.050416,0.044487,0.079414,5.233089,4.155278,3.249216,1.310258,0.133454,4.465395,6.504406e-04,,,,,,,,,,,,,...,0.133276,0.017787,0.282849,0.260334,0.120051,-1.445876,0.157226,0.911973,0.997699,1.395807e-04,0.453667,0.406540,0.000010,0.000006,,,,,,,,,,,,,,,,,,,,,,,,,,
sinteticPeriodicOther1252,0.050949,0.001685,0.058426,0.006760,0.076209,0.025653,7.576949e-10,1.067783,0.000378,2.454320,0.267963,3.144672,0.322698,2.914363,91800.411381,79141.072500,53573.450783,77835.839835,69696.675010,105811.522479,36996.452534,4.791746,0.017882,5.009627,1.755394,4.005072,5.629230,9.266901e-16,2.725047,3.530231,1.332670,2.415964,3.230347,3.850166,0.827455,1.460692,6.040533,6.300907,5.755333,3.190564,...,0.066378,0.003361,0.010640,0.280030,2.105450,9.638551,0.017488,0.707509,0.629850,4.154321e-07,0.025733,-0.186914,0.968584,0.000073,0.031716,0.713409,0.999995,0.503942,0.0,0.379659,-0.012071,0.003965,14.779449,0.001514,0.014255,0.496227,-0.033027,0.002541,0.029241,0.428877,-0.155416,1.210866,0.022353,0.882765,1.001295,0.000002,0.433072,0.914034,0.864819,0.000161
sinteticPeriodicOther1253,16.373503,0.060434,16.349996,0.000000,16.547295,0.000000,9.110843e-03,2.410979,0.012548,4.241373,0.201440,0.629022,0.232922,0.487265,0.340004,0.072194,0.163975,0.255130,0.225702,0.136643,0.046679,4.604736,3.041060,0.194256,3.444902,0.461291,3.810736,1.700953e-03,0.140584,0.159499,0.146645,0.110337,0.079193,0.081926,0.052770,2.097808,2.964260,4.183786,4.973409,5.930070,...,-0.033028,0.021868,0.130295,0.239864,-0.751710,1.240214,0.094291,0.735929,1.004743,3.859500e-05,0.122885,0.009284,0.653986,-0.000006,0.199431,1.009854,0.993652,0.181841,0.0,,-0.142804,inf,14.085496,0.007059,0.044657,0.470244,-0.033024,0.021122,0.078158,0.208933,-1.304001,2.046338,0.097360,0.619294,0.998754,0.000049,0.123642,-0.075144,0.752671,0.000006
sinteticPeriodicOther1254,0.062391,0.007360,0.125057,0.063164,0.063076,0.000724,2.028577e-04,5.145935,0.003448,153.658186,0.385287,1.054121,0.357932,1.407881,3.627320,1.137645,1.152582,0.972035,1.434742,3.047443,2.504843,5.495814,3.724180,1.439993,0.484827,4.298796,0.805912,4.249404e-10,2.595978,3.163006,0.363721,3.802128,1.201322,1.266225,3.080217,1.189052,5.160163,3.418278,3.081790,4.490353,...,0.033223,0.001953,0.017567,0.277522,-0.131703,0.073586,0.015493,0.785652,0.996966,1.065319e-06,0.029380,0.234213,0.830719,-0.000041,0.072528,0.990964,0.993573,0.166923,0.0,0.153875,0.081801,0.004620,13.435251,0.003483,0.012229,0.669450,0.033345,0.008491,0.020317,0.354613,1.448201,9.160109,0.046531,0.726698,0.991648,0.000012,0.257223,1.503189,0.989874,-0.000328
