In [1]:
import numpy as np
import pandas as pd
import os
import math

In [2]:
# Input catalogue: one (csv path, integer class id) pair per class file.
# The class id is what later cells write into the "id" column.
stars_files = list(
    zip(
        ("./normalized/SY.csv", "./normalized/NP.csv", "./normalized/RG.csv"),
        (1, 2, 3),
    )
)

In [3]:
# Sanity check: load the third class file (RG) without a header row and
# display its (rows, columns) shape.
rg_csv_path = stars_files[2][0]
df1 = pd.read_csv(rg_csv_path, header=None)
df1.shape

(69146, 343)

## Create Unbalanced Data (up to 1,200 samples per class)

In [4]:
# Build the "unbalanced" dataset: every class contributes at most `limit`
# randomly sampled rows, so a class with fewer rows contributes all of them.
limit = 1200
result = []
for csv_path, class_id in stars_files:
    spectra = pd.read_csv(csv_path, header=None)
    # Cap the request at the file's own row count; sample() without
    # replacement cannot return more rows than exist.
    n_rows = min(len(spectra), limit)
    sampled = spectra.sample(n_rows, random_state=10)
    # Tag every sampled row with its class id in a column named "id".
    result.append(sampled.assign(id=class_id))

df_result = pd.concat(result)

out_dir = './datasets'
# makedirs(exist_ok=True) replaces the check-then-create pair, which can
# race with another process creating the directory in between.
os.makedirs(out_dir, exist_ok=True)
fullname = os.path.join(out_dir, "unbalanced.csv")
# Neither the column header nor the index is written to the file.
df_result.to_csv(fullname, header=False, index=False)

## Create Unbalanced Data (up to 15,000 samples per class)

In [None]:
# Larger variant of the unbalanced dataset: up to `limit` rows per class,
# sampled with a fixed seed; classes with fewer rows are taken in full.
limit = 15000
result = []
for csv_path, class_id in stars_files:
    class_rows = pd.read_csv(csv_path, header=None)
    # Never request more rows than the file holds.
    take = min(len(class_rows), limit)
    subset = class_rows.sample(take, random_state=10)
    # Add the class id as an "id" column on each sampled row.
    result.append(subset.assign(id=class_id))

df_result = pd.concat(result)

out_dir = './datasets'
# Create the output directory if needed; exist_ok avoids the race inherent
# in a separate exists() check followed by mkdir().
os.makedirs(out_dir, exist_ok=True)
fullname = os.path.join(out_dir, "unbalanced_15000.csv")
# Written without a header row and without the index column.
df_result.to_csv(fullname, header=False, index=False)

## Create Balanced Data (1,000 samples per class)

In [95]:
# Number of rows every class contributes to the balanced dataset.
limit = 1_000

In [96]:
def normalize(spectra):
    """Rescale a spectrum linearly so its values span the range 0 to 1.

    Parameters
    ----------
    spectra : numpy.ndarray or pandas.Series
        Numeric values of one spectrum.

    Returns
    -------
    Same type as ``spectra``
        ``(spectra - min) / (max - min)``. A flat spectrum (max == min) is
        returned as all zeros instead of dividing by zero.
    """
    lo = spectra.min()
    hi = spectra.max()
    span = hi - lo
    if span == 0:
        # Flat spectrum: the min-max formula would be 0/0 and yield NaN.
        # Subtracting the minimum gives zeros while keeping type and labels.
        return spectra - lo
    # Normalize the spectrum to the range 0 to 1.
    return (spectra - lo) / span

In [97]:
# Five fixed Gaussian noise vectors (zero mean, standard deviation 0.01 to
# 0.05), one per noisy copy that add_noise can make of a row. Each has 343
# entries, matching the 343 columns of the input files.
# They are drawn from a seeded generator so the balanced dataset is
# reproducible after a kernel restart; the seed matches the random_state
# used for sampling elsewhere in the notebook.
noise_rng = np.random.default_rng(10)
noises = [
    noise_rng.normal(loc=0.0, scale=scale, size=343)
    for scale in (0.01, 0.02, 0.03, 0.04, 0.05)
]

In [98]:
def add_noise(df2, size):
    """Pad ``df2`` with noisy copies of its rows until it has at least ``size`` rows.

    Every original row is kept unchanged and is followed by
    ``ceil(size / len(df2)) - 1`` copies. Copy ``i`` is the row plus the
    module-level ``noises[i]`` vector, passed through ``normalize``.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Rows to augment; each row must be addable to the vectors in ``noises``.
    size : int
        Minimum number of rows wanted in the result.

    Returns
    -------
    pandas.DataFrame
        The original rows, each followed by its noisy copies.

    Raises
    ------
    ValueError
        If ``df2`` is empty, or if more copies per row are needed than there
        are vectors in ``noises``.
    """
    if len(df2) == 0:
        # Previously surfaced as a ZeroDivisionError in the ratio below.
        raise ValueError("add_noise: cannot augment an empty DataFrame")

    # Noisy copies needed per original row to reach `size` rows in total.
    cant = max(math.ceil(size / len(df2)) - 1, 0)
    if cant > len(noises):
        # Previously surfaced as an IndexError deep inside the loop.
        raise ValueError(
            f"add_noise: {cant} noisy copies per row are needed to reach "
            f"{size} rows from {len(df2)}, but only {len(noises)} noise "
            "vectors are defined"
        )

    result = []
    for _, row in df2.iterrows():
        result.append(row)
        for noise in noises[:cant]:
            # Re-normalize because adding noise can push values outside the
            # row's original range.
            result.append(normalize(row + noise))

    return pd.DataFrame(result)

In [99]:
# Build the balanced dataset: exactly `limit` rows from every class.
result = []
for csv_path, class_id in stars_files:
    spectra = pd.read_csv(csv_path, header=None)
    # A class with fewer than `limit` rows is first padded with noisy copies
    # so the fixed-size sample below can be drawn.
    if len(spectra) < limit:
        spectra = add_noise(spectra, limit)

    picked = spectra.sample(limit, random_state=10)
    # Tag each selected row with its class id in an "id" column.
    result.append(picked.assign(id=class_id))

df_result = pd.concat(result)

In [100]:
# Write the balanced dataset to ./datasets/balanced.csv without a header row
# and without the index column.
out_dir = './datasets'
# makedirs(exist_ok=True) creates the directory only when missing and avoids
# the race between a separate exists() check and mkdir().
os.makedirs(out_dir, exist_ok=True)
fullname = os.path.join(out_dir, "balanced.csv")
df_result.to_csv(fullname, header=False, index=False)