# Clean Data

Limpiar y organizar datos para usar en el análisis:

- Arregla decimales

- Borra duplicados

## Packages

In [1]:
from datetime import datetime, timedelta
import pandas as pd
import os

## Parameters

In [3]:
# Number of days covered by the analysis; used in the folder and file names below.
n = 185
# Run date in YYYY-MM-DD format. Currently pinned to a fixed value; the dynamic
# version is kept commented out on the next line.
#fecha_actual = datetime.now().strftime("%Y-%m-%d")
fecha_actual = "2025-11-04"
window_minutes = 30
output_dir = f"{fecha_actual}_Analysis_for_{n}_days"

# Create the main folder (if it does not exist) and report which case applied.
if os.path.exists(output_dir):
    print(f"‚ö†Ô∏è La carpeta principal ya exist√≠a: {output_dir}")
else:
    os.makedirs(output_dir, exist_ok=True)
    print(f"üìÅ Carpeta principal creada: {output_dir}")

# Paths of the input CSVs. They are built from output_dir (instead of repeating
# the folder-name pattern) so they cannot drift from the folder created above.
# The files are copied by hand from Data and renamed with n.
csv_path_full = f"{output_dir}/df_full_goes_{n}.csv"
csv_path_flares = f"{output_dir}/df_full_flares_{n}.csv"

# Check that both input files exist and report the result; a missing file only
# produces a warning here.
if os.path.exists(csv_path_full):
    print(f"‚úÖ Archivo GOES encontrado: {csv_path_full}")
else:
    print(f"‚ö†Ô∏è No se encontr√≥ el archivo GOES: {csv_path_full}")

if os.path.exists(csv_path_flares):
    print(f"‚úÖ Archivo de flares encontrado: {csv_path_flares}")
else:
    print(f"‚ö†Ô∏è No se encontr√≥ el archivo de flares: {csv_path_flares}")

‚ö†Ô∏è La carpeta principal ya exist√≠a: 2025-11-04_Analysis_for_185_days
‚úÖ Archivo GOES encontrado: 2025-11-04_Analysis_for_185_days/df_full_goes_185.csv
‚úÖ Archivo de flares encontrado: 2025-11-04_Analysis_for_185_days/df_full_flares_185.csv


In [4]:
# Read both raw inputs exactly as stored on disk (no date parsing at this stage);
# the paths come from the parameters cell.
df_full = pd.read_csv(filepath_or_buffer=csv_path_full)
df_flares = pd.read_csv(filepath_or_buffer=csv_path_flares)

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns of df_full.
df_full.describe()

Unnamed: 0,xrsa,xrsb,xrsa_corr,xrsb_corr,T_cor,EM_cor,T_phot,EM_phot,EM_cor_norm,EM_phot_norm
count,261303.0,261302.0,96601.0,91943.0,260786.0,91943.0,260786.0,91943.0,91943.0,91943.0
mean,1.995556e-06,1.171021e-05,1.238707e-06,4.455114e-06,3821.194,1.1369579999999999e+51,410.9869,2.0072389999999998e+51,113.6958,200.7239
std,1.733392e-05,5.600244e-05,1.433194e-05,4.088831e-05,1284670.0,3.3878900000000003e+53,107580.9,5.934095e+53,33878.9,59340.95
min,1e-09,1e-09,0.0,0.0,1.028558,0.0,1.027503,0.0,0.0,0.0
25%,3.524724e-08,1.60699e-06,2.692161e-09,3.821003e-08,2.795147,6.913854e+46,2.637235,1.905335e+47,0.006913854,0.01905335
50%,9.223707e-08,2.960039e-06,1.127986e-08,1.463442e-07,2.811526,2.100401e+47,2.645967,5.907192e+47,0.02100401,0.05907192
75%,3.193478e-07,6.191495e-06,5.954828e-08,5.932917e-07,3.269362,6.279393e+47,2.993372,1.800797e+48,0.06279393,0.1800797
max,0.0007807088,0.002629195,0.0007713941,0.002488573,640799600.0,1.0272420000000001e+56,37796450.0,1.7992140000000002e+56,10272420.0,17992140.0


### Function

In [6]:
def check_and_fix_csv(csv_path, output_dir, n, output_filename, time_col="Unnamed: 0"):
    """
    Load a CSV, normalise its time column and save a cleaned copy.

    Steps, in order:
    1. Strip fractional seconds from the timestamps (everything after the
       first "." in the value).
    2. Parse the time column as datetime using "%Y-%m-%d %H:%M:%S".
    3. Drop rows whose timestamp is missing or could not be parsed.
    4. Drop duplicated timestamps, keeping the first occurrence.
    5. Sort by time and compare the most common time step against 1 minute.
       This is only a check: the data are never resampled.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    output_dir : str
        Directory where the cleaned CSV is written. Created if it is missing.
    n : int
        Not used inside this function; kept so existing calls keep working.
    output_filename : str
        File name of the cleaned CSV inside ``output_dir``.
    time_col : str, default "Unnamed: 0"
        Name of the column that holds the timestamps.

    Returns
    -------
    pandas.DataFrame or None
        The cleaned frame sorted by ``time_col`` (the index is not reset), or
        ``None`` when the file cannot be read or ``time_col`` is not one of
        its columns.
    """

    print(f"\n=== Checking file: {csv_path} ===")

    # --- Load CSV ---
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        print(f"‚ùå Error reading the file: {e}")
        return None

    changes = []  # modifications actually applied to the data
    notes = []    # observations that did not modify the data

    # --- Verify time column ---
    if time_col not in df.columns:
        print(f"‚ùå The time column '{time_col}' does not exist in the CSV.")
        return None

    # --- Strip fractional seconds ---
    original_time_values = df[time_col].astype(str)
    has_fraction = original_time_values.str.contains(r"\.")
    if has_fraction.any():
        print("üßπ Decimal points detected in timestamps. Removing fractional seconds...")
        # Record the edit so the final report does not claim the data were untouched.
        changes.append(f"Removed fractional seconds from {int(has_fraction.sum())} timestamps.")
        df[time_col] = original_time_values.str.split(".").str[0]
    else:
        print("‚úÖ No decimal points found in timestamps.")

    # --- Convert to datetime ---
    # With errors="coerce", values that do not match the format become NaT
    # instead of raising, so the except branch is only a safety net for
    # unexpected errors.
    try:
        df[time_col] = pd.to_datetime(df[time_col], format="%Y-%m-%d %H:%M:%S", errors="coerce")
        if df[time_col].isna().any():
            print("‚ö†Ô∏è Some timestamps could not be parsed using the strict format. They were set to NaT.")
    except Exception as e:
        print(f"‚ö†Ô∏è Error while converting to datetime: {e}")
        print("üîÑ Retrying with automatic format detection...")
        df[time_col] = pd.to_datetime(df[time_col], errors="coerce")

    # --- Drop invalid (NaT) rows ---
    n_null = df[time_col].isna().sum()
    if n_null > 0:
        print(f"‚ö†Ô∏è Found {n_null} invalid or missing timestamps. Removing them...")
        changes.append(f"Removed {n_null} rows with invalid or missing timestamps.")
        df = df.dropna(subset=[time_col])
    else:
        print("‚úÖ No invalid or missing timestamps found.")

    # --- Check for duplicates ---
    duplicate_count = df.duplicated(subset=time_col).sum()
    if duplicate_count > 0:
        print(f"‚ö†Ô∏è Found {duplicate_count} duplicated timestamps. Keeping the first occurrence...")
        changes.append(f"Removed {duplicate_count} duplicate rows.")
        n_before = len(df)
        df = df.drop_duplicates(subset=time_col, keep="first")
        n_after = len(df)
        print(f"   ‚Üí Rows before: {n_before}, after: {n_after}")
    else:
        print("‚úÖ No duplicated timestamps found.")

    # --- Check time resolution ---
    df = df.sort_values(by=time_col)
    diffs = df[time_col].diff().dropna()
    freq_counts = diffs.value_counts()  # sorted by count, most frequent step first

    if not freq_counts.empty:
        most_common_freq = freq_counts.index[0]
        if most_common_freq.total_seconds() == 60:
            print("‚úÖ Main resolution: 1 minute")
        else:
            # Nothing is modified here, so this is a note rather than a change.
            notes.append(f"Detected time step: {most_common_freq}.")
            print(f"‚ö†Ô∏è Main resolution is not 1 minute, detected: {most_common_freq}")
    else:
        print("‚ö†Ô∏è Could not determine time resolution (too few rows).")

    # --- Save cleaned CSV ---
    # Make sure the target folder exists; an empty output_dir means the
    # current directory, which os.makedirs cannot be called on.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, output_filename)
    df.to_csv(output_path, index=False)

    # --- Final report ---
    print(f"\nFile saved to: {output_path}")
    if changes:
        print("üîß Changes made:")
        for c in changes:
            print(" - " + c)
    else:
        print("‚úÖ Data were already clean. No changes applied.")
    if notes:
        print("Notes (data not modified):")
        for note in notes:
            print(" - " + note)

    return df


## data

In [7]:
# Column names, non-null counts and dtypes of the raw flare table (before cleaning).
df_flares.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2022 entries, 0 to 2021
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   StartTime    2022 non-null   object
 1   EndTime      2022 non-null   object
 2   Class        2022 non-null   object
 3   Observatory  2022 non-null   object
 4   PeakTime     2022 non-null   object
dtypes: object(5)
memory usage: 79.1+ KB


## Cleaning

In [8]:

# Clean the GOES CSV using its "date" column as the timestamp; the cleaned copy
# is written inside output_dir and also returned.
df_full_clean = check_and_fix_csv(
    csv_path=csv_path_full,
    output_dir=output_dir,
    n=n,
    output_filename=f"all_df_full_{n}_cleaned.csv",
    time_col="date",
)



=== Checking file: 2025-11-04_Analysis_for_185_days/df_full_goes_185.csv ===
‚úÖ No decimal points found in timestamps.
‚úÖ No invalid or missing timestamps found.
‚úÖ No duplicated timestamps found.
‚úÖ Main resolution: 1 minute

File saved to: 2025-11-04_Analysis_for_185_days/all_df_full_185_cleaned.csv
‚úÖ Data were already clean. No changes applied.


In [9]:
# Clean the flare CSV using "PeakTime" as the timestamp. Only that column is
# parsed to datetime; StartTime and EndTime are left as read from the file.
# NOTE(review): the flare table looks like an event list rather than a regular
# series, so the "not 1 minute" resolution warning is presumably expected here — confirm.
df_flares_clean = check_and_fix_csv(
    csv_path=csv_path_flares,
    output_dir=output_dir,
    n=n,
    output_filename=f"all_df_flares_{n}_cleaned.csv",
    time_col="PeakTime",
)


=== Checking file: 2025-11-04_Analysis_for_185_days/df_full_flares_185.csv ===
‚úÖ No decimal points found in timestamps.
‚úÖ No invalid or missing timestamps found.
‚úÖ No duplicated timestamps found.
‚ö†Ô∏è Main resolution is not 1 minute, detected: 0 days 00:21:00

File saved to: 2025-11-04_Analysis_for_185_days/all_df_flares_185_cleaned.csv
üîß Changes made:
 - Detected time step: 0 days 00:21:00.


In [10]:
# Inspect dtypes and non-null counts of the cleaned GOES frame (the time column should now be datetime).
df_full_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266400 entries, 0 to 266399
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   date          266400 non-null  datetime64[ns]
 1   observatory   266400 non-null  object        
 2   xrsa          261303 non-null  float64       
 3   xrsb          261302 non-null  float64       
 4   xrsa_corr     96601 non-null   float64       
 5   xrsb_corr     91943 non-null   float64       
 6   T_cor         260786 non-null  float64       
 7   EM_cor        91943 non-null   float64       
 8   T_phot        260786 non-null  float64       
 9   EM_phot       91943 non-null   float64       
 10  EM_cor_norm   91943 non-null   float64       
 11  EM_phot_norm  91943 non-null   float64       
dtypes: datetime64[ns](1), float64(10), object(1)
memory usage: 24.4+ MB


In [11]:
# Inspect dtypes and non-null counts of the cleaned flare frame.
df_flares_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2022 entries, 0 to 2021
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   StartTime    2022 non-null   object        
 1   EndTime      2022 non-null   object        
 2   Class        2022 non-null   object        
 3   Observatory  2022 non-null   object        
 4   PeakTime     2022 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(4)
memory usage: 94.8+ KB


In [12]:
# Display the cleaned flare table.
df_flares_clean

Unnamed: 0,StartTime,EndTime,Class,Observatory,PeakTime
0,2000-06-06 00:28:00,2000-06-06 00:55:00,C4.6,GOES,2000-06-06 00:43:00
1,2000-06-06 01:30:00,2000-06-06 02:01:00,C2.4,GOES,2000-06-06 01:49:00
2,2000-06-06 08:06:00,2000-06-06 08:34:00,C2.8,GOES,2000-06-06 08:16:00
3,2000-06-06 08:47:00,2000-06-06 08:54:00,C2.4,GOES,2000-06-06 08:51:00
4,2000-06-06 11:23:00,2000-06-06 11:29:00,C1.8,GOES,2000-06-06 11:27:00
...,...,...,...,...,...
2017,2025-06-19 07:13:00,2025-06-19 07:51:00,C6.0,GOES,2025-06-19 07:27:00
2018,2025-06-19 08:23:00,2025-06-19 08:29:00,C2.3,GOES,2025-06-19 08:27:00
2019,2025-06-19 09:06:00,2025-06-19 09:53:00,C7.9,GOES,2025-06-19 09:17:00
2020,2025-06-19 10:31:00,2025-06-19 10:39:00,C1.9,GOES,2025-06-19 10:37:00


In [13]:
# Display the raw flare table (presumably for visual comparison with the cleaned one above).
df_flares

Unnamed: 0,StartTime,EndTime,Class,Observatory,PeakTime
0,2000-06-06 00:28:00,2000-06-06 00:55:00,C4.6,GOES,2000-06-06 00:43:00
1,2000-06-06 01:30:00,2000-06-06 02:01:00,C2.4,GOES,2000-06-06 01:49:00
2,2000-06-06 08:06:00,2000-06-06 08:34:00,C2.8,GOES,2000-06-06 08:16:00
3,2000-06-06 08:47:00,2000-06-06 08:54:00,C2.4,GOES,2000-06-06 08:51:00
4,2000-06-06 11:23:00,2000-06-06 11:29:00,C1.8,GOES,2000-06-06 11:27:00
...,...,...,...,...,...
2017,2025-06-19 07:13:00,2025-06-19 07:51:00,C6.0,GOES,2025-06-19 07:27:00
2018,2025-06-19 08:23:00,2025-06-19 08:29:00,C2.3,GOES,2025-06-19 08:27:00
2019,2025-06-19 09:06:00,2025-06-19 09:53:00,C7.9,GOES,2025-06-19 09:17:00
2020,2025-06-19 10:31:00,2025-06-19 10:39:00,C1.9,GOES,2025-06-19 10:37:00
