## Correcting some columns without removing data

In [1]:
# Packages
import pandas as pd
import numpy as np

In [2]:
# Path to the pickled source database.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs as-is on a machine that has this exact directory layout.
dataframe = 'C:/Users/Peter/py_projects/solar_stats/cdaw_cme_flare_ar_smart_database.p'

# Load the pickled DataFrame. Unpickling can execute arbitrary code, so only
# load pickle files that come from a trusted source.
df = pd.read_pickle(dataframe)

### GOES flux

In [3]:
# Split flare_goes_class into two columns so that the class letter is separated
# from the flux value.
#
# The pattern ([a-zA-Z]+)([^a-zA-Z]+) captures a group of letters followed by a
# group of non-letters. Rows that do not match the pattern get NaN in both columns.
glux = df['flare_goes_class'].str.extract(r'([a-zA-Z]+)([^a-zA-Z]+)', expand=True)
glux.columns = ['goes_class', 'goes_flux']

# Add the extracted class letter to df as a new column.
df['goes_class'] = glux['goes_class']

In [4]:
# Inspect the extracted class / flux columns.
glux

Unnamed: 0,goes_class,goes_flux
0,,
1,,
2,,
3,,
4,,
...,...,...
30223,,
30224,,
30225,,
30226,,


In [5]:
# Temporarily fill the missing entries with the placeholder 0 so that the numeric
# checks below flag only malformed strings. The zeros are turned back into NaN
# once the flux values have been scaled.
glux = glux.fillna(value=0)

In [6]:
# List the goes_flux entries that cannot be parsed as numbers: errors='coerce'
# turns every unparsable value into NaN, and those rows are printed.
flux_as_number = pd.to_numeric(glux['goes_flux'], errors='coerce')
print(glux['goes_flux'][flux_as_number.isnull()])

5433      7,4
5434      7,4
5585     6.1*
5640     2.6*
5641     2.6*
         ... 
28147    2.3*
28330    2.8*
28568    1.4*
28675    1.4*
28723    1.1*
Name: goes_flux, Length: 331, dtype: object


In [7]:
# Strip the asterisks attached to some flux values (e.g. '6.1*').
# The pattern is written as a raw string: in a normal string literal '\*' is an
# invalid escape sequence, which Python reports with a warning.
# Series.replace is kept (instead of .str.replace) because the column also holds
# the integer 0 placeholders, which must pass through unchanged.
glux['goes_flux'] = glux['goes_flux'].replace(r'\*', '', regex=True)

In [8]:
# Re-run the numeric check to see which goes_flux entries are still unparsable.
flux_as_number = pd.to_numeric(glux['goes_flux'], errors='coerce')
print(glux['goes_flux'][flux_as_number.isnull()])

5433    7,4
5434    7,4
Name: goes_flux, dtype: object


In [9]:
# Replace decimal commas with periods (e.g. '7,4' -> '7.4').
# A comma has no special meaning in a regular expression, so it needs no
# backslash; '\,' in a normal string literal is an invalid escape sequence.
# Series.replace is kept (instead of .str.replace) because the column also holds
# the integer 0 placeholders, which must pass through unchanged.
glux['goes_flux'] = glux['goes_flux'].replace(',', '.', regex=True)

In [10]:
# Final numeric check: an empty Series means every goes_flux entry now parses.
flux_as_number = pd.to_numeric(glux['goes_flux'], errors='coerce')
print(glux['goes_flux'][flux_as_number.isnull()])

Series([], Name: goes_flux, dtype: object)


In [11]:
# Convert the cleaned flux strings (and the 0 placeholders) to floats so they
# can be used in arithmetic.
glux['goes_flux'] = glux['goes_flux'].astype(float)

In [12]:
# Inspect the frame after the float conversion (missing entries currently hold 0).
glux

Unnamed: 0,goes_class,goes_flux
0,0,0.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0
...,...,...
30223,0,0.0
30224,0,0.0
30225,0,0.0
30226,0,0.0


In [13]:
# Scale each flux value by the power of ten that belongs to its GOES class letter.
# Rows whose class is none of the listed letters are left unscaled.
class_scale = {'A': 1e-8, 'B': 1e-7, 'C': 1e-6, 'M': 1e-5, 'X': 1e-4}
for class_letter, factor in class_scale.items():
    glux.loc[glux['goes_class'] == class_letter, 'goes_flux'] *= factor

In [14]:
# Turn the 0 placeholders back into NaN in both columns.
for column in ('goes_flux', 'goes_class'):
    glux[column] = glux[column].replace({0: np.nan})

In [15]:
# Inspect the class / flux frame after the 0 placeholders were turned back into NaN.
glux

Unnamed: 0,goes_class,goes_flux
0,,
1,,
2,,
3,,
4,,
...,...,...
30223,,
30224,,
30225,,
30226,,


In [16]:
# Display the main DataFrame, which now carries the goes_class column added earlier.
df

Unnamed: 0,cme_time,cme_angle,cme_width,cme_speed,initial_2nd_order,final_2nd_order,20r_2nd_order,cme_acceleration,cme_mass,cme_kinetic_energy,...,smart_psl_length,smart_r_value,smart_b_max,smart_b_min,smart_observation_time,smart_string_latlon,smart_total_area,smart_negative_area,smart_hg_longitude,goes_class
0,1996-08-01 12:36:35,274.0,35.0,499.0,451.0,551.0,615.0,7.4,2.6e+14,3.2e+29,...,,,,,NaT,,,,,
1,1996-08-01 18:12:21,210.0,82.0,118.0,94.0,142.0,324.0,4.0,9.8e+13,6.8e+27,...,,,,,NaT,,,,,
2,1996-08-03 19:45:37,231.0,26.0,71.0,59.0,84.0,128.0,0.5,,,...,,,,,NaT,,,,,
3,1996-08-07 13:15:05,242.0,27.0,,,,,,,,...,,,,,NaT,,,,,
4,1996-08-10 09:25:05,68.0,68.0,148.0,133.0,164.0,201.0,1.0,4.6e+14,5.0e+28,...,,,,,NaT,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30223,2020-05-29 11:24:05,84.0,52.0,347.0,406.0,288.0,0.0,-21.9,,,...,,,,,NaT,,,,,
30224,2020-05-29 15:12:05,83.0,46.0,204.0,215.0,193.0,0.0,-2.8,,,...,,,,,NaT,,,,,
30225,2020-05-29 17:36:05,76.0,34.0,210.0,295.0,120.0,0.0,-19.3,,,...,,,,,NaT,,,,,
30226,2020-05-30 01:25:43,80.0,39.0,269.0,203.0,339.0,331.0,2.8,,,...,,,,,NaT,,,,,


In [17]:
# Store the scaled flux values in df as a new goes_flux column.
df['goes_flux'] = glux['goes_flux']

In [18]:
# Check the goes_flux column as it is now stored in df.
df['goes_flux']

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
30223   NaN
30224   NaN
30225   NaN
30226   NaN
30227   NaN
Name: goes_flux, Length: 30228, dtype: float64

### SRS Hale Classes

In [19]:
# Map the SRS Hale class labels to numeric codes. Both spellings of each label
# (e.g. "BETA" and "Beta") map to the same code so that they are grouped together.
hale_codes = {"ALPHA": 1, "Alpha": 1,
              "BETA": 2, "Beta": 2,
              "BETA-GAMMA": 3, "Beta-Gamma": 3,
              "BETA-DELTA": 4, "Beta-Delta": 4,
              "BETA-GAMMA-DELTA": 5, "Beta-Gamma-Delta": 5,
              "GAMMA-DELTA": 6, "Gamma-Delta": 6}

# Assign the result back to the column instead of calling replace(..., inplace=True)
# on df['srs_hale']: the in-place call acts on an intermediate object and, with
# pandas Copy-on-Write, never updates df.
df['srs_hale'] = df['srs_hale'].replace(hale_codes)

In [20]:
# Inspect the srs_hale column after the label replacement.
df['srs_hale']

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
30223   NaN
30224   NaN
30225   NaN
30226   NaN
30227   NaN
Name: srs_hale, Length: 30228, dtype: float64

### CME halo

In [21]:
# Replace the Roman-numeral halo classes with numeric strings, then convert the
# whole column to float.
halo_codes = {"I": "1", "II": "2", "III": "3", "IV": "4"}

# Assign the result back to the column instead of calling replace(..., inplace=True)
# on df['cme_halo']: the in-place call acts on an intermediate object and, with
# pandas Copy-on-Write, never updates df.
df['cme_halo'] = df['cme_halo'].replace(halo_codes).astype(float)

In [22]:
# Inspect the cme_halo column after the conversion to float.
df['cme_halo']

0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
30223    1.0
30224    1.0
30225    1.0
30226    1.0
30227    1.0
Name: cme_halo, Length: 30228, dtype: float64

### CME Kinetic Energy

In [23]:
# Temporarily fill the missing kinetic energies with the placeholder 0 so that
# the numeric check below flags only malformed entries. The zeros are turned
# back into NaN after the conversion to float.
df['cme_kinetic_energy'] = df['cme_kinetic_energy'].fillna(value=0)

In [24]:
# List the cme_kinetic_energy entries that cannot be parsed as numbers:
# errors='coerce' turns every unparsable value into NaN, and those rows are printed.
energy_as_number = pd.to_numeric(df['cme_kinetic_energy'], errors='coerce')
print(df['cme_kinetic_energy'][energy_as_number.isnull()])

10495    
Name: cme_kinetic_energy, dtype: object


In [25]:
# Some entries are blank strings; replace them with the placeholder 0.
# The pattern is anchored (^\s*$) so that only empty or whitespace-only strings
# are replaced. An unanchored empty pattern matches every string, which would
# also overwrite any numeric value that happens to be stored as text.
df['cme_kinetic_energy'] = df['cme_kinetic_energy'].replace(r'^\s*$', 0, regex=True)

In [26]:
# Convert the cleaned kinetic energy column to float.
kinetic_energy = df['cme_kinetic_energy']
df['cme_kinetic_energy'] = kinetic_energy.astype(float)

In [27]:
# Turn the 0 placeholders back into NaN.
df['cme_kinetic_energy'] = df['cme_kinetic_energy'].replace({0: np.nan})

In [28]:
# Inspect the cme_kinetic_energy column after the clean-up and float conversion.
df['cme_kinetic_energy']

0        3.200000e+29
1        6.800000e+27
2                 NaN
3                 NaN
4        5.000000e+28
             ...     
30223             NaN
30224             NaN
30225             NaN
30226             NaN
30227             NaN
Name: cme_kinetic_energy, Length: 30228, dtype: float64

### NOTE: there are many zero values

In [31]:
# Count, for every column, how many entries are exactly zero.
df.eq(0).astype(int).sum(axis=0)

cme_time                 0
cme_angle               48
cme_width                0
cme_speed                0
initial_2nd_order      846
                      ... 
smart_total_area         0
smart_negative_area      4
smart_hg_longitude       1
goes_class               0
goes_flux                0
Length: 61, dtype: int64

In [32]:
# Leaving the zero values as they are for now. The fix is simple: replace them with NaNs and drop them when needed.

## Fixing the issue with the LMSAL entries in the flare_end_time column

In [None]:
# Keep only the flare start and end times, drop every row where either one is
# missing, and renumber the rows. reset_index() without drop=True keeps the
# previous index as an extra 'index' column.
time_columns = ['flare_start_time', 'flare_end_time']
df1 = df[time_columns].dropna(how='any').reset_index()

In [None]:
# Flare durations, as returned by calculate_flare_duration for the start and
# end time columns of df1.
# NOTE(review): calculate_flare_duration is neither defined nor imported in the
# visible part of this notebook — confirm where it comes from; otherwise this
# cell raises a NameError on a fresh kernel.
durations = calculate_flare_duration(df1['flare_start_time'], df1['flare_end_time'])

In [None]:
# Count the flares with a negative duration, then those with a positive one.
is_negative = durations < 0
is_positive = durations > 0
print(is_negative.astype(int).sum(axis=0))
print(is_positive.astype(int).sum(axis=0))

In [None]:
# Collect the df1 index labels of the entries whose duration is negative.
negative_mask = durations < 0
wrong_ind = df1.index[negative_mask].tolist()

In [None]:
# The number of collected indices should equal the count of negative durations
# printed above.
len(wrong_ind)

In [None]:
# Build a frame with the flare times and flare_type to see which database the
# wrong entries come from.
# Rows are dropped only where a start or end time is missing — the same rule
# used to build df1 — so that row positions here line up with the positions
# stored in wrong_ind. Dropping on flare_type as well could remove extra rows
# and shift the positions, so rows with a missing flare_type are kept.
nd = df[['flare_start_time', 'flare_end_time', 'flare_type']]
nd = nd.dropna(how='any', subset=['flare_start_time', 'flare_end_time'])
nd = nd.reset_index(drop=True)

In [None]:
# Select, by position, the rows of nd listed in wrong_ind and display them.
wrong_db = nd.iloc[wrong_ind]
wrong_db

### Saving the new df

In [33]:
# Save the corrected DataFrame as a pickle. The path is relative, so the file is
# written to the current working directory.
df.to_pickle('custom_df.p')

In [35]:
# Test the save by reading the file back.
# The same relative path that df.to_pickle wrote to is used here, so the check
# always loads the file that was just saved rather than a file at a fixed
# absolute location, which may be missing or stale.
df2 = 'custom_df.p'
test = pd.read_pickle(df2)

Unnamed: 0,cme_time,cme_angle,cme_width,cme_speed,initial_2nd_order,final_2nd_order,20r_2nd_order,cme_acceleration,cme_mass,cme_kinetic_energy,...,smart_r_value,smart_b_max,smart_b_min,smart_observation_time,smart_string_latlon,smart_total_area,smart_negative_area,smart_hg_longitude,goes_class,goes_flux
0,1996-08-01 12:36:35,274.0,35.0,499.0,451.0,551.0,615.0,7.4,2.6e+14,3.200000e+29,...,,,,NaT,,,,,,
1,1996-08-01 18:12:21,210.0,82.0,118.0,94.0,142.0,324.0,4.0,9.8e+13,6.800000e+27,...,,,,NaT,,,,,,
2,1996-08-03 19:45:37,231.0,26.0,71.0,59.0,84.0,128.0,0.5,,,...,,,,NaT,,,,,,
3,1996-08-07 13:15:05,242.0,27.0,,,,,,,,...,,,,NaT,,,,,,
4,1996-08-10 09:25:05,68.0,68.0,148.0,133.0,164.0,201.0,1.0,4.6e+14,5.000000e+28,...,,,,NaT,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30223,2020-05-29 11:24:05,84.0,52.0,347.0,406.0,288.0,0.0,-21.9,,,...,,,,NaT,,,,,,
30224,2020-05-29 15:12:05,83.0,46.0,204.0,215.0,193.0,0.0,-2.8,,,...,,,,NaT,,,,,,
30225,2020-05-29 17:36:05,76.0,34.0,210.0,295.0,120.0,0.0,-19.3,,,...,,,,NaT,,,,,,
30226,2020-05-30 01:25:43,80.0,39.0,269.0,203.0,339.0,331.0,2.8,,,...,,,,NaT,,,,,,
