## Importing the DataFrame

In [1]:
# Packages
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Location of the pickled dataframe.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
dataframe = 'C:/Users/Peter/py_projects/solar_stats/cdaw_cme_flare_ar_smart_database.p'

# Option 1: load with the pickle module directly.
# The `with` block closes the file handle even if unpickling raises.
# NOTE: unpickling can execute arbitrary code, so only load files from a
# trusted source.
with open(dataframe, 'rb') as infile:
    df_pickle = pickle.load(infile)

# Option 2: pandas wraps the same steps in a single call.
# This reads the file a second time; `df` is the object used in the cells
# below, and `df_pickle` is kept only to show the alternative loader.
df = pd.read_pickle(dataframe)

# Preview the first five rows
df.head()

Unnamed: 0,cme_time,cme_angle,cme_width,cme_speed,initial_2nd_order,final_2nd_order,20r_2nd_order,cme_acceleration,cme_mass,cme_kinetic_energy,...,smart_positive_area,smart_psl_length,smart_r_value,smart_b_max,smart_b_min,smart_observation_time,smart_string_latlon,smart_total_area,smart_negative_area,smart_hg_longitude
0,1996-08-01 12:36:35,274.0,35.0,499.0,451.0,551.0,615.0,7.4,260000000000000.0,3.2e+29,...,,,,,,NaT,,,,
1,1996-08-01 18:12:21,210.0,82.0,118.0,94.0,142.0,324.0,4.0,98000000000000.0,6.8e+27,...,,,,,,NaT,,,,
2,1996-08-03 19:45:37,231.0,26.0,71.0,59.0,84.0,128.0,0.5,,,...,,,,,,NaT,,,,
3,1996-08-07 13:15:05,242.0,27.0,,,,,,,,...,,,,,,NaT,,,,
4,1996-08-10 09:25:05,68.0,68.0,148.0,133.0,164.0,201.0,1.0,460000000000000.0,5.0000000000000005e+28,...,,,,,,NaT,,,,


In [3]:
# Display the row index of df (its type and the span of row labels)
df.index

RangeIndex(start=0, stop=30228, step=1)

In [4]:
# Display the column labels of df
df.columns

Index(['cme_time', 'cme_angle', 'cme_width', 'cme_speed', 'initial_2nd_order',
       'final_2nd_order', '20r_2nd_order', 'cme_acceleration', 'cme_mass',
       'cme_kinetic_energy', 'cme_mpa', 'cme_remarks', 'cme_quality',
       'cme_halo', 'flare_end_time', 'flare_hg_longitude', 'flare_window_end',
       'flare_hcx_position', 'flare_active_region_no', 'flare_window_start',
       'flare_string_latlon', 'flare_start_time', 'flare_peak_time',
       'flare_type', 'flare_hcy_position', 'flare_hg_latitude',
       'flare_goes_class', 'srs_lon_extent', 'srs_no_spots',
       'srs_carrington_lon', 'srs_active_region_no', 'srs_observation_time',
       'srs_hg_latitude', 'srs_mcintosh', 'srs_area', 'srs_hale',
       'srs_hg_longitude', 'srs_string_latlon', 'smart_hcx_position',
       'smart_flux_fraction', 'smart_limb_event', 'smart_hcy_position',
       'smart_positive_flux', 'smart_bipole_separation', 'smart_negative_flux',
       'smart_b_mean', 'smart_wlsg', 'smart_total_flux', 'sma

In [5]:
# Display the dtype pandas holds for each column of df
df.dtypes

cme_time                   datetime64[ns]
cme_angle                         float64
cme_width                         float64
cme_speed                         float64
initial_2nd_order                 float64
final_2nd_order                   float64
20r_2nd_order                     float64
cme_acceleration                  float64
cme_mass                           object
cme_kinetic_energy                 object
cme_mpa                           float64
cme_remarks                        object
cme_quality                        object
cme_halo                           object
flare_end_time             datetime64[ns]
flare_hg_longitude                 object
flare_window_end           datetime64[ns]
flare_hcx_position                 object
flare_active_region_no             object
flare_window_start         datetime64[ns]
flare_string_latlon                object
flare_start_time           datetime64[ns]
flare_peak_time            datetime64[ns]
flare_type                        

In [6]:
# Missing-value tally: count per column once, then roll it up for the whole frame
col_count = df.isna().sum()
tot_count = col_count.sum()

# Report the grand total first, followed by the per-column breakdown
print('Dataframe NaN count: ', tot_count, sep='')
print('Column NaN count: \n', col_count, sep='')

Dataframe NaN count: 979325
Column NaN count: 
cme_time                       0
cme_angle                    721
cme_width                      0
cme_speed                    103
initial_2nd_order            468
final_2nd_order              468
20r_2nd_order                468
cme_acceleration             468
cme_mass                   10549
cme_kinetic_energy         10581
cme_mpa                        0
cme_remarks                    0
cme_quality                    0
cme_halo                       0
flare_end_time             13676
flare_hg_longitude         22184
flare_window_end               0
flare_hcx_position         24479
flare_active_region_no     18115
flare_window_start             0
flare_string_latlon        24147
flare_start_time           13676
flare_peak_time            13681
flare_type                 13676
flare_hcy_position         24479
flare_hg_latitude          22184
flare_goes_class           19442
srs_lon_extent             22843
srs_no_spots               22

In [7]:
# Count the distinct values in a column.
# Series.nunique counts inside pandas, so the unique values are never copied
# into a Python list just to measure its length.
# dropna=False keeps a missing value (NaN/NaT) counted as one distinct entry,
# which matches what len(Series.unique()) reports.
df["cme_time"].nunique(dropna=False)

29498

In [9]:
# Frequency of each halo class; value_counts sorts by count, not by class label
halo_count = df['cme_halo'].value_counts()
print(halo_count)
print()

# Put the rows back into class order I -> IV
halo_count = halo_count.reindex(['I', 'II', 'III', 'IV'])

# value_counts already returns a Series, so only its name needs changing
halo_series = halo_count.rename('Frequency')
print(halo_series)

# One-column dataframe, with the Series name as the column header
halo_df = halo_series.to_frame()
halo_df

I      27613
II      1334
IV       778
III      503
Name: cme_halo, dtype: int64

I      27613
II      1334
III      503
IV       778
Name: Frequency, dtype: int64


Unnamed: 0,Frequency
I,27613
II,1334
III,503
IV,778
