# Create the Features and Labels Files
After calculating the lifted index, we aggregate all of the feature files and all of the label files, join the features to their labels, and write the result as a single combined parquet file.

In [1]:
import os
import glob
import pandas as pd

# Input folders: the feature CSVs and the label CSVs are globbed from these paths
SILVER_GPH20S10K_PATH = '/usr/datalake/silver/igra/gph20s10k'
SILVER_LI_PATH = '/usr/datalake/silver/igra/li'
# Output folder for the merged parquet file.
# NOTE(review): the constant is named GOLD but the path sits under .../silver/ — confirm this is intended.
GOLD_PARQUET_PATH = '/usr/datalake/silver/igra/liftedindex_lr'

# You will need the pyarrow package to create a parquet file
#%conda install pyarrow

In [2]:
# Make sure the destination folder exists (exist_ok=True makes this safe to re-run)
os.makedirs(GOLD_PARQUET_PATH, exist_ok=True)


In [3]:
def df_from_concatinated_csv(path: str, pattern: str) -> pd.DataFrame:
    """Load every CSV file matching a glob pattern into one DataFrame.

    Args:
        path: Folder that contains the CSV files.
        pattern: Glob pattern (e.g. ``'*-data-li.csv'``) matched inside ``path``.

    Returns:
        The rows of all non-empty matching files, stacked in sorted
        file-name order, with a fresh 0..n-1 index.

    Raises:
        ValueError: If no file matches the pattern, or if every matching
            file has zero data rows.
    """
    # glob returns matches in arbitrary order; sort them so the row order of
    # the result is identical on every run and every machine.
    file_list = sorted(glob.glob(os.path.join(path, pattern)))
    if not file_list:
        raise ValueError(f"No files matching '{pattern}' found in '{path}'")

    # Read each CSV file, keeping only the frames that have at least one row.
    # Zero-row frames contribute nothing, so drop them before concatenating.
    frames = [frame for frame in map(pd.read_csv, file_list) if len(frame) > 0]
    if not frames:
        raise ValueError(
            f"All {len(file_list)} files matching '{pattern}' in '{path}' are empty"
        )

    # Concatenate all DataFrames, discarding the per-file row indexes
    return pd.concat(frames, ignore_index=True)

def df_merge(features: pd.DataFrame, labels: pd.DataFrame):
    """Inner-join the raw features with their labels.

    Rows are matched on the ``id``, ``effective_date`` and ``hour`` columns;
    rows present in only one of the two frames are dropped.
    """
    # Columns that must match in both frames for a row to be kept
    join_keys = ['id', 'effective_date', 'hour']
    return pd.merge(features, labels, how='inner', on=join_keys)

def orchestrate() -> pd.DataFrame:
    """Build the combined dataset from the raw CSV folders.

    Reads the label files (``*-data-li.csv`` under ``SILVER_LI_PATH``) and the
    feature files (``*-data-gph20s10k.csv`` under ``SILVER_GPH20S10K_PATH``),
    then joins them with ``df_merge``.
    """
    labels = df_from_concatinated_csv(SILVER_LI_PATH, '*-data-li.csv')
    features = df_from_concatinated_csv(
        SILVER_GPH20S10K_PATH, '*-data-gph20s10k.csv'
    )
    return df_merge(features=features, labels=labels)


In [4]:
# Get a combined DataFrame of the raw features and labels
df = orchestrate()

In [5]:
# Sanity checks: confirm the merge produced rows, then list columns and dtypes
print(f"Row count: {len(df)}")
print("df.info:")
df.info()

Row count: 359040
df.info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 359040 entries, 0 to 359039
Columns: 131 entries, id to li
dtypes: float64(128), int64(1), object(2)
memory usage: 358.8+ MB


In [6]:
# View summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,hour,day_num,0_gph,0_pres,0_temp,0_dp,0_u,0_v,1_gph,1_pres,...,19_dp,19_u,19_v,20_gph,20_pres,20_temp,20_dp,20_u,20_v,li
count,359040.0,359040.0,359040.0,359040.0,359040.0,359040.0,359040.0,359040.0,359040.0,359040.0,...,359040.0,359040.0,359040.0,359040.0,359040.0,359040.0,359040.0,359040.0,359040.0,359040.0
mean,6.447368,-0.004734,526.331258,954.856631,13.406533,5.721887,-0.218871,0.603693,798.211643,923.948908,...,-55.188882,20.509333,-0.634633,10000.0,274.690661,-44.301622,-58.234051,21.554595,-0.4911,8.435121
std,6.12793,0.712032,375.98678,42.319421,12.474994,11.359168,2.963668,3.957543,207.241836,23.118147,...,10.223609,16.019458,16.208312,0.0,12.131103,7.09934,9.636109,16.41848,16.525885,10.49025
min,0.0,-1.0,198.0,826.6,-42.0,-46.4,-55.2,-89.6,688.1,826.6,...,-88.8,-49.0,-91.1,10000.0,231.0,-68.5,-90.4,-46.7,-105.0,-27.4
25%,0.0,-0.72,229.0,920.3,4.3,-2.9,-2.0,-1.7,688.1,919.2,...,-62.0,9.3,-9.4,10000.0,266.8,-49.8,-64.6,10.0,-9.6,-0.2
50%,12.0,0.0,357.0,973.0,15.0,6.3,-0.0,0.5,688.1,933.2,...,-55.3,19.4,-0.5,10000.0,276.5,-44.3,-58.5,20.5,-0.5,7.0
75%,12.0,0.71,849.0,987.3,23.0,15.4,1.5,2.9,849.0,938.3,...,-48.0,30.8,8.5,10000.0,283.7,-38.7,-51.5,32.2,8.9,16.0
max,23.0,1.0,1475.0,1022.8,43.7,37.1,38.6,46.3,1475.0,964.6,...,-5.0,109.0,80.4,10000.0,520.2,-4.8,-6.0,107.8,128.6,57.3


In [7]:
# Write the combined features and labels to a single parquet file
# (requires a parquet engine such as pyarrow to be installed).
df.to_parquet(GOLD_PARQUET_PATH + '/gph20s10k_li.parquet')
