# Create the Features and Labels Files
After calculating the lifted index, we aggregate all the feature files and all the label files, inner-join them on `id`, `effective_date`, and `hour`, and write the combined features and labels to a single Parquet file.

In [8]:
import os
import glob
import pandas as pd

# Root of the lakehouse "Files" area. Set the IGRA2_LAKEHOUSE_FILES environment
# variable to run this notebook on another machine; when it is unset, the
# original local layout is used unchanged.
LAKEHOUSE_FILES_ROOT = os.environ.get(
    'IGRA2_LAKEHOUSE_FILES', '/Users/olievortex/lakehouse/default/Files'
).rstrip('/')

# Input folders: the gph20s10k feature CSVs and the lifted-index (li) label CSVs
SILVER_GPH20S10K_PATH = LAKEHOUSE_FILES_ROOT + '/silver/igra2/gph20s10k'
SILVER_LI_PATH = LAKEHOUSE_FILES_ROOT + '/silver/igra2/li'
# Output folder for the merged parquet file
GOLD_PARQUET_PATH = LAKEHOUSE_FILES_ROOT + '/gold/igra2/liftedindex_lr'

# You will need the pyarrow package to create a parquet file
#%conda install pyarrow

In [9]:
# Make sure the destination folder exists; exist_ok=True keeps re-runs from failing
os.makedirs(GOLD_PARQUET_PATH, exist_ok=True)


In [10]:
def df_from_concatinated_csv(path: str, pattern: str) -> pd.DataFrame:
    """Load every CSV file in a folder that matches a glob pattern into one DataFrame.

    Args:
        path: Folder that contains the CSV files.
        pattern: Glob pattern (e.g. ``'*-data-li.csv'``) selecting files in ``path``.

    Returns:
        The rows of all matching files stacked in file-name order, with a fresh
        0..n-1 index. If every matching file has a header but no data rows, the
        (empty) DataFrame of the first file is returned so the columns survive.

    Raises:
        ValueError: If no file in ``path`` matches ``pattern``.
    """

    # glob returns files in arbitrary, filesystem-dependent order; sorting makes
    # the row order of the result reproducible from run to run.
    file_list = sorted(glob.glob(os.path.join(path, pattern)))
    if not file_list:
        raise ValueError(f"No files matching '{pattern}' found in '{path}'")

    # Read each CSV file into its own DataFrame
    frames = [pd.read_csv(file) for file in file_list]

    # Pandas doesn't like to concatenate "empty" DataFrames, so leave them out
    non_empty = [frame for frame in frames if frame.shape[0] > 0]
    if not non_empty:
        # Only header-only files were found: hand back the zero-row schema
        # instead of letting pd.concat fail with "No objects to concatenate".
        return frames[0]

    # Concatenate all DataFrames
    return pd.concat(non_empty, ignore_index=True)

def df_merge(features: pd.DataFrame, labels: pd.DataFrame):
    """Join the feature rows to their label rows.

    Only rows whose id, effective_date, and hour values are present in both
    frames are kept (inner join).
    """

    # Columns both frames are matched on
    join_keys = ['id', 'effective_date', 'hour']

    merged = pd.merge(features, labels, how='inner', on=join_keys)
    return merged

def orchestrate() -> pd.DataFrame:
    """Build the combined dataset from the silver CSV folders.

    Reads the lifted-index label files and the gph20s10k feature files, then
    returns the result of merging them with df_merge.
    """
    # Labels are read first, then features
    labels = df_from_concatinated_csv(SILVER_LI_PATH, pattern='*-data-li.csv')
    features = df_from_concatinated_csv(
        SILVER_GPH20S10K_PATH, pattern='*-data-gph20s10k.csv'
    )
    return df_merge(features=features, labels=labels)


In [11]:
# Get a combined DataFrame of the raw features and labels
df = orchestrate()

In [12]:
# Sanity checks: report the merged row count, then the column/dtype summary
print(f"Row count: {len(df)}", "df.info:", sep="\n")
df.info()

Row count: 336030
df.info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336030 entries, 0 to 336029
Columns: 131 entries, id to li
dtypes: float64(128), int64(1), object(2)
memory usage: 335.8+ MB


In [13]:
# View summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,hour,day_num,0_gph,0_pres,0_temp,0_dp,0_u,0_v,1_gph,1_pres,...,19_dp,19_u,19_v,20_gph,20_pres,20_temp,20_dp,20_u,20_v,li
count,336030.0,336030.0,336030.0,336030.0,336030.0,336030.0,336030.0,336030.0,336030.0,336030.0,...,336030.0,336030.0,336030.0,336030.0,336030.0,336030.0,336030.0,336030.0,336030.0,336030.0
mean,6.554748,-0.019505,511.762596,956.545331,13.04313,5.553043,-0.109772,0.430937,679.730111,937.312283,...,-55.3412,20.648738,-0.639877,10000.0,274.429911,-44.497937,-58.436341,21.710722,-0.494767,8.682874
std,6.10138,0.711816,397.491861,44.795354,12.567463,11.433532,2.771751,3.514093,277.989375,31.010859,...,10.201488,16.024596,16.309995,0.0,12.245019,7.105588,9.595212,16.433255,16.627336,10.513349
min,0.0,-1.0,14.0,824.2,-42.0,-46.4,-55.2,-87.0,513.3,824.2,...,-88.8,-49.0,-90.7,10000.0,231.0,-68.5,-90.4,-46.7,-105.0,-27.5
25%,0.0,-0.73,230.0,919.5,3.8,-3.1,-1.9,-1.7,513.3,919.5,...,-62.1,9.4,-9.5,10000.0,266.4,-50.0,-64.7,10.2,-9.7,-0.1
50%,12.0,-0.03,357.0,972.8,14.4,6.0,-0.0,0.3,513.3,952.5,...,-55.4,19.6,-0.5,10000.0,276.2,-44.6,-58.7,20.7,-0.4,7.4
75%,12.0,0.69,849.0,988.3,22.8,15.2,1.5,2.6,849.0,957.8,...,-48.2,31.0,8.6,10000.0,283.5,-38.9,-51.8,32.4,9.0,16.3
max,23.0,1.0,1475.0,1042.7,43.7,37.1,38.6,46.3,1475.0,986.8,...,-5.0,108.7,80.5,10000.0,520.2,-4.8,-6.0,107.8,128.6,57.3


In [14]:
# Write the merged features and labels to a single parquet file in the gold folder.
# This needs a parquet engine such as pyarrow to be installed.
df.to_parquet(GOLD_PARQUET_PATH + '/gph20s10k_li.parquet')
