# Create the Features and Labels Files
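After calculating the lifted index, we aggregate all the features and labels together. The features and labels are inner-joined on `id`, `effective_date`, and `hour`, and the combined table is written to a single Parquet file (`gph20s10k_li.parquet`).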

In [1]:
import os
import glob
import pandas as pd

# Input folder with the feature CSV files (globbed as '*-data-gph20s10k.csv')
SILVER_GPH20S10K_PATH = '/lakehouse/default/Files/silver/igra2/gph20s10k'
# Input folder with the lifted-index label CSV files (globbed as '*-data-li.csv')
SILVER_LI_PATH = '/lakehouse/default/Files/silver/igra2/li'
# Output folder for the merged features + labels parquet file
GOLD_PARQUET_PATH = '/lakehouse/default/Files/gold/igra2/liftedindex_lr'

# You will need the pyarrow package to create a parquet file
#%conda install pyarrow

In [2]:
# Make sure the destination folder exists
# (exist_ok=True keeps this cell safe to re-run when the folder is already there)
os.makedirs(GOLD_PARQUET_PATH, exist_ok=True)


In [3]:
def df_from_concatinated_csv(path: str, pattern: str) -> pd.DataFrame:
    """Load every CSV file matching a glob pattern into one DataFrame.

    Parameters
    ----------
    path : str
        Folder to search.
    pattern : str
        Glob pattern for the file names inside ``path``
        (e.g. ``'*-data-li.csv'``).

    Returns
    -------
    pd.DataFrame
        The rows of all non-empty files, stacked in sorted file-name order,
        with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no file matches the pattern, or if every matching file contains
        no data rows.
    """

    # glob returns matches in a filesystem-dependent order; sort them so the
    # row order of the result is reproducible between runs and machines.
    file_list = sorted(glob.glob(os.path.join(path, pattern)))
    if not file_list:
        raise ValueError(f"No files matching '{pattern}' found in '{path}'")

    # Read each CSV lazily (generator), keeping only frames that have rows:
    # pandas doesn't like to concatenate "empty" DataFrames.
    frames = (pd.read_csv(file) for file in file_list)
    df_list = [frame for frame in frames if frame.shape[0] > 0]
    if not df_list:
        raise ValueError(
            f"All {len(file_list)} file(s) matching '{pattern}' in '{path}' "
            "contain no data rows"
        )

    # Concatenate all DataFrames
    return pd.concat(df_list, ignore_index=True)

def df_merge(features: pd.DataFrame, labels: pd.DataFrame):
    """Inner-join the raw features with the raw labels.

    Rows are matched on the ``id``, ``effective_date`` and ``hour`` columns;
    only rows whose key appears in both frames are returned.
    """

    # Columns that identify a row in both the features and the labels
    join_keys = ['id', 'effective_date', 'hour']

    merged = pd.merge(features, labels, how='inner', on=join_keys)
    return merged

def orchestrate() -> pd.DataFrame:
    """Read the label and feature CSVs and return their inner join."""
    # Labels first, then features (same read order as before)
    label_frame = df_from_concatinated_csv(
        path=SILVER_LI_PATH,
        pattern='*-data-li.csv',
    )
    feature_frame = df_from_concatinated_csv(
        path=SILVER_GPH20S10K_PATH,
        pattern='*-data-gph20s10k.csv',
    )
    return df_merge(features=feature_frame, labels=label_frame)


In [4]:
# Get a combined DataFrame of the raw features and labels
# (inner join, so only rows present in both inputs are kept)
df = orchestrate()

In [5]:
# Sanity checks: report the row count, then the column / dtype / memory summary
print(f"Row count: {len(df)}", "df.info:", sep="\n")
df.info()

Row count: 331801
df.info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 331801 entries, 0 to 331800
Columns: 131 entries, id to li
dtypes: float64(128), int64(1), object(2)
memory usage: 331.6+ MB


In [6]:
# View summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,hour,day_num,0_gph,0_pres,0_temp,0_dp,0_u,0_v,1_gph,1_pres,...,19_dp,19_u,19_v,20_gph,20_pres,20_temp,20_dp,20_u,20_v,li
count,331801.0,331801.0,331801.0,331801.0,331801.0,331801.0,331801.0,331801.0,331801.0,331801.0,...,331801.0,331801.0,331801.0,331801.0,331801.0,331801.0,331801.0,331801.0,331801.0,331801.0
mean,6.561436,-0.011635,529.589516,954.507033,13.095635,5.543753,-0.210102,0.59661,799.923079,923.753975,...,-55.35328,20.607472,-0.651793,10000.0,274.551824,-44.42274,-58.393462,21.644289,-0.512284,8.621142
std,6.102106,0.711634,377.485995,42.480249,12.470388,11.377085,2.965843,3.940514,208.630093,23.253499,...,10.204038,16.000494,16.293743,0.0,12.225762,7.088883,9.606863,16.397635,16.603024,10.516321
min,0.0,-1.0,198.0,824.2,-42.0,-46.4,-55.2,-87.0,688.1,824.2,...,-88.8,-49.0,-91.1,10000.0,231.0,-68.5,-90.4,-46.7,-105.0,-27.5
25%,0.0,-0.73,230.0,919.6,3.9,-3.0,-2.0,-1.7,688.1,918.7,...,-62.1,9.4,-9.5,10000.0,266.6,-49.9,-64.7,10.1,-9.7,-0.1
50%,12.0,-0.02,357.0,972.8,14.6,6.1,0.0,0.5,688.1,933.1,...,-55.4,19.5,-0.5,10000.0,276.4,-44.5,-58.6,20.6,-0.4,7.3
75%,12.0,0.69,849.0,987.2,22.8,15.2,1.5,2.9,849.0,938.2,...,-48.2,30.9,8.5,10000.0,283.5,-38.8,-51.7,32.3,8.9,16.2
max,23.0,1.0,1475.0,1021.1,43.7,37.1,38.6,46.3,1475.0,964.5,...,-5.0,109.0,80.4,10000.0,520.2,-4.8,-6.0,107.8,128.6,57.3


In [7]:
# Write the merged features and labels to a single parquet file in the gold folder.
# NOTE: DataFrame.to_parquet needs a parquet engine (e.g. pyarrow) to be installed.
df.to_parquet(GOLD_PARQUET_PATH + '/gph20s10k_li.parquet')
