# Part II: Feature Engineering

## Content:
1. Basic settings and load pre-processed data
2. Date time related features
3. Sales related features
4. Stores and items related features
5. Weather information
6. Create features
7. Save data with additional features

## 1. Basic settings and load pre-processed data

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

pd.options.display.max_columns = 20
pd.options.display.max_rows = 150

In [2]:
# Location of the pre-processed sales table; a Path object is used for
# consistency with the weather data path defined later in this notebook.
preprocessed_data_path = Path("../data/preprocessed-data/sales_data_preprocessed.csv")
# Parse the "date" column as datetimes so the `.dt` accessors used below work.
df_sales = pd.read_csv(preprocessed_data_path, parse_dates=["date"])

In [3]:
df_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365500 entries, 0 to 365499
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   date    365500 non-null  datetime64[ns]
 1   store   365500 non-null  int64         
 2   item    365500 non-null  int64         
 3   sales   365500 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 11.2 MB


In [4]:
df_sales.date.min(), df_sales.date.max()

(Timestamp('2016-01-01 00:00:00'), Timestamp('2017-12-31 00:00:00'))

## 2. Date time related features

In [5]:
df = df_sales.copy()

In [6]:
def create_dt_features(df):
    """
    Derive calendar features from the 'date' column and return them on a new frame.

    Parameters:
    - df (pd.DataFrame): Input DataFrame with a datetime-typed 'date' column.

    Returns:
    - pd.DataFrame: A new DataFrame holding the original columns followed by
      month, day_of_week, day_of_month, day_of_year, week_of_year, is_weekend
      and the six is_{month,quarter,year}_{start,end} flags. The input frame
      is not modified.
    """
    dt = df.date.dt
    # weekday: 0 = Monday ... 6 = Sunday
    weekday = dt.weekday

    calendar_parts = {
        "month": dt.month,
        "day_of_week": weekday,
        "day_of_month": dt.day,
        "day_of_year": dt.dayofyear,
        # ISO week number, cast from the nullable UInt32 returned by isocalendar()
        "week_of_year": dt.isocalendar().week.astype(np.int64),
        # Saturday (5) and Sunday (6) count as weekend; stored as 0/1
        "is_weekend": (weekday > 4).astype(np.int64),
    }

    # Boolean period-boundary accessors (is_month_start, ..., is_year_end) as 0/1 ints
    boundary_flags = {
        f"is_{period}_{edge}": getattr(dt, f"is_{period}_{edge}").astype(np.int64)
        for period in ("month", "quarter", "year")
        for edge in ("start", "end")
    }

    return df.assign(**calendar_parts, **boundary_flags)

In [7]:
create_dt_features(df).head()

Unnamed: 0,date,store,item,sales,month,day_of_week,day_of_month,day_of_year,week_of_year,is_weekend,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end
0,2016-01-01,1,1,21.0,1,4,1,1,53,0,1,0,1,0,1,0
1,2016-01-02,1,1,13.0,1,5,2,2,53,1,0,0,0,0,0,0
2,2016-01-03,1,1,12.0,1,6,3,3,53,1,0,0,0,0,0,0
3,2016-01-04,1,1,12.0,1,0,4,4,1,0,0,0,0,0,0,0
4,2016-01-05,1,1,12.0,1,1,5,5,1,0,0,0,0,0,0,0


## 3. Sales related features

In [8]:
# get the stores and items list
stores_lst = df["store"].unique().tolist()
items_lst = df["item"].unique().tolist()

### 3.1. Last n days sales

In [9]:
days = 1
keys = ["store", "item", "date"]
(
    df.sort_values(keys).assign(
        **{f"last_{days}_days_sales": lambda df: df["sales"].shift(days)}
    )
).iloc[:10]

Unnamed: 0,date,store,item,sales,last_1_days_sales
0,2016-01-01,1,1,21.0,
1,2016-01-02,1,1,13.0,21.0
2,2016-01-03,1,1,12.0,13.0
3,2016-01-04,1,1,12.0,12.0
4,2016-01-05,1,1,12.0,12.0
5,2016-01-06,1,1,16.0,12.0
6,2016-01-07,1,1,12.0,16.0
7,2016-01-08,1,1,12.0,12.0
8,2016-01-09,1,1,16.0,12.0
9,2016-01-10,1,1,26.0,16.0


In [10]:
def add_last_ndays_sales(df, lst_days=(1, 7, 14, 21, 28, 90, 180)):
    """
    Add lagged sales columns, computed separately for every (store, item) series.

    The frame is sorted by store, item and date, and for each entry ``n`` of
    ``lst_days`` a column ``last_{n}_days_sales`` is added holding the sales
    value found ``n`` rows earlier within the same (store, item) group.
    The lag is counted in rows, so it equals ``n`` calendar days only when
    each series has exactly one row per day (assumed here, not checked).

    Lags never cross a (store, item) boundary: the first ``n`` rows of every
    series are NaN, whether the function receives one series or the full table.

    Parameters:
    - df (pd.DataFrame): Frame with 'store', 'item', 'date' and 'sales' columns.
    - lst_days (iterable of int): Lags, in rows, to generate.

    Returns:
    - pd.DataFrame: New frame sorted by store, item, date with the lag columns
      appended; the input frame is not modified.
    """
    keys = ["store", "item", "date"]
    df = df.sort_values(keys)

    # Group after sorting so each shift walks back in time inside one series.
    sales_per_series = df.groupby(["store", "item"], sort=False)["sales"]

    return df.assign(
        **{f"last_{days}_days_sales": sales_per_series.shift(days) for days in lst_days}
    )

In [11]:
# add_last_ndays_sales(df).iloc[:50]

### 3.2. Recent mean, min, max, std sales in n days

In [12]:
days = 7
keys = ["store", "item", "date"]
(
    df.sort_values(keys).assign(
        **{
            f"recent_{days}_mean_sales": lambda df: df["sales"].rolling(days).mean(),
            f"recent_{days}_min_sales": lambda df: df["sales"].rolling(days).min(),
            f"recent_{days}_max_sales": lambda df: df["sales"].rolling(days).max(),
            f"recent_{days}_std_sales": lambda df: df["sales"].rolling(days).std(),
        }
    )
).iloc[:10]

Unnamed: 0,date,store,item,sales,recent_7_mean_sales,recent_7_min_sales,recent_7_max_sales,recent_7_std_sales
0,2016-01-01,1,1,21.0,,,,
1,2016-01-02,1,1,13.0,,,,
2,2016-01-03,1,1,12.0,,,,
3,2016-01-04,1,1,12.0,,,,
4,2016-01-05,1,1,12.0,,,,
5,2016-01-06,1,1,16.0,,,,
6,2016-01-07,1,1,12.0,14.0,12.0,21.0,3.41565
7,2016-01-08,1,1,12.0,12.714286,12.0,16.0,1.496026
8,2016-01-09,1,1,16.0,13.142857,12.0,16.0,1.9518
9,2016-01-10,1,1,26.0,15.142857,12.0,26.0,5.145502


In [13]:
def add_ndays_sales_stats(df, lst_days=(7, 14, 28, 60, 90)):
    """
    Add rolling mean/min/max/std of sales, computed per (store, item) series.

    The frame is sorted by store, item and date. For each window length ``n``
    in ``lst_days`` four columns are added, each computed over the last ``n``
    rows of the same (store, item) group:

    - ``recent_{n}_days_mean_sales``
    - ``recent_{n}_days_min_sales``
    - ``recent_{n}_day_max_sales``
    - ``recent_{n}_day_std_sales``

    Windows never span two series, so the first ``n - 1`` rows of every
    series are NaN.

    Parameters:
    - df (pd.DataFrame): Frame with 'store', 'item', 'date' and 'sales' columns.
    - lst_days (iterable of int): Rolling window lengths, in rows.

    Returns:
    - pd.DataFrame: New frame sorted by store, item, date with the statistic
      columns appended; the input frame is not modified.
    """
    keys = ["store", "item", "date"]
    df = df.sort_values(keys)
    sales_per_series = df.groupby(["store", "item"], sort=False)["sales"]

    # NOTE(review): each window includes the current row's own sales value.
    # If 'sales' is the prediction target this leaks it into the features;
    # consider shifting by one row first. Confirm against the modelling notebook.
    stat_columns = {}
    for days in lst_days:
        # The max/std names use "day" while mean/min use "days". Kept exactly
        # as originally emitted so existing consumers of these columns still work.
        for column, stat in (
            (f"recent_{days}_days_mean_sales", "mean"),
            (f"recent_{days}_days_min_sales", "min"),
            (f"recent_{days}_day_max_sales", "max"),
            (f"recent_{days}_day_std_sales", "std"),
        ):
            # Defaults bind the current loop values into the lambda.
            stat_columns[column] = sales_per_series.transform(
                lambda sales, window=days, stat=stat: getattr(sales.rolling(window), stat)()
            )

    return df.assign(**stat_columns)

In [14]:
# add_ndays_sales_stats(df).iloc[:50]

In [15]:
# Need to process by store & item

### 3.3. Exponentially weighted moving average sales

In [16]:
days = 3
alpha = 0.5
keys = ["store", "item", "date"]
(
    df.sort_values(keys).assign(
        **{
            f"sales_ewma_{days}_sales": lambda df: df["sales"]
            .shift(days)
            .ewm(alpha=alpha)
            .mean()
        }
    )
).iloc[:10]

Unnamed: 0,date,store,item,sales,sales_ewma_3_sales
0,2016-01-01,1,1,21.0,
1,2016-01-02,1,1,13.0,
2,2016-01-03,1,1,12.0,
3,2016-01-04,1,1,12.0,21.0
4,2016-01-05,1,1,12.0,15.666667
5,2016-01-06,1,1,16.0,13.571429
6,2016-01-07,1,1,12.0,12.733333
7,2016-01-08,1,1,12.0,12.354839
8,2016-01-09,1,1,16.0,14.206349
9,2016-01-10,1,1,26.0,13.094488


In [17]:
def add_ewma_sales(df, lst_days=(3, 7, 14, 28, 60, 90), alphas=(0.5, 0.75, 0.9)):
    """
    Add exponentially weighted moving averages of lagged sales per (store, item).

    The frame is sorted by store, item and date. For every combination of
    ``alpha`` in ``alphas`` and lag ``n`` in ``lst_days`` a column
    ``sales_ewma_{alpha}_alpha_{n}_days_sales`` is added: the sales of the same
    (store, item) series are shifted by ``n`` rows and then smoothed with
    ``ewm(alpha=alpha).mean()``.

    Both the shift and the smoothing stay inside one series, so values from
    one store/item never flow into another.

    Parameters:
    - df (pd.DataFrame): Frame with 'store', 'item', 'date' and 'sales' columns.
    - lst_days (iterable of int): Lags, in rows, applied before smoothing.
    - alphas (iterable of float): Smoothing factors passed to ``ewm``.

    Returns:
    - pd.DataFrame: New frame sorted by store, item, date with the EWMA
      columns appended; the input frame is not modified.
    """
    keys = ["store", "item", "date"]
    df = df.sort_values(keys)
    sales_per_series = df.groupby(["store", "item"], sort=False)["sales"]

    ewma_columns = {}
    for alpha in alphas:
        for days in lst_days:
            column = f"sales_ewma_{alpha}_alpha_{days}_days_sales"
            # Defaults bind the current loop values into the lambda.
            ewma_columns[column] = sales_per_series.transform(
                lambda sales, lag=days, weight=alpha: sales.shift(lag)
                .ewm(alpha=weight)
                .mean()
            )

    return df.assign(**ewma_columns)

In [18]:
# add_ewma_sales(df).iloc[:10]

## 4. Stores and items related features

### 4.1. Stores and items sales

In [19]:
date_start = pd.Timestamp("2016-01-01")

days = 7
date_end = date_start + pd.Timedelta(days, unit="D")
# print(end)

gb_keys = ["store"]

condition = "(date >= @date_start) & (date < @date_end)"
_df = df.query(condition)
_df_sum = _df.groupby(gb_keys).sales.sum()
_df_mean = _df.groupby(gb_keys).sales.mean()

df_result = pd.DataFrame(
    {
        "date": date_end,
        f"sum_store_sales_in_{days}days": _df_sum,
        f"mean_store_sales_in_{days}days": _df_mean,
    }
)

In [20]:
df_result

Unnamed: 0_level_0,date,sum_store_sales_in_7days,mean_store_sales_in_7days
store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2016-01-08,12409.138719,35.454682
2,2016-01-08,17536.569359,50.104484
3,2016-01-08,15371.854039,43.919583
4,2016-01-08,14344.28468,40.983671
5,2016-01-08,10577.996379,30.222847
6,2016-01-08,10299.427019,29.426934
7,2016-01-08,9463.711699,27.039176
8,2016-01-08,16648.854039,47.568154
9,2016-01-08,13997.569359,39.993055
10,2016-01-08,15396.427019,43.989791


In [21]:
def add_summary_sales_features(df, date_range, gb_keys=("store",), days=7):
    """
    Summarise sales per group over trailing windows of ``days`` days.

    For every ``date_start`` in ``date_range`` the rows with
    ``date_start <= date < date_start + days`` are grouped by ``gb_keys`` and
    their sales are summed and averaged. Each result row is stamped with the
    window's exclusive end date, so a row dated D describes the ``days`` days
    strictly before D.

    Parameters:
    - df (pd.DataFrame): Frame with 'date', 'sales' and the grouping columns.
    - date_range (iterable of pd.Timestamp): Window start dates.
    - gb_keys (str or sequence of str): Column(s) to group by. The first key
      is used in the output column names.
    - days (int): Window length in days.

    Returns:
    - pd.DataFrame: Indexed by the group key(s), with columns 'date',
      ``sum_{days}_days_{key}_sales`` and ``mean_{days}_days_{key}_sales``.
      Rows dated after the latest date in ``df`` are dropped. An empty
      ``date_range`` yields an empty frame with the same columns.
    """
    # A bare string would otherwise be indexed character by character below.
    group_keys = [gb_keys] if isinstance(gb_keys, str) else list(gb_keys)
    sum_col = f"sum_{days}_days_{group_keys[0]}_sales"
    mean_col = f"mean_{days}_days_{group_keys[0]}_sales"
    window_length = pd.Timedelta(days, unit="D")

    # Collect one small frame per window and concatenate once at the end;
    # growing a DataFrame with concat inside the loop is quadratic.
    window_frames = []
    for date_start in date_range:
        date_end = date_start + window_length
        in_window = (df["date"] >= date_start) & (df["date"] < date_end)
        window_sales = df.loc[in_window].groupby(group_keys)["sales"]
        window_frames.append(
            pd.DataFrame(
                {
                    "date": date_end,
                    sum_col: window_sales.sum(),
                    mean_col: window_sales.mean(),
                }
            )
        )

    if not window_frames:
        # Keep the index name(s) so a later reset_index() still yields the key columns.
        if len(group_keys) > 1:
            empty_index = pd.MultiIndex.from_arrays(
                [[] for _ in group_keys], names=group_keys
            )
        else:
            empty_index = pd.Index([], name=group_keys[0])
        return pd.DataFrame(columns=["date", sum_col, mean_col], index=empty_index)

    df_result = pd.concat(window_frames)

    # limit the range of result dataframe to dates that exist in the input
    upper_limit = df["date"].max()
    return df_result[df_result["date"] <= upper_limit]

In [22]:
date_range = pd.date_range("2016-01-01", "2016-01-01")
days = 7
display(
    add_summary_sales_features(df, date_range=date_range, gb_keys=["item"], days=days)
)

Unnamed: 0_level_0,date,sum_7_days_item_sales,mean_7_days_item_sales
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2016-01-08,1300.427019,18.577529
2,2016-01-08,3024.28468,43.204067
3,2016-01-08,1939.0,27.7
4,2016-01-08,1117.0,15.957143
5,2016-01-08,1040.28468,14.86121
6,2016-01-08,3006.0,42.942857
7,2016-01-08,2942.14234,42.030605
8,2016-01-08,3968.14234,56.687748
9,2016-01-08,2541.0,36.3
10,2016-01-08,3874.427019,55.348957


In [23]:
# date_range = pd.date_range("2016-01-01", "2016-01-01")
# lst_days = [7, 14, 28]
# for days in lst_days:
#     display(add_summary_sales_features(df, date_range=date_range, gb_keys=["store"], days=days))

### 4.2. Stores and items encoding

In [24]:
def add_ohe_features(df, columns_to_encode):
    """
    Return a one-hot encoded version of ``df``.

    Parameters:
    - df (pd.DataFrame): Frame containing the columns to encode.
    - columns_to_encode (list): Names of the columns to replace with dummy columns.

    Returns:
    - pd.DataFrame: New frame in which each listed column is replaced by one
      ``{column}_{value}`` indicator column per distinct value. The input
      frame is left untouched.
    """
    # Encode a copy so the caller's frame is never the object handed to pandas.
    return pd.get_dummies(df.copy(), columns=columns_to_encode)

In [25]:
add_ohe_features(df, ["store", "item"])

Unnamed: 0,date,sales,store_1,store_2,store_3,store_4,store_5,store_6,store_7,store_8,...,item_41,item_42,item_43,item_44,item_45,item_46,item_47,item_48,item_49,item_50
0,2016-01-01,21.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2016-01-02,13.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2016-01-03,12.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2016-01-04,12.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2016-01-05,12.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365495,2017-12-27,63.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
365496,2017-12-28,59.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
365497,2017-12-29,74.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
365498,2017-12-30,62.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## 5. Weather information

In [26]:
weather_data_path = Path("../data/preprocessed-data/tokyo_weather.csv")
df_tokyo_weather = pd.read_csv(weather_data_path, parse_dates=["date"])

In [27]:
df_tokyo_weather.shape

(731, 5)

In [28]:
# Average only the numeric weather measurements per year and season. Selecting
# the columns explicitly keeps the result independent of how the installed
# pandas version treats the datetime 'date' column in groupby().mean().
df_tokyo_weather.groupby(["year", "season"])[["temperature", "humidity"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,temperature,humidity
year,season,Unnamed: 2_level_1,Unnamed: 3_level_1
2016,fall,15.182122,61.1078
2016,spring,14.699303,49.456428
2016,summer,24.510443,79.468291
2016,winter,6.408534,31.659593
2017,fall,15.682703,60.184843
2017,spring,14.546191,50.451363
2017,summer,25.355923,80.619643
2017,winter,5.363609,28.39497


## 6. Create features

### 6.1. Add sales features (last n days sales and rolling statistics)

In [29]:
# Build the lag and rolling-statistic features one (store, item) series at a
# time, then stack the per-series frames into a single table.
grouped = df.groupby(["store", "item"])
df_lst = [
    # EWMA features are intentionally left out here (add_ewma_sales is not applied).
    add_ndays_sales_stats(add_last_ndays_sales(grouped.get_group((store, item))))
    for store in stores_lst
    for item in items_lst
]

df_recent_sales = pd.concat(df_lst, axis=0, ignore_index=True)

In [30]:
# df_recent_sales

In [31]:
def add_sales_features(df):
    """
    Add the sales related features to every (store, item) series of ``df``.

    Each (store, item) group is passed through ``add_last_ndays_sales`` and
    ``add_ndays_sales_stats`` and the per-series results are concatenated.

    Stores and items are taken from ``df`` itself, in order of first
    appearance, so the function does not depend on notebook-level lists.
    (store, item) pairs with no rows in ``df`` are skipped.

    Parameters:
    - df (pd.DataFrame): Frame with 'store', 'item', 'date' and 'sales' columns.

    Returns:
    - pd.DataFrame: All series stacked store by store, item by item, with a
      fresh 0..n-1 index and the added feature columns.
    """
    grouped = df.groupby(["store", "item"])
    # Mapping keyed by (store, item); used to skip combinations with no rows.
    available_pairs = grouped.groups

    stores = df["store"].unique().tolist()
    items = df["item"].unique().tolist()

    df_lst = []
    for store in stores:
        for item in items:
            if (store, item) not in available_pairs:
                continue
            df_store = grouped.get_group((store, item))
            df_features = add_last_ndays_sales(df_store)
            df_features = add_ndays_sales_stats(df_features)
            # EWMA features are currently disabled:
            # df_features = add_ewma_sales(df_features)
            df_lst.append(df_features)

    df_recent_sales = pd.concat(df_lst, axis=0, ignore_index=True)
    return df_recent_sales

### 6.2. Add stores and items features

In [32]:
# # Add the stores and items sales
# date_range = pd.date_range(df.date.min(), df.date.max(), freq="D")
# days_lst = [7, 14, 30]
# for days in days_lst:
#     df_sales_by_store = add_summary_sales_features(
#         df, date_range=date_range, gb_keys=["store"], days=days
#     )
#     df_sales_by_item = add_summary_sales_features(
#         df, date_range=date_range, gb_keys=["item"], days=days
#     )
#     globals()[f"df_{days}_sales_by_store"] = df_sales_by_store.reset_index()
#     globals()[f"df_{days}_sales_by_item"] = df_sales_by_item.reset_index()

In [33]:
# df_30_sales_by_item

In [34]:
# Demo: add the store-level and item-level summary features to the sales table.
# Only three window start dates are used here to keep the cell fast; the full
# range would be pd.date_range(df.date.min(), df.date.max(), freq="D").
date_range = pd.date_range("2016-01-01", "2016-01-03", freq="D")
days_lst = [7, 14, 30]

# Start from the raw sales frame; every window length appends four columns.
df_joined = df

for days in days_lst:
    # Calculate sales features for stores and items
    df_sales_by_store = add_summary_sales_features(
        df, date_range=date_range, gb_keys=["store"], days=days
    )
    df_sales_by_item = add_summary_sales_features(
        df, date_range=date_range, gb_keys=["item"], days=days
    )

    # Turn the group key index into a regular column before merging
    df_sales_by_store = df_sales_by_store.reset_index()
    df_sales_by_item = df_sales_by_item.reset_index()

    # Pair every store row with every item row of the same date, then attach
    # the combined features to the matching (store, item, date) sales rows.
    df2join = pd.merge(df_sales_by_store, df_sales_by_item, on=["date"], how="left")
    df_joined = df_joined.merge(df2join, on=["store", "item", "date"], how="left")

In [35]:
condition = "(store==2)&(item==2)&(date=='2016-01-31')"
df_joined.query(condition)

Unnamed: 0,date,store,item,sales,sum_7_days_store_sales,mean_7_days_store_sales,sum_7_days_item_sales,mean_7_days_item_sales,sum_14_days_store_sales,mean_14_days_store_sales,sum_14_days_item_sales,mean_14_days_item_sales,sum_30_days_store_sales,mean_30_days_store_sales,sum_30_days_item_sales,mean_30_days_item_sales
4056,2016-01-31,2,2,76.0,,,,,,,,,74954.846796,49.969898,13227.281058,44.090937


In [36]:
def merge_sales_features(df, days_lst):
    """
    Merge store-level and item-level summary sales features into ``df``.

    For every window length in ``days_lst`` the summaries produced by
    ``add_summary_sales_features`` (computed over each day between the first
    and last date of ``df``) are left-joined onto ``df``: store summaries on
    (store, date) and item summaries on (item, date). Rows without a matching
    summary receive NaN in the new columns.

    Parameters:
    - df (pd.DataFrame): Frame with 'store', 'item', 'date' and 'sales' columns.
    - days_lst (list): Window lengths, in days, for which to add features.

    Returns:
    - pd.DataFrame: ``df`` with, per window length, the store sum/mean columns
      followed by the item sum/mean columns. With an empty ``days_lst`` a copy
      of ``df`` is returned unchanged.
    """
    days_lst = list(days_lst)
    if not days_lst:
        # Nothing to add; return the data rather than an empty frame.
        return df.copy()

    # One window start per calendar day covered by the data
    date_range = pd.date_range(df.date.min(), df.date.max(), freq="D")

    df_joined = df
    for days in days_lst:
        # Calculate sales features for stores and items
        df_sales_by_store = add_summary_sales_features(
            df, date_range=date_range, gb_keys=["store"], days=days
        ).reset_index()
        df_sales_by_item = add_summary_sales_features(
            df, date_range=date_range, gb_keys=["item"], days=days
        ).reset_index()

        # Join each summary on its own key. This avoids building the
        # store x item cross product per date, and a missing item summary no
        # longer discards the store summary for the same row (or vice versa).
        df_joined = df_joined.merge(
            df_sales_by_store, on=["store", "date"], how="left"
        ).merge(df_sales_by_item, on=["item", "date"], how="left")

    return df_joined

In [37]:
# Add stores and items related features
df_preprocessed = merge_sales_features(df_recent_sales, days_lst=[7, 14, 30])

In [38]:
# Add One Hot Encoding for stores and items
# df_preprocessed = add_ohe_features(df_preprocessed, columns_to_encode=["store", "item"])

### 6.3. Add weather features

In [39]:
df_tokyo_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         731 non-null    datetime64[ns]
 1   temperature  731 non-null    float64       
 2   humidity     731 non-null    float64       
 3   season       731 non-null    object        
 4   year         731 non-null    int64         
dtypes: datetime64[ns](1), float64(2), int64(1), object(1)
memory usage: 28.7+ KB


In [40]:
# Add weather data
df_preprocessed.merge(
    df_tokyo_weather[["date", "temperature", "humidity"]],
    on=["date"],
    how="left",
).head()

Unnamed: 0,date,store,item,sales,last_1_days_sales,last_7_days_sales,last_14_days_sales,last_21_days_sales,last_28_days_sales,last_90_days_sales,...,sum_14_days_store_sales,mean_14_days_store_sales,sum_14_days_item_sales,mean_14_days_item_sales,sum_30_days_store_sales,mean_30_days_store_sales,sum_30_days_item_sales,mean_30_days_item_sales,temperature,humidity
0,2016-01-01,1,1,21.0,,,,,,,...,,,,,,,,,6.863908,29.685357
1,2016-01-02,1,1,13.0,21.0,,,,,,...,,,,,,,,,8.019488,34.310967
2,2016-01-03,1,1,12.0,13.0,,,,,,...,,,,,,,,,5.766577,43.686565
3,2016-01-04,1,1,12.0,12.0,,,,,,...,,,,,,,,,8.563105,34.792572
4,2016-01-05,1,1,12.0,12.0,,,,,,...,,,,,,,,,0.44381,45.623986


In [41]:
def add_weather_features(df, df_weather):
    """
    Attach daily temperature and humidity to ``df``.

    Parameters:
    - df (pd.DataFrame): Frame with a 'date' column.
    - df_weather (pd.DataFrame): Weather table with 'date', 'temperature'
      and 'humidity' columns; any other columns are ignored.

    Returns:
    - pd.DataFrame: ``df`` left-joined with the weather readings on 'date'.
      Dates missing from ``df_weather`` get NaN readings.
    """
    weather_columns = ["date", "temperature", "humidity"]
    weather_readings = df_weather[weather_columns]
    return pd.merge(df, weather_readings, how="left", on=["date"])

In [42]:
# df_preprocessed.info()

### 6.4. Main processing (main function)

In [43]:
def create_features(df, df_weather, days_lst=(7, 14, 30)):
    """
    Enhance a DataFrame with additional features for analytical modeling.

    Parameters:
    - df (pd.DataFrame): The original DataFrame containing the base data.
    - df_weather (pd.DataFrame): DataFrame containing weather information to be merged.
    - days_lst (iterable of int): Window lengths, in days, for the store and
      item summary features. Defaults to 7, 14 and 30 days.

    Returns:
    - pd.DataFrame: The DataFrame with added features for analysis and modeling.

    Steps:
    1. Add sales-related features to the DataFrame using the 'add_sales_features' function.
    2. Merge store and item summary features for each window in 'days_lst'
       using 'merge_sales_features'.
    3. Add weather-related features to the DataFrame based on the provided 'df_weather'.

    One Hot Encoding of 'store' and 'item' is NOT applied here; call
    'add_ohe_features' on the result if encoded columns are needed.

    Example:
    ```python
    # Assuming df and df_weather are the DataFrames to be used
    enhanced_df = create_features(df, df_weather)
    ```

    Note:
    - Ensure that the required functions ('add_sales_features', 'merge_sales_features'
      and 'add_weather_features') are defined and available in the environment.

    """
    # Add sales features
    df = add_sales_features(df)

    # Add stores and items related features
    df = merge_sales_features(df, days_lst=list(days_lst))

    # One Hot Encoding for stores and items is currently disabled:
    # df = add_ohe_features(df, columns_to_encode=["store", "item"])

    # Add weather information
    df = add_weather_features(df, df_weather)

    return df

In [44]:
df_preprocessed = create_features(df, df_tokyo_weather)

In [45]:
df_preprocessed.shape[1]

45

In [46]:
df_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 365500 entries, 0 to 365499
Data columns (total 45 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   date                       365500 non-null  datetime64[ns]
 1   store                      365500 non-null  int64         
 2   item                       365500 non-null  int64         
 3   sales                      365500 non-null  float64       
 4   last_1_days_sales          365000 non-null  float64       
 5   last_7_days_sales          362000 non-null  float64       
 6   last_14_days_sales         358500 non-null  float64       
 7   last_21_days_sales         355000 non-null  float64       
 8   last_28_days_sales         351500 non-null  float64       
 9   last_90_days_sales         320500 non-null  float64       
 10  last_180_days_sales        275500 non-null  float64       
 11  recent_7_days_mean_sales   362500 non-null  float64 

## Save data with additional features

In [47]:
def save_data(df, file_path, file_format="feather"):
    """
    Save a DataFrame to a specified file format.

    Parameters:
    - df (pd.DataFrame): The DataFrame to be saved.
    - file_path (str or Path): The path where the file will be saved.
    - file_format (str): The format in which to save the file, matched
                        case-insensitively. Supported formats: 'feather', 'csv'.
                        Default is 'feather'.

    Raises:
    - ValueError: If 'file_format' is not one of the supported formats.
      Nothing is written in that case.

    Example:
    ```python
    # Assuming df is the DataFrame you want to save
    save_data(df, 'output_data.feather', file_format='feather')
    ```

    Note:
    - CSV output is written without the index.
    - Feather output relies on pandas' 'to_feather', which needs a Feather
      backend installed (presumably pyarrow, depending on the pandas version).
    """
    fmt = file_format.lower()
    if fmt == "feather":
        # Save to Feather format
        df.to_feather(file_path)
        print(f"DataFrame saved to {file_path} in Feather format.")
    elif fmt == "csv":
        # Save to CSV format
        df.to_csv(file_path, index=False)
        print(f"DataFrame saved to {file_path} in CSV format.")
    else:
        # Fail loudly: printing and returning would let the caller believe
        # the data had been saved.
        raise ValueError(
            f"Unsupported file format '{file_format}'. Supported formats: 'feather', 'csv'."
        )

In [48]:
num_features = df_preprocessed.shape[1]
save_path = Path(f"../data/preprocessed-data/data_additional_{num_features}_features.feather")

In [49]:
save_data(df_preprocessed, save_path, file_format='feather')

DataFrame saved to ../data/preprocessed-data/data_additional_45_features.feather in Feather format.
