# Read the dataset and print its first 5 rows

In [2]:
import pandas as pd

# Load the raw weekly insurance-stock CSV into a DataFrame
# (columns seen in the preview: time, ticker, close, volume).
df = pd.read_csv(filepath_or_buffer='../data/raw/insurance_stocks_weekly_10y.csv')

# Quick sanity check: show the first five rows.
print(df.head(n=5))

         time ticker  close  volume
0  2014-11-30    BIC   5.15  576920
1  2014-12-07    BIC   5.38  865480
2  2014-12-14    BIC   5.42  555740
3  2014-12-21    BIC   5.11  437290
4  2014-12-28    BIC   4.92  589110


In [3]:
# Reshape long -> wide: one row per `time`, one column per `ticker`,
# cell values taken from `close`. Missing (time, ticker) pairs become NaN.
df_pivot = df.pivot(
    values='close',
    index='time',
    columns='ticker',
)

print(df_pivot)

ticker        BIC    BMI    BVH    MIG    PGI
time                                         
2014-11-30   5.15   5.78  28.63    NaN   4.66
2014-12-07   5.38   5.68  29.33    NaN   4.66
2014-12-14   5.42   5.68  29.02    NaN   4.70
2014-12-21   5.11   5.49  24.01    NaN   4.70
2014-12-28   4.92   5.20  24.79    NaN   4.75
...           ...    ...    ...    ...    ...
2025-10-26  25.00  19.30  47.83  17.25  20.00
2025-11-02  24.30  19.65  49.84  18.00  19.95
2025-11-09  24.15  19.80  51.02  17.05  19.50
2025-11-16  23.95  19.25  55.30  17.25  19.90
2025-11-23  23.20  18.70  54.10  17.35  19.70

[574 rows x 5 columns]


In [7]:
import os
import pandas as pd

# Location of the raw weekly insurance-stock prices.
raw_path = "../data/raw/insurance_stocks_weekly_10y.csv"

# Read the file and parse the `time` strings into datetimes in one chain.
df_raw = pd.read_csv(raw_path).assign(
    time=lambda frame: pd.to_datetime(frame["time"])
)

# Build the time x ticker matrix of closing prices, then order rows by date.
df_wide = df_raw.pivot(index="time", columns="ticker", values="close")
df_wide = df_wide.sort_index()

# Inspect the MIG column: its earliest weeks are NaN in the displayed output.
df_wide["MIG"]

time
2014-11-30      NaN
2014-12-07      NaN
2014-12-14      NaN
2014-12-21      NaN
2014-12-28      NaN
              ...  
2025-10-26    17.25
2025-11-02    18.00
2025-11-09    17.05
2025-11-16    17.25
2025-11-23    17.35
Name: MIG, Length: 574, dtype: float64

In [None]:
import os
import pandas as pd

# === Split each ticker into its own cleaned CSV for training ===
#
# Pipeline: raw long-format CSV -> time x ticker matrix of closes -> gap filling
# -> one "<TICKER>_weekly_clean.csv" (columns: time, close) per ticker.


def _fill_gaps(prices):
    """Return ``prices`` (time-indexed Series or DataFrame) with NaNs filled.

    NaNs are first filled by linear interpolation (``method="linear"`` treats
    rows as equally spaced); ``ffill`` then ``bfill`` cover anything that
    interpolation leaves behind, e.g. NaNs before the first observed value.
    """
    return prices.interpolate(method="linear").ffill().bfill()


def _write_close_csv(close_prices, out_path):
    """Write a time-indexed price Series to ``out_path`` and return the frame.

    The Series is renamed to ``close`` and its index is turned into a regular
    column, so the CSV holds two columns: the index name (``time``) and ``close``.
    """
    out = close_prices.rename("close").reset_index()
    out.to_csv(out_path, index=False)
    return out


# 1. Load raw weekly insurance stocks data
raw_path = "../data/raw/insurance_stocks_weekly_10y.csv"
df_raw = pd.read_csv(raw_path)

# 2. Convert time to datetime and pivot to time × ticker matrix
df_raw["time"] = pd.to_datetime(df_raw["time"])
df_wide = (
    df_raw
    .pivot(index="time", columns="ticker", values="close")
    .sort_index()
)

# 3. Prepare output directory
output_dir = "../data/processed/per_ticker"
os.makedirs(output_dir, exist_ok=True)

# 4. Clean and save core tickers (BIC, BMI, BVH, PGI) over full history
core_tickers = ["BIC", "BMI", "BVH", "PGI"]
core_filled = _fill_gaps(df_wide[core_tickers])

for ticker in core_tickers:
    out_path = os.path.join(output_dir, f"{ticker}_weekly_clean.csv")
    out = _write_close_csv(core_filled[ticker], out_path)  # columns: time, close
    print(f"Saved {ticker} to: {out_path}")

# 5. Clean and save MIG separately: start from first valid date (active period only)
mig_series = df_wide["MIG"].copy()
first_valid_mig = mig_series.first_valid_index()  # None when the column is all-NaN
print(f"First valid MIG date (raw): {first_valid_mig}")

if first_valid_mig is not None:
    # Keep only the active MIG period: rows before the first valid close are
    # dropped rather than back-filled.
    mig_active = mig_series.loc[first_valid_mig:]

    # Fill gaps within the active period only
    mig_active_filled = _fill_gaps(mig_active)

    mig_path = os.path.join(output_dir, "MIG_weekly_clean.csv")
    mig_out = _write_close_csv(mig_active_filled, mig_path)  # columns: time, close
    print(f"Saved MIG to: {mig_path}")
else:
    print("No valid MIG data found in raw dataset.")
