In [1]:
import pandas as pd
import pickle
import torch

IGRA_PATH = '/usr/datalake/silver/igra/gph20s10k'
ARTIFACTS_PATH = '/usr/datalake/silver/stormevents/artifacts/igra_storm_event_autoencoder'
STATION_ID = 'USM00072649'

In [2]:
def load_data(station_id: str) -> pd.DataFrame:
    # Load the dataset
    filename = f'{IGRA_PATH}/{station_id}-data-gph20s10k.csv'
    result = pd.read_csv(filename)

    # Remove irrelevant data
    result = result[result['hour'] == 12]
    result = result.drop(['0_gph',
                '1_gph', '2_gph', '3_gph', '4_gph', '5_gph',
                '6_gph', '7_gph', '8_gph', '9_gph', '10_gph',
                '11_gph', '12_gph', '13_gph', '14_gph', '15_gph',
                '16_gph', '17_gph', '18_gph', '19_gph', '20_gph'
                ], axis=1)
    
    # Grab 100 random samples
    result = result.sample(100)
    result = result.reset_index(drop=True)

    return result

def load_scaler(station_id: str):
    with open(f'{ARTIFACTS_PATH}/{station_id}_scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)

    return scaler

def scale_data(scaler, sample: pd.DataFrame):
    result = scaler.transform(sample)

    return result

def unscale_data(scaler, values):
    return scaler.inverse_transform(values).round(1)

class AutoEncoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(105, 80),
            torch.nn.Tanh(),
            torch.nn.Linear(80, 60),
            torch.nn.Tanh(),
            torch.nn.Linear(60, 40),
            torch.nn.Tanh(),
            torch.nn.Linear(40, 20)
        )

        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(20, 40),
            torch.nn.Tanh(),
            torch.nn.Linear(40, 60),
            torch.nn.Tanh(),
            torch.nn.Linear(60, 80),
            torch.nn.Tanh(),
            torch.nn.Linear(80, 105)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)

        return decoded

In [3]:
model = AutoEncoder()
model.load_state_dict(torch.load(f'{ARTIFACTS_PATH}/{STATION_ID}_autoencoder.pt'))
model.eval()

AutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=105, out_features=80, bias=True)
    (1): Tanh()
    (2): Linear(in_features=80, out_features=60, bias=True)
    (3): Tanh()
    (4): Linear(in_features=60, out_features=40, bias=True)
    (5): Tanh()
    (6): Linear(in_features=40, out_features=20, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=20, out_features=40, bias=True)
    (1): Tanh()
    (2): Linear(in_features=40, out_features=60, bias=True)
    (3): Tanh()
    (4): Linear(in_features=60, out_features=80, bias=True)
    (5): Tanh()
    (6): Linear(in_features=80, out_features=105, bias=True)
  )
)

In [4]:
# Load the dataset
df_original = load_data(STATION_ID)
ss = load_scaler(STATION_ID)

# Prepare the sample
x = df_original.drop(['id', 'effective_date', 'hour', 'day_num'], axis=1)
x = scale_data(ss, x)

# Convert to tensor
x = torch.from_numpy(x).float()
x.shape

torch.Size([100, 105])

In [5]:
# Encode the values
logits = model.encoder(x).clone().detach()

# Decode the values
decoded = model.decoder(logits).clone().detach()

# Undo the initial scaling transform
decoded = unscale_data(ss, decoded)

decoded.shape



(100, 105)

In [6]:
# Add the index back to the dataframe
output = pd.DataFrame(decoded, columns=df_original.columns[4:])
output.insert(0, 'id', df_original['id'] )
output.insert(1, 'effective_date', df_original['effective_date'] )
output.insert(2, 'hour', df_original['hour'] )
output.insert(3, 'day_num', df_original['day_num'] )
output.head()

Unnamed: 0,id,effective_date,hour,day_num,0_pres,0_temp,0_dp,0_u,0_v,1_pres,...,19_pres,19_temp,19_dp,19_u,19_v,20_pres,20_temp,20_dp,20_u,20_v
0,USM00072649,2007-11-17,12,-0.78,978.1,0.1,-3.1,-0.5,-1.3,947.9,...,280.2,-45.2,-74.6,25.4,-4.7,260.0,-46.2,-74.9,26.5,-4.5
1,USM00072649,2023-05-02,12,0.53,976.1,0.2,-4.2,2.4,-2.5,945.9,...,283.5,-47.7,-59.5,16.7,-4.2,263.2,-50.5,-62.3,17.3,-1.9
2,USM00072649,2011-03-27,12,-0.07,989.9,-6.8,-10.1,0.1,0.1,958.5,...,272.1,-51.2,-59.0,21.5,-7.4,252.0,-51.1,-60.6,21.1,-6.9
3,USM00072649,2016-07-26,12,0.88,982.8,19.9,17.6,0.3,0.1,954.7,...,304.6,-36.0,-55.0,7.4,-1.3,283.4,-39.4,-56.6,8.3,-2.3
4,USM00072649,2020-11-06,12,-0.66,980.4,11.2,6.5,0.4,1.2,951.4,...,300.0,-39.8,-44.0,20.2,-10.7,279.0,-43.9,-47.5,20.4,-11.6


In [7]:
# Interleave the results into the source for visual comparison
interleave = pd.concat([df_original, output])
interleave = interleave.sort_index()
interleave.head(10)

Unnamed: 0,id,effective_date,hour,day_num,0_pres,0_temp,0_dp,0_u,0_v,1_pres,...,19_pres,19_temp,19_dp,19_u,19_v,20_pres,20_temp,20_dp,20_u,20_v
0,USM00072649,2007-11-17,12,-0.78,978.4,0.2,-3.7,-0.3,-0.9,947.7,...,280.7,-46.6,-73.2,24.4,-5.6,260.1,-47.8,-73.0,24.8,-8.2
0,USM00072649,2007-11-17,12,-0.78,978.1,0.1,-3.1,-0.5,-1.3,947.9,...,280.2,-45.2,-74.6,25.4,-4.7,260.0,-46.2,-74.9,26.5,-4.5
1,USM00072649,2023-05-02,12,0.53,976.1,0.2,-4.2,2.4,-2.5,945.9,...,283.5,-47.7,-59.5,16.7,-4.2,263.2,-50.5,-62.3,17.3,-1.9
1,USM00072649,2023-05-02,12,0.53,975.6,5.0,-4.8,2.0,-3.0,945.6,...,282.6,-48.4,-59.1,17.0,-6.4,261.9,-51.6,-63.7,11.6,-9.4
2,USM00072649,2011-03-27,12,-0.07,990.2,-8.8,-11.6,-0.0,-0.0,958.5,...,272.4,-53.0,-60.5,22.7,-4.2,252.3,-49.2,-57.8,24.6,-3.1
2,USM00072649,2011-03-27,12,-0.07,989.9,-6.8,-10.1,0.1,0.1,958.5,...,272.1,-51.2,-59.0,21.5,-7.4,252.0,-51.1,-60.6,21.1,-6.9
3,USM00072649,2016-07-26,12,0.88,982.8,19.9,17.6,0.3,0.1,954.7,...,304.6,-36.0,-55.0,7.4,-1.3,283.4,-39.4,-56.6,8.3,-2.3
3,USM00072649,2016-07-26,12,0.88,983.0,19.9,19.1,-0.0,-0.0,954.7,...,305.1,-36.5,-56.7,7.8,-1.5,283.9,-40.4,-59.6,9.4,-2.4
4,USM00072649,2020-11-06,12,-0.66,980.2,11.2,2.2,0.2,1.0,951.2,...,300.0,-38.8,-42.9,21.3,-7.3,279.0,-42.6,-47.3,21.2,-12.0
4,USM00072649,2020-11-06,12,-0.66,980.4,11.2,6.5,0.4,1.2,951.4,...,300.0,-39.8,-44.0,20.2,-10.7,279.0,-43.9,-47.5,20.4,-11.6


In [8]:
interleave.to_csv(f'{ARTIFACTS_PATH}/{STATION_ID}_test.csv')