In [1]:
import pandas as pd
import pickle
import torch

IGRA_PATH = '/usr/datalake/silver/igra/gph20s10k'
ARTIFACTS_PATH = '/usr/datalake/silver/stormevents/artifacts/igra_storm_event_autoencoder'
STATION_ID = 'USM00072649'

In [2]:
def load_data(station_id: str) -> pd.DataFrame:
    # Load the dataset
    filename = f'{IGRA_PATH}/{station_id}-data-gph20s10k.csv'
    result = pd.read_csv(filename)

    # Remove irrelevant data
    result = result[result['hour'] == 12]
    result = result.drop(['0_gph',
                '1_gph', '2_gph', '3_gph', '4_gph', '5_gph',
                '6_gph', '7_gph', '8_gph', '9_gph', '10_gph',
                '11_gph', '12_gph', '13_gph', '14_gph', '15_gph',
                '16_gph', '17_gph', '18_gph', '19_gph', '20_gph'
                ], axis=1)
    
    # Grab 100 random samples
    result = result.sample(100)
    result = result.reset_index(drop=True)

    return result

def load_scaler(station_id: str):
    with open(f'{ARTIFACTS_PATH}/{station_id}_scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)

    return scaler

def scale_data(scaler, sample: pd.DataFrame):
    result = scaler.transform(sample)

    return result

def unscale_data(scaler, values):
    return scaler.inverse_transform(values).round(1)

class AutoEncoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(105, 80),
            torch.nn.Tanh(),
            torch.nn.Linear(80, 60),
            torch.nn.Tanh(),
            torch.nn.Linear(60, 40),
            torch.nn.Tanh(),
            torch.nn.Linear(40, 20)
        )

        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(20, 40),
            torch.nn.Tanh(),
            torch.nn.Linear(40, 60),
            torch.nn.Tanh(),
            torch.nn.Linear(60, 80),
            torch.nn.Tanh(),
            torch.nn.Linear(80, 105)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)

        return decoded

In [3]:
model = AutoEncoder()
model.load_state_dict(torch.load(f'{ARTIFACTS_PATH}/{STATION_ID}_fnn.pt'))
model.eval()

AutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=105, out_features=80, bias=True)
    (1): Tanh()
    (2): Linear(in_features=80, out_features=60, bias=True)
    (3): Tanh()
    (4): Linear(in_features=60, out_features=40, bias=True)
    (5): Tanh()
    (6): Linear(in_features=40, out_features=20, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=20, out_features=40, bias=True)
    (1): Tanh()
    (2): Linear(in_features=40, out_features=60, bias=True)
    (3): Tanh()
    (4): Linear(in_features=60, out_features=80, bias=True)
    (5): Tanh()
    (6): Linear(in_features=80, out_features=105, bias=True)
  )
)

In [4]:
# Load the dataset
df_original = load_data(STATION_ID)
ss = load_scaler(STATION_ID)

# Prepare the sample
x = df_original.drop(['id', 'effective_date', 'hour', 'day_num'], axis=1)
x = scale_data(ss, x)

# Convert to tensor
x = torch.from_numpy(x).float()
x.shape

torch.Size([100, 105])

In [5]:
# Encode the values
logits = model.encoder(x).clone().detach()

# Decode the values
decoded = model.decoder(logits).clone().detach()

# Undo the initial scaling transform
decoded = unscale_data(ss, decoded)

decoded.shape



(100, 105)

In [6]:
# Add the index back to the dataframe
output = pd.DataFrame(decoded, columns=df_original.columns[4:])
output.insert(0, 'id', df_original['id'] )
output.insert(1, 'effective_date', df_original['effective_date'] )
output.insert(2, 'hour', df_original['hour'] )
output.insert(3, 'day_num', df_original['day_num'] )
output.head()

Unnamed: 0,id,effective_date,hour,day_num,0_pres,0_temp,0_dp,0_u,0_v,1_pres,...,19_pres,19_temp,19_dp,19_u,19_v,20_pres,20_temp,20_dp,20_u,20_v
0,USM00072649,2023-09-17,12,0.17,984.8,10.8,8.7,-0.1,-0.1,939.4,...,294.5,-43.9,-52.4,1.5,-33.3,273.7,-47.6,-56.5,2.4,-33.9
1,USM00072649,2007-06-27,12,1.0,985.8,15.5,13.0,2.1,-0.9,941.1,...,304.0,-35.4,-57.0,24.0,4.8,283.4,-38.8,-59.1,25.3,4.9
2,USM00072649,2002-03-25,12,-0.1,996.2,-15.9,-19.9,-3.5,-2.0,946.0,...,277.6,-57.2,-62.6,14.6,0.6,256.7,-60.0,-65.7,16.0,0.8
3,USM00072649,2013-09-11,12,0.28,982.5,17.9,15.4,0.3,0.2,938.5,...,302.2,-35.8,-53.5,31.1,-1.4,281.7,-39.3,-55.6,32.6,-1.1
4,USM00072649,2018-05-15,12,0.71,982.9,10.6,7.8,-1.1,-2.6,937.6,...,292.8,-44.6,-55.0,31.0,-3.7,272.2,-48.1,-57.5,33.1,-3.2


In [7]:
# Interleave the results into the source for visual comparison
interleave = pd.concat([df_original, output])
interleave = interleave.sort_index()
interleave.head(10)

Unnamed: 0,id,effective_date,hour,day_num,0_pres,0_temp,0_dp,0_u,0_v,1_pres,...,19_pres,19_temp,19_dp,19_u,19_v,20_pres,20_temp,20_dp,20_u,20_v
0,USM00072649,2023-09-17,12,0.17,984.9,11.2,10.6,0.0,-0.0,939.4,...,294.2,-44.0,-53.9,2.8,-32.5,274.0,-47.3,-56.2,3.6,-34.5
0,USM00072649,2023-09-17,12,0.17,984.8,10.8,8.7,-0.1,-0.1,939.4,...,294.5,-43.9,-52.4,1.5,-33.3,273.7,-47.6,-56.5,2.4,-33.9
1,USM00072649,2007-06-27,12,1.0,985.8,15.5,13.0,2.1,-0.9,941.1,...,304.0,-35.4,-57.0,24.0,4.8,283.4,-38.8,-59.1,25.3,4.9
1,USM00072649,2007-06-27,12,1.0,985.2,18.0,11.6,1.9,-1.0,942.4,...,303.6,-37.1,-57.5,24.2,6.1,283.1,-40.3,-60.0,25.6,4.7
2,USM00072649,2002-03-25,12,-0.1,996.9,-12.6,-17.4,-3.6,-2.1,946.0,...,277.0,-57.0,-62.8,13.5,0.1,255.9,-59.8,-65.7,13.5,2.2
2,USM00072649,2002-03-25,12,-0.1,996.2,-15.9,-19.9,-3.5,-2.0,946.0,...,277.6,-57.2,-62.6,14.6,0.6,256.7,-60.0,-65.7,16.0,0.8
3,USM00072649,2013-09-11,12,0.28,982.5,17.9,15.4,0.3,0.2,938.5,...,302.2,-35.8,-53.5,31.1,-1.4,281.7,-39.3,-55.6,32.6,-1.1
3,USM00072649,2013-09-11,12,0.28,982.5,17.1,16.0,-0.0,-0.0,938.4,...,302.3,-36.7,-58.7,33.4,2.2,281.4,-40.1,-45.7,39.6,5.8
4,USM00072649,2018-05-15,12,0.71,982.0,12.1,6.8,-0.9,-2.4,936.7,...,292.6,-45.6,-55.9,33.0,-2.9,271.8,-47.0,-60.2,30.0,0.4
4,USM00072649,2018-05-15,12,0.71,982.9,10.6,7.8,-1.1,-2.6,937.6,...,292.8,-44.6,-55.0,31.0,-3.7,272.2,-48.1,-57.5,33.1,-3.2


In [8]:
interleave.to_csv(f'{ARTIFACTS_PATH}/{STATION_ID}_test.csv')