In [141]:
import pandas as pd
import ast
from sklearn.model_selection import train_test_split
import os

In [142]:
def read_raw_data(path):
    """Load the raw CSV into a DataFrame.

    Args:
        path: Anything ``pandas.read_csv`` accepts as its first argument
            (the caller in this notebook passes a file path string).

    Returns:
        The DataFrame produced by ``pandas.read_csv`` with default options.
    """
    return pd.read_csv(path)

In [143]:
def preoprocess_data(raw_df):
    """Reduce raw observation rows to one maximum temperature per station.

    The function name keeps its original spelling so existing callers
    continue to work.

    Processing steps:
      1. Select the required columns and rename them to English names.
      2. Combine the degree and minute columns into decimal-degree
         ``latitude`` / ``longitude`` (degrees + minutes / 60).
      3. Drop rows whose ``temp`` is missing.
      4. Parse each remaining ``temp`` string with ``ast.literal_eval`` and
         keep its first element.
      5. Group by (latitude, longitude, station_name) and take the maximum
         ``temp`` of each group.

    Args:
        raw_df: DataFrame containing at least the columns 'ID', '緯度(度)',
            '緯度(分)', '経度(度)', '経度(分)', 'temp' and '観測所名'.
            Non-missing 'temp' entries must be strings holding a Python
            literal sequence (e.g. ``"[30.5, 0]"``); only element 0 is used.
            It is not modified.

    Returns:
        A new DataFrame with the columns 'latitude', 'longitude',
        'station_name' and 'temp', one row per group, sorted by the group
        keys (the default ``groupby`` ordering).

    Raises:
        KeyError: If any required column is missing from ``raw_df``.
        ValueError, SyntaxError: If a 'temp' string is not a valid literal.
    """
    # Source column name -> English column name.
    column_map = {
        'ID': 'ID',
        '緯度(度)': 'latitude_D',
        '緯度(分)': 'latitude_m',
        '経度(度)': 'longitude_D',
        '経度(分)': 'longitude_m',
        'temp': 'temp',
        '観測所名': 'station_name',
    }

    # Selecting with a list and renaming both return new frames, so the
    # caller's raw_df is left untouched.
    df = raw_df[list(column_map)].rename(columns=column_map)

    # Degrees + minutes / 60 -> decimal degrees.
    df = df.assign(
        latitude=df["latitude_D"] + df["latitude_m"] / 60,
        longitude=df["longitude_D"] + df["longitude_m"] / 60,
    )

    # Drop rows without a temperature, then parse the remaining strings.
    # `assign` builds a new frame instead of writing into the filtered slice,
    # which avoids pandas' SettingWithCopyWarning.
    observed = df[df["temp"].notna()]
    observed = observed.assign(
        temp=observed["temp"].apply(lambda x: ast.literal_eval(x)[0])
    )

    # One row per (latitude, longitude, station_name) with its maximum temp.
    summarized_df = (
        observed.groupby(['latitude', 'longitude', 'station_name'])
        .agg({'temp': 'max'})
        .reset_index()
    )
    return summarized_df

In [144]:
def split_data(df, test_size=0.2, random_state=42):
    """Split ``df`` into a train part and a test part.

    Delegates to scikit-learn's ``train_test_split``.

    Args:
        df: The data to split.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        A ``(train_df, test_df)`` tuple.
    """
    parts = train_test_split(df, test_size=test_size, random_state=random_state)
    train_part, test_part = parts
    return train_part, test_part

In [145]:
def build_dataset(
    path="C://dropout_MC//dropout_MC//_01_data//data//amedas_data_20250713.csv",
    build_dir=None,
    test_size=0.3,
    random_state=42,
):
    """Run the full pipeline: read, preprocess, split, and save as CSV.

    Calling ``build_dataset()`` with no arguments behaves as before, except
    that the output directory is created when it does not exist yet.

    Args:
        path: Location of the raw CSV passed to ``read_raw_data``. Defaults
            to the path that was previously hard-coded.
        build_dir: Directory under which the ``data`` output folder lives.
            When ``None`` (default), the parent of the current working
            directory is used.
        test_size: Forwarded to ``split_data``.
        random_state: Forwarded to ``split_data``.

    Returns:
        None. Three files are written into ``<build_dir>/data``:
        ``processed_data.csv`` (all preprocessed rows), ``train_data.csv``
        and ``test_data.csv``, all without the index column.
    """
    raw_df = read_raw_data(path)

    # preprocess
    summarized_df = preoprocess_data(raw_df)

    # split dataset
    train_df, test_df = split_data(
        summarized_df, test_size=test_size, random_state=random_state
    )

    # resolve the output directory
    if build_dir is None:
        build_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    data_dir = os.path.join(build_dir, "data")
    # to_csv does not create missing directories, so make sure it exists.
    os.makedirs(data_dir, exist_ok=True)

    # save
    outputs = {
        "processed_data.csv": summarized_df,
        "train_data.csv": train_df,
        "test_data.csv": test_df,
    }
    for file_name, frame in outputs.items():
        frame.to_csv(os.path.join(data_dir, file_name), index=False)

In [146]:
# Execute the pipeline: read the raw CSV, preprocess it, split it into
# train/test parts, and write the three CSV files to disk.
build_dataset()