# Dateningestierung

Die Daten werden nun aus den Parquet-Dateien geladen und anschließend in die Datenbank überführt.

In [13]:
import pandas as pd
from dotenv import load_dotenv
import os
from src.models import db, Station, StationName, StationLocation, Wind10Minutes, model_init
import numpy as np

# Load settings from a .env file into the environment before anything reads
# them; PROJECT_DIR is fetched via os.getenv further down in this notebook.
load_dotenv()

# Initialise the data model. Per the original author's note, this deletes all
# previously stored data, so only run this cell when a full reload is intended.
model_init()

In [14]:
# Root directory of the project, taken from the environment (for example
# from a .env file).
_project_dir = os.getenv('PROJECT_DIR')
if _project_dir is None:
    # Without this guard os.path.join(None, ...) fails with an opaque
    # "expected str, bytes or os.PathLike object, not NoneType" TypeError.
    raise RuntimeError(
        "Environment variable PROJECT_DIR is not set; "
        "export it or add it to the .env file before running this notebook."
    )

# Folder holding the intermediate parquet files.
PARQUET_FOLDER = os.path.join(_project_dir, 'data', 'interim', 'parquet')

# One parquet file per data set that gets loaded into the database.
GEO_PARQUET_FILE_PATH = os.path.join(PARQUET_FOLDER, 'geo.parquet')
NAME_PARQUET_FILE_PATH = os.path.join(PARQUET_FOLDER, 'name.parquet')
WIND_PARQUET_FILE_PATH = os.path.join(PARQUET_FOLDER, 'wind.parquet')



In [15]:
# Station metadata (locations and names), read with pandas' default engine.
geo_df, name_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (GEO_PARQUET_FILE_PATH, NAME_PARQUET_FILE_PATH)
)

# Wind measurements, read explicitly through the pyarrow engine.
wind_df = pd.read_parquet(WIND_PARQUET_FILE_PATH, engine='pyarrow')

In [16]:
# Empty all four tables in one transaction (db.atomic()), so either every
# table is cleared or none is.
with db.atomic():
    # Delete rows that point at a station before the stations themselves.
    # Wind10Minutes is populated with a `station` column derived from
    # station_id elsewhere in this notebook, so it presumably references
    # Station too and must be cleared first - TODO confirm against src.models.
    deleted_station_names = StationName.delete().execute()
    deleted_station_locations = StationLocation.delete().execute()
    deleted_wind_rows = Wind10Minutes.delete().execute()
    deleted_stations = Station.delete().execute()

# Report once the transaction has completed, in the same order as before so
# the cell output stays comparable with earlier runs.
print(f"Deleted {deleted_station_names} station names")
print(f"Deleted {deleted_station_locations} station locations")
print(f"Deleted {deleted_stations} stations")
print(f"Deleted {deleted_wind_rows} wind data")


Deleted 0 station names
Deleted 0 station locations
Deleted 0 stations
Deleted 0 wind data


In [None]:

# Number of rows sent per INSERT statement. Databases cap the number of bound
# parameters per statement (SQLite in particular), so large tables such as
# the wind measurements are written in slices instead of one giant statement.
INSERT_BATCH_SIZE = 100


def _insert_in_batches(model, rows, batch_size=INSERT_BATCH_SIZE):
    """Insert ``rows`` into ``model`` using one statement per slice.

    Args:
        model: Model class exposing ``insert_many(...).execute()``.
        rows: List of dicts, one per row to insert.
        batch_size: Maximum number of rows per INSERT statement.

    An empty ``rows`` list issues no statement at all.
    """
    for start in range(0, len(rows), batch_size):
        model.insert_many(rows[start:start + batch_size]).execute()


# Build every record list up front so that a conversion error surfaces
# before anything has been written to the database.

# stations: one row per distinct station id found in the geo data
stations_data = [{'id': sid} for sid in geo_df['station_id'].unique().tolist()]

# station names: NaN is mapped to None before the rows are handed to the ORM
station_names_data = name_df.replace({np.nan: None}).to_dict(orient="records")

# station locations: same as above, minus the two columns dropped here
station_locations_data = (
    geo_df
    .drop(columns=['station_name', 'state'])
    .replace({np.nan: None})
    .to_dict(orient="records")
)

# wind data: the model is addressed via `station`, hence the rename
wind_data = (
    wind_df
    .replace({np.nan: None})
    .rename(columns={'station_id': 'station'})
    .to_dict(orient="records")
)

# Write everything in a single transaction: stations first, then the tables
# that refer to them. A failure rolls back all four inserts instead of
# leaving a half-filled database.
with db.atomic():
    _insert_in_batches(Station, stations_data)
    _insert_in_batches(StationName, station_names_data)
    _insert_in_batches(StationLocation, station_locations_data)
    _insert_in_batches(Wind10Minutes, wind_data)

In [None]:
# Fetch the current row totals of all four tables first ...
station_total = Station.select().count()
station_name_total = StationName.select().count()
station_location_total = StationLocation.select().count()
wind_total = Wind10Minutes.select().count()

# ... then report them, one line per table.
print(f"Inserted {station_total} stations")
print(f"Inserted {station_name_total} station names")
print(f"Inserted {station_location_total} station locations")
print(f"Inserted {wind_total} wind data")