In [1]:
import pandas as pd
from dotenv import load_dotenv
import os
from src.models import db, Station, StationName, StationLocation, Wind10Minutes, model_init
import numpy as np

# Load variables from a .env file into the process environment before anything
# below reads them (the path-configuration cell reads PROJECT_DIR via os.getenv).
load_dotenv()

# Project-specific initialisation from src.models; presumably connects to the
# database and/or creates the tables used below — TODO confirm in src.models.
model_init()

In [2]:
# Locations of the interim parquet files, resolved relative to the project root.
# PROJECT_DIR has to be present in the environment (the notebook calls
# load_dotenv() beforehand). os.getenv returns None when it is missing, and
# os.path.join(None, ...) then fails with an opaque TypeError, so check it
# explicitly and fail with an actionable message instead.
project_dir = os.getenv('PROJECT_DIR')
if not project_dir:
    raise RuntimeError(
        "PROJECT_DIR is not set; define it in the environment or in the .env file "
        "before running this notebook"
    )

PARQUET_FOLDER = os.path.join(project_dir, 'data', 'interim', 'parquet')

GEO_PARQUET_FILE_PATH = os.path.join(PARQUET_FOLDER, 'geo.parquet')
NAME_PARQUET_FILE_PATH = os.path.join(PARQUET_FOLDER, 'name.parquet')
WIND_PARQUET_FILE_PATH = os.path.join(PARQUET_FOLDER, 'wind.parquet')

In [3]:
# Station whose wind series is loaded. Kept as a named constant so the
# selection is visible and tunable instead of being buried in the filter tuple.
WIND_STATION_ID = 3

# Station geometry and station names are read in full.
geo_df = pd.read_parquet(GEO_PARQUET_FILE_PATH)
name_df = pd.read_parquet(NAME_PARQUET_FILE_PATH)

# The wind file is read with a pyarrow filter so that only rows of the selected
# station end up in the frame.
wind_df = pd.read_parquet(
    WIND_PARQUET_FILE_PATH,
    engine='pyarrow',
    filters=[('station_id', '=', WIND_STATION_ID)],
)

In [4]:
# Summarise the loaded wind frame: row count, column dtypes, non-null counts
# and memory footprint.
wind_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 908204 entries, 0 to 337802
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   station_id     908204 non-null  int64         
 1   timestamp      908204 non-null  datetime64[ns]
 2   quality        908204 non-null  int64         
 3   avg_speed      908204 non-null  float64       
 4   avg_direction  908204 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(2)
memory usage: 41.6 MB


In [5]:
# Sanity check: list the distinct station ids present, to confirm the parquet
# filter applied at load time left only the requested station.
wind_df['station_id'].unique()

array([3])

In [6]:
# Empty all four tables inside a single db.atomic() block so the wipe is
# all-or-nothing.
#
# Order matters: tables that reference Station are cleared before Station
# itself. The wind load renames station_id -> station, which suggests
# Wind10Minutes holds a foreign key to Station (NOTE(review): confirm in
# src.models). Deleting Station first would then either violate that constraint
# or cascade and make the reported wind count meaningless.
with db.atomic():
    deleted_wind = Wind10Minutes.delete().execute()
    deleted_names = StationName.delete().execute()
    deleted_locations = StationLocation.delete().execute()
    deleted_stations = Station.delete().execute()

# Report only after the block has completed, in the same order as before.
print(f"Deleted {deleted_names} station names")
print(f"Deleted {deleted_locations} station locations")
print(f"Deleted {deleted_stations} stations")
print(f"Deleted {deleted_wind} wind data")


Deleted 0 station names
Deleted 0 station locations
Deleted 0 stations
Deleted 0 wind data


In [7]:

# Rows sent per INSERT statement for the wind table. The wind frame holds
# several hundred thousand rows; one statement carrying all of them builds an
# enormous SQL string and can exceed driver/database parameter limits.
WIND_INSERT_BATCH_SIZE = 1_000

# Build plain-Python records first so that no conversion work happens while the
# transaction is open. NaN is mapped to None so missing values reach the
# database as NULL rather than as a float NaN.

# one Station row per distinct station id found in the geo data
stations_data = [{'id': sid} for sid in geo_df['station_id'].unique().tolist()]

# station names, one record per row of the name frame
station_names_data = name_df.replace({np.nan: None}).to_dict(orient="records")

# station locations; station_name is dropped because names are stored separately
station_locations_data = (
    geo_df
    .drop(columns=['station_name'])
    .replace({np.nan: None})
    .to_dict(orient="records")
)

# wind measurements; the column is renamed to match the model's `station` field
wind_data = (
    wind_df
    .replace({np.nan: None})
    .rename(columns={'station_id': 'station'})
    .to_dict(orient="records")
)

# Load everything in one transaction (as the delete cell does) so a failure
# part-way through does not leave some tables populated and others empty.
# Station goes first because the other tables refer to station ids.
with db.atomic():
    Station.insert_many(stations_data).execute()
    StationName.insert_many(station_names_data).execute()
    StationLocation.insert_many(station_locations_data).execute()

    # insert the wind rows in fixed-size slices instead of one giant statement
    for start in range(0, len(wind_data), WIND_INSERT_BATCH_SIZE):
        batch = wind_data[start:start + WIND_INSERT_BATCH_SIZE]
        Wind10Minutes.insert_many(batch).execute()
    

<peewee.ModelTupleCursorWrapper at 0x1176bf090>

In [8]:
# Report the current row count of every table that was just populated,
# one line per table, in load order.
table_labels = (
    (Station, "stations"),
    (StationName, "station names"),
    (StationLocation, "station locations"),
    (Wind10Minutes, "wind data"),
)
for table, label in table_labels:
    row_count = table.select().count()
    print(f"Inserted {row_count} {label}")

Inserted 310 stations
Inserted 450 station names
Inserted 1264 station locations
Inserted 908204 wind data


In [9]:
# Preview the wind frame; the notebook display truncates it to the leading and
# trailing rows. Note the rows are not in chronological order (2010 appears
# before 2000 in the rendered output).
wind_df

Unnamed: 0,station_id,timestamp,quality,avg_speed,avg_direction
0,3,2010-01-01 00:00:00,3,2.3,350.0
1,3,2010-01-01 00:10:00,3,2.2,350.0
2,3,2010-01-01 00:20:00,3,2.1,350.0
3,3,2010-01-01 00:30:00,3,2.3,350.0
4,3,2010-01-01 00:40:00,3,1.6,360.0
...,...,...,...,...,...
337798,3,2000-01-01 00:10:00,1,2.7,200.0
337799,3,2000-01-01 00:20:00,1,2.9,210.0
337800,3,2000-01-01 00:30:00,1,2.3,210.0
337801,3,2000-01-01 00:40:00,1,2.3,210.0
