In [1]:
import sys
from pathlib import Path
import math

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer

In [278]:
# Set max number of columns to display; default 20
pd.options.display.max_columns = 80

In [279]:
# Create absolute path object to data directory
# Note: cwd
cwd_path = Path.cwd()
data_path = cwd_path.parent.joinpath('data')
data_push_path = cwd_path.parent.joinpath('data_to_push')

In [280]:
# Read in
df = pd.read_pickle(data_push_path / 'df_turbines_knn_wrangled.pkl')

In [281]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30645 entries, 0 to 30644
Data columns (total 25 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   EinheitMastrNummer                     30645 non-null  object        
 1   DatumLetzteAktualisierung              30645 non-null  datetime64[ns]
 2   Bundesland                             30645 non-null  object        
 3   Postleitzahl                           30645 non-null  int64         
 4   Ort                                    30645 non-null  object        
 5   Laengengrad                            30645 non-null  float64       
 6   Breitengrad                            30645 non-null  float64       
 7   Registrierungsdatum                    30645 non-null  datetime64[ns]
 8   Inbetriebnahmedatum                    30645 non-null  datetime64[ns]
 9   EinheitBetriebsstatus                  30645 non-null  object

### Create datetimeindex

In [282]:
end_date = df['Inbetriebnahmedatum'].max().date()

# This year only
datetime_index = pd.date_range(start='2023-01-01', end=end_date, freq='D', name='Date')
datetime_index

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',
               '2023-01-09', '2023-01-10',
               ...
               '2023-08-02', '2023-08-03', '2023-08-04', '2023-08-05',
               '2023-08-06', '2023-08-07', '2023-08-08', '2023-08-09',
               '2023-08-10', '2023-08-11'],
              dtype='datetime64[ns]', name='Date', length=223, freq='D')

In [283]:
# Number of turbines that went into operation grouped by day
df.groupby(by='Inbetriebnahmedatum')['Inbetriebnahmedatum'].value_counts().sort_values(ascending=False).head(30)

Inbetriebnahmedatum
2017-09-29    65
2001-12-21    65
2001-12-20    58
2017-05-31    55
2002-12-19    53
2000-01-01    50
2001-12-19    47
2004-12-30    47
2002-12-20    46
2001-12-28    45
2017-06-30    45
2013-12-20    45
2005-12-29    43
2005-12-30    41
2004-12-28    41
2003-12-23    41
2015-12-30    40
2002-12-18    40
2001-12-27    39
2013-12-19    38
2017-04-28    38
2006-12-28    37
2006-12-29    37
2003-12-18    37
2003-12-19    37
2017-09-28    37
2012-12-21    37
2004-07-29    35
2001-03-01    35
2003-12-30    34
Name: count, dtype: int64

In [284]:
# Select all datetime columns! Very handy 
df.select_dtypes(include=['datetime64']).notna().sum()

DatumLetzteAktualisierung                30645
Registrierungsdatum                      30645
Inbetriebnahmedatum                      30645
DatumEndgueltigeStilllegung               1158
DatumBeginnVoruebergehendeStilllegung       68
DatumWiederaufnahmeBetrieb                   6
dtype: int64

### Derive new column checking if turbine is operational on each datetimeindex day
- Use datetimeindex as input
- Logic (input: datetimeindex timestamp):
    - First check: is input date before Inbetriebnahmedatum?
        - not operational
    - Second check: 

## Idea
- Datetimeindex to frame
- Apply custom function to datetimeindex frame
- Feed date into function as well as df of turbine data
- Or just feed subset of turbine data in after running conditional check 

In [287]:
# for row in df.itertuples():
#     print(row.EinheitMastrNummer)

In [263]:
df[['Inbetriebnahmedatum', 'DatumEndgueltigeStilllegung', 'DatumBeginnVoruebergehendeStilllegung', 'DatumWiederaufnahmeBetrieb']]

Unnamed: 0,Inbetriebnahmedatum,DatumEndgueltigeStilllegung,DatumBeginnVoruebergehendeStilllegung,DatumWiederaufnahmeBetrieb
0,2017-09-01,NaT,NaT,NaT
1,2017-09-28,NaT,NaT,NaT
2,2017-09-04,NaT,NaT,NaT
3,2017-08-31,NaT,NaT,NaT
4,2017-01-11,NaT,NaT,NaT
...,...,...,...,...
30640,2023-06-24,NaT,NaT,NaT
30641,2023-07-28,NaT,NaT,NaT
30642,2023-07-20,NaT,NaT,NaT
30643,1996-01-11,NaT,NaT,NaT


In [None]:
def operational_check(turbine_dates, date):
    """
    date is datetimeindex date -> isn't it an iterable? not a singular date
    turbine_dates is dataframe consisting of turbine date columns
    
    Handle cases according to how frequent/likely they are
    Idea: does it make sense to flip case 1 to check if date is AFTER operational date?
    """
    # if date is before first operational date, return 0 for not operational
    if date < turbine_dates['Inbetriebnahmedatum']:
        return 0
    # second case: already decommisioned
    elif date >= turbine_dates['DatumEndgueltigeStilllegung']:
        return 0
    # third case: if date is between DatumBeginnVoruebergehendeStilllegung and DatumWiederaufnahmeBetrieb -> 0
    # turbine is still temporarily 'out of order'
    elif turbine_dates['DatumBeginnVoruebergehendeStilllegung'] <= date <= turbine_dates['DatumWiederaufnahmeBetrieb']:
        return 0
    # fourth case:
    




### Potential custom function to calculate total net nominal power from operational turbines per day
- Feed in datetimeindex and turbines df
- Loop through each day in datetimeindex
- Can I groupby based on conditional to determine if turbine operational?

In [None]:
def custom_function(dt_idx, df_turbines):
    
    # loop through each date in datetimeindex
    for date in dt_idx:
        

In [262]:
# for date in datetime_index:
#     print(type(date))

In [259]:
datetime_index.to_frame().apply(lambda row: row[0], axis=1)

Date
2023-01-01   2023-01-01
2023-01-02   2023-01-02
2023-01-03   2023-01-03
2023-01-04   2023-01-04
2023-01-05   2023-01-05
                ...    
2023-08-07   2023-08-07
2023-08-08   2023-08-08
2023-08-09   2023-08-09
2023-08-10   2023-08-10
2023-08-11   2023-08-11
Freq: D, Length: 223, dtype: datetime64[ns]

In [245]:
df[df['DatumWiederaufnahmeBetrieb'].notna()].select_dtypes(include=['datetime64'])

Unnamed: 0,DatumLetzteAktualisierung,Registrierungsdatum,Inbetriebnahmedatum,DatumEndgueltigeStilllegung,DatumBeginnVoruebergehendeStilllegung,DatumWiederaufnahmeBetrieb
4451,2020-10-27 20:33:47.288074900,2019-09-15,2014-10-22,NaT,2019-04-03,2019-09-13
8191,2022-12-01 07:52:10.180000000,2019-08-08,1997-11-26,2021-07-01,NaT,2021-07-01
9854,2022-03-14 12:07:32.491932600,2021-01-18,1998-08-14,NaT,2021-01-01,2021-01-02
18008,2021-03-25 12:17:48.534331900,2020-06-19,2020-06-17,NaT,2020-06-15,2020-06-17
30467,2023-03-06 10:32:58.569853000,2022-02-08,2017-03-09,NaT,2017-03-28,2022-12-21
30546,2022-12-28 08:47:21.904918500,2022-08-22,1991-09-10,NaT,2022-03-07,2022-12-01


In [250]:
# only one instance of Inbetriebnahmedatum after DatumBeginnVoruebergehendeStilllegung
df[df['Inbetriebnahmedatum'] > df['DatumBeginnVoruebergehendeStilllegung']]

Unnamed: 0,EinheitMastrNummer,DatumLetzteAktualisierung,Bundesland,Postleitzahl,Ort,Laengengrad,Breitengrad,Registrierungsdatum,Inbetriebnahmedatum,EinheitBetriebsstatus,DatumEndgueltigeStilllegung,DatumBeginnVoruebergehendeStilllegung,DatumWiederaufnahmeBetrieb,NameStromerzeugungseinheit,Nettonennleistung,AnschlussAnHoechstOderHochSpannung,Einspeisungsart,NameWindpark,Hersteller,Technologie,Typenbezeichnung,Nabenhoehe,Rotordurchmesser,Rotorblattenteisungssystem,area_blades
18008,SEE905478543632,2021-03-25 12:17:48.534331900,Niedersachsen,27243,Winkelsett,8.5013,52.862604,2020-06-19,2020-06-17,In Betrieb,NaT,2020-06-15,2020-06-17,Windpark Winkelsett 1-1,3000.0,1.0,Volleinspeisung,Windpark Winkelsett 1,ENERCON GmbH,Horizontalläufer,E-115,149.0,115.0,0.0,10386


In [247]:
df[df['DatumBeginnVoruebergehendeStilllegung'].notna()].select_dtypes(include=['datetime64'])

Unnamed: 0,DatumLetzteAktualisierung,Registrierungsdatum,Inbetriebnahmedatum,DatumEndgueltigeStilllegung,DatumBeginnVoruebergehendeStilllegung,DatumWiederaufnahmeBetrieb
2160,2022-12-15 07:55:11.526666700,2019-03-07,2000-02-04,NaT,2019-09-18,NaT
2603,2023-04-12 14:09:50.116785000,2019-03-19,2013-12-18,NaT,2023-03-15,NaT
3037,2023-05-08 09:57:50.760571900,2019-03-26,2001-07-31,NaT,2022-11-16,NaT
4451,2020-10-27 20:33:47.288074900,2019-09-15,2014-10-22,NaT,2019-04-03,2019-09-13
7975,2022-05-31 14:03:06.570449500,2019-08-02,2000-09-30,NaT,2022-05-24,NaT
...,...,...,...,...,...,...
30546,2022-12-28 08:47:21.904918500,2022-08-22,1991-09-10,NaT,2022-03-07,2022-12-01
30562,2023-05-31 19:42:04.614054100,2022-12-15,2023-02-04,NaT,2023-05-19,NaT
30564,2023-03-16 07:00:12.982323600,2022-10-26,1992-08-07,NaT,2021-12-31,NaT
30573,2023-03-08 09:45:31.647308700,2022-11-06,1991-06-24,NaT,2023-01-01,NaT


In [8]:
df_2023 = df[df['Inbetriebnahmedatum'] >= '2023-01-01'].sort_values(by='Inbetriebnahmedatum', ascending=True)
df_2023['DatumLetzteAktualisierung'] = df_2023['DatumLetzteAktualisierung'].apply(lambda datetime: datetime.floor('D'))
df_2023.head(3)

Unnamed: 0,EinheitMastrNummer,DatumLetzteAktualisierung,Bundesland,Postleitzahl,Ort,Laengengrad,Breitengrad,Registrierungsdatum,Inbetriebnahmedatum,EinheitBetriebsstatus,DatumEndgueltigeStilllegung,DatumBeginnVoruebergehendeStilllegung,DatumWiederaufnahmeBetrieb,NameStromerzeugungseinheit,Nettonennleistung,AnschlussAnHoechstOderHochSpannung,Einspeisungsart,NameWindpark,Hersteller,Technologie,Typenbezeichnung,Nabenhoehe,Rotordurchmesser,Rotorblattenteisungssystem,area_blades
30616,SEE967578323439,2023-04-21,Bayern,86470,Thannhausen,10.472708,48.276667,2023-04-21,2023-01-01,In Betrieb,NaT,NaT,NaT,Skywind,0.8,,Teileinspeisung (einschließlich Eigenverbrauch),Skywind,SkyWind GmbH,Vertikalläufer,SkyWind NG,13.0,1.5,0.0,1
30161,SEE908251592698,2023-03-21,Bayern,97618,Hollstadt,10.347222,50.370806,2023-01-16,2023-01-05,In Betrieb,NaT,NaT,NaT,WEA WAR 2,2400.0,,Volleinspeisung,Windpark Weißer Turm Nord,Nordex Energy GmbH,Horizontalläufer,N117/2400,140.6,116.8,0.0,10714
3660,SEE971059741650,2023-04-17,Nordrhein-Westfalen,33181,Bad Wünnenberg,8.823972,51.496552,2023-01-12,2023-01-06,In Betrieb,NaT,NaT,NaT,WB03 - 1380876,4200.0,,Volleinspeisung,Wohlbedacht,ENERCON GmbH,Horizontalläufer,E-138 EP3 E2,160.0,138.25,1.0,15011


In [9]:
df_2023.loc[30616, 'DatumLetzteAktualisierung']

Timestamp('2023-04-21 00:00:00')

In [10]:
df_2023.loc[30616, 'Registrierungsdatum']

Timestamp('2023-04-21 00:00:00')

In [11]:
df_2023.loc[30616, 'DatumLetzteAktualisierung'].floor('D')

Timestamp('2023-04-21 00:00:00')

In [12]:
df_2023.info()

<class 'pandas.core.frame.DataFrame'>
Index: 453 entries, 30616 to 29888
Data columns (total 25 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   EinheitMastrNummer                     453 non-null    object        
 1   DatumLetzteAktualisierung              453 non-null    datetime64[ns]
 2   Bundesland                             453 non-null    object        
 3   Postleitzahl                           453 non-null    int64         
 4   Ort                                    453 non-null    object        
 5   Laengengrad                            453 non-null    float64       
 6   Breitengrad                            453 non-null    float64       
 7   Registrierungsdatum                    453 non-null    datetime64[ns]
 8   Inbetriebnahmedatum                    453 non-null    datetime64[ns]
 9   EinheitBetriebsstatus                  453 non-null    object   

In [13]:
# Group on date turbine went into operation and sum other columns like net nominal power
# This reflects the total net nominal power of all turbines that went into operation on that date..
# This is NOT total net nominal power for ALL turbines in operation that day
df_2023.groupby(by='Inbetriebnahmedatum')[['Nettonennleistung', 'area_blades']].sum()

# Alternative way allowing me to use more than one agg function
# df_2023.groupby(by='Inbetriebnahmedatum')[['Nettonennleistung', 'area_blades']].aggregate(['sum', 'count'])

Unnamed: 0_level_0,Nettonennleistung,area_blades
Inbetriebnahmedatum,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-01,0.8,1
2023-01-05,2400.0,10714
2023-01-06,4200.0,15011
2023-01-10,5609.5,17705
2023-01-12,0.6,1
...,...,...
2023-08-04,26500.0,88638
2023-08-07,1.0,2
2023-08-09,5700.0,17436
2023-08-10,5002.0,13686


In [14]:
df_2023.groupby(by='Inbetriebnahmedatum')[['Nettonennleistung', 'area_blades']].agg(['sum', 'count'])

Unnamed: 0_level_0,Nettonennleistung,Nettonennleistung,area_blades,area_blades
Unnamed: 0_level_1,sum,count,sum,count
Inbetriebnahmedatum,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2023-01-01,0.8,1,1,1
2023-01-05,2400.0,1,10714,1
2023-01-06,4200.0,1,15011,1
2023-01-10,5609.5,3,17705,3
2023-01-12,0.6,1,1,1
...,...,...,...,...
2023-08-04,26500.0,6,88638,6
2023-08-07,1.0,1,2,1
2023-08-09,5700.0,1,17436,1
2023-08-10,5002.0,3,13686,3


-----

## Aggregate and merge on datetime index - test on lastest 10 turbines

In [15]:
datetime_index

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',
               '2023-01-09', '2023-01-10',
               ...
               '2023-08-02', '2023-08-03', '2023-08-04', '2023-08-05',
               '2023-08-06', '2023-08-07', '2023-08-08', '2023-08-09',
               '2023-08-10', '2023-08-11'],
              dtype='datetime64[ns]', name='Inbetriebnahmedatum', length=223, freq='D')

In [16]:
df_2023.at[30082, 'DatumLetzteAktualisierung'].date()

datetime.date(2023, 8, 8)

In [17]:
df_2023.tail(10)

Unnamed: 0,EinheitMastrNummer,DatumLetzteAktualisierung,Bundesland,Postleitzahl,Ort,Laengengrad,Breitengrad,Registrierungsdatum,Inbetriebnahmedatum,EinheitBetriebsstatus,DatumEndgueltigeStilllegung,DatumBeginnVoruebergehendeStilllegung,DatumWiederaufnahmeBetrieb,NameStromerzeugungseinheit,Nettonennleistung,AnschlussAnHoechstOderHochSpannung,Einspeisungsart,NameWindpark,Hersteller,Technologie,Typenbezeichnung,Nabenhoehe,Rotordurchmesser,Rotorblattenteisungssystem,area_blades
30082,SEE933185549142,2023-08-08,Schleswig-Holstein,25926,Karlum,9.017741,54.817604,2023-08-08,2023-08-04,In Betrieb,NaT,NaT,NaT,WEA 2,4200.0,,Volleinspeisung,WEA 2,Vestas Deutschland GmbH,Horizontalläufer,"V136-4,2",112.0,136.0,0.0,14526
21831,SEE903067276619,2023-08-07,Niedersachsen,27432,Oerel,9.066034,53.460066,2023-08-07,2023-08-04,In Betrieb,NaT,NaT,NaT,Windpark Oerel WEA 2,4200.0,,Volleinspeisung,WP Oerel,ENERCON GmbH,Horizontalläufer,E138 EP2 E2,159.0,138.0,1.0,14957
30420,SEE937652577805,2023-08-07,Schleswig-Holstein,24616,Willenscharen,9.853393,54.015535,2023-08-07,2023-08-04,In Betrieb,NaT,NaT,NaT,WEA 2 - 1250378,4000.0,,Volleinspeisung,WP Willenscharen-Brokstedt,ENERCON GmbH,Horizontalläufer,E-126EP3,115.8,127.0,0.0,12667
30506,SEE973997226435,2023-08-07,Baden-Württemberg,76437,Rastatt,8.176968,48.862603,2023-08-07,2023-08-07,In Betrieb,NaT,NaT,NaT,Winkraft,1.0,,Teileinspeisung (einschließlich Eigenverbrauch),,SkyWind GmbH,Vertikalläufer,Mikrowindkraftsystem,9.7782,1.65,0.0,2
19718,SEE963063914649,2023-08-15,Brandenburg,16928,Gerdshagen,12.245942,53.211137,2023-08-15,2023-08-09,In Betrieb,NaT,NaT,NaT,WEA Rapsh. 1,5700.0,,Volleinspeisung,Windpark Rapshagen,Nordex Germany GmbH,Horizontalläufer,N149-5.7 MW,164.89,149.0,0.0,17436
30637,SEE994277782281,2023-08-10,Rheinland-Pfalz,56321,Rhens,7.615526,50.279646,2023-08-10,2023-08-10,In Betrieb,NaT,NaT,NaT,SkyWind NG Dach,1.0,,Teileinspeisung (einschließlich Eigenverbrauch),SKY Dach,SkyWind GmbH,Horizontalläufer,SkyWind NG Version 5.5,10.0,1.5,0.0,1
30638,SEE975190700958,2023-08-10,Rheinland-Pfalz,56321,Rhens,7.615526,50.279646,2023-08-10,2023-08-10,In Betrieb,NaT,NaT,NaT,SkyWind NG Dach2,1.0,,Teileinspeisung (einschließlich Eigenverbrauch),SKY Dach,SkyWind GmbH,Horizontalläufer,SkyWind NG Version 5.5,10.0,1.5,0.0,1
29746,SEE925226906618,2023-08-10,Schleswig-Holstein,25584,Holstenniendorf,9.340211,54.05635,2023-08-10,2023-08-10,In Betrieb,NaT,NaT,NaT,WEA 04,5000.0,,Volleinspeisung,WP HND,Siemens Gamesa Renewable Energy GmbH & Co. KG,Horizontalläufer,Siemens SG-5.0-132,84.0,132.0,0.0,13684
30175,SEE956622405849,2023-08-15,Schleswig-Holstein,25704,Nordermeldorf,9.043998,54.130372,2023-08-15,2023-08-11,In Betrieb,NaT,NaT,NaT,WEA 8,5000.0,,Volleinspeisung,WEA 8,Siemens Gamesa Renewable Energy GmbH & Co. KG,Horizontalläufer,SG 5.0-132,84.0,132.0,0.0,13684
29888,SEE951471840018,2023-08-14,Niedersachsen,49757,Lahn,7.581297,52.815249,2023-08-14,2023-08-11,In Betrieb,NaT,NaT,NaT,WEA 09,4200.0,,Volleinspeisung,Windpark Lahn IV,ENERCON GmbH,Horizontalläufer,E-138 EP3 E2,160.0,138.0,0.0,14957


## Checkpoint here!

In [18]:
df_test = df_2023[['Nettonennleistung', 'area_blades', 'Inbetriebnahmedatum', 'DatumLetzteAktualisierung']].tail(10)
df_test.set_index('Inbetriebnahmedatum', inplace=True)

In [19]:
df_test

Unnamed: 0_level_0,Nettonennleistung,area_blades,DatumLetzteAktualisierung
Inbetriebnahmedatum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-08-04,4200.0,14526,2023-08-08
2023-08-04,4200.0,14957,2023-08-07
2023-08-04,4000.0,12667,2023-08-07
2023-08-07,1.0,2,2023-08-07
2023-08-09,5700.0,17436,2023-08-15
2023-08-10,1.0,1,2023-08-10
2023-08-10,1.0,1,2023-08-10
2023-08-10,5000.0,13684,2023-08-10
2023-08-11,5000.0,13684,2023-08-15
2023-08-11,4200.0,14957,2023-08-14


In [20]:
df_test.groupby(by='Inbetriebnahmedatum').count()

Unnamed: 0_level_0,Nettonennleistung,area_blades,DatumLetzteAktualisierung
Inbetriebnahmedatum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-08-04,3,3,3
2023-08-07,1,1,1
2023-08-09,1,1,1
2023-08-10,3,3,3
2023-08-11,2,2,2


In [21]:
# def test_groupby_apply(group):
#     """
#     The groupby object gets passed in
#     """
#     return group['Nettonennleistung'].sum()

In [22]:
def test_groupby_apply(group):
    """
    The groupby object gets passed in
    """
    return group

In [23]:
# returns multi-index df
df_test.groupby(by='Inbetriebnahmedatum').apply(test_groupby_apply)

Unnamed: 0_level_0,Unnamed: 1_level_0,Nettonennleistung,area_blades,DatumLetzteAktualisierung
Inbetriebnahmedatum,Inbetriebnahmedatum,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-08-04,2023-08-04,4200.0,14526,2023-08-08
2023-08-04,2023-08-04,4200.0,14957,2023-08-07
2023-08-04,2023-08-04,4000.0,12667,2023-08-07
2023-08-07,2023-08-07,1.0,2,2023-08-07
2023-08-09,2023-08-09,5700.0,17436,2023-08-15
2023-08-10,2023-08-10,1.0,1,2023-08-10
2023-08-10,2023-08-10,1.0,1,2023-08-10
2023-08-10,2023-08-10,5000.0,13684,2023-08-10
2023-08-11,2023-08-11,5000.0,13684,2023-08-15
2023-08-11,2023-08-11,4200.0,14957,2023-08-14


In [24]:
df_test.groupby(by='Inbetriebnahmedatum')['Nettonennleistung'].sum()

Inbetriebnahmedatum
2023-08-04    12400.0
2023-08-07        1.0
2023-08-09     5700.0
2023-08-10     5002.0
2023-08-11     9200.0
Name: Nettonennleistung, dtype: float64

### This is close to what I want!
- call the transform method on the groupby object

In [25]:
df_test.groupby(by='Inbetriebnahmedatum')['Nettonennleistung'].transform('sum')

Inbetriebnahmedatum
2023-08-04    12400.0
2023-08-04    12400.0
2023-08-04    12400.0
2023-08-07        1.0
2023-08-09     5700.0
2023-08-10     5002.0
2023-08-10     5002.0
2023-08-10     5002.0
2023-08-11     9200.0
2023-08-11     9200.0
Name: Nettonennleistung, dtype: float64

In [26]:
df_test.groupby(by='Inbetriebnahmedatum')['Nettonennleistung',].transform('sum')

Unnamed: 0_level_0,Nettonennleistung
Inbetriebnahmedatum,Unnamed: 1_level_1
2023-08-04,12400.0
2023-08-04,12400.0
2023-08-04,12400.0
2023-08-07,1.0
2023-08-09,5700.0
2023-08-10,5002.0
2023-08-10,5002.0
2023-08-10,5002.0
2023-08-11,9200.0
2023-08-11,9200.0


In [27]:
df_test.groupby(by='Inbetriebnahmedatum')['Nettonennleistung',].transform('sum')

Unnamed: 0_level_0,Nettonennleistung
Inbetriebnahmedatum,Unnamed: 1_level_1
2023-08-04,12400.0
2023-08-04,12400.0
2023-08-04,12400.0
2023-08-07,1.0
2023-08-09,5700.0
2023-08-10,5002.0
2023-08-10,5002.0
2023-08-10,5002.0
2023-08-11,9200.0
2023-08-11,9200.0


In [28]:
# Access index value in apply function using row.name
def get_index_value(row):
    updated_date = row[0]
    return row.name

In [29]:
# Way to compare turbine dates in an apply function
# if using axis=1 then input is column
def date_compare_bool(row):
    in_operation_date = row.name
    updated_date = row[0]
    if in_operation_date > updated_date:
        return 'after'
    elif in_operation_date == updated_date:
        return 'same'
    else:
        return 'before'

In [30]:
df_test[['DatumLetzteAktualisierung', 'Nettonennleistung', 'area_blades']].apply(date_compare_bool, axis=1)

Inbetriebnahmedatum
2023-08-04    before
2023-08-04    before
2023-08-04    before
2023-08-07      same
2023-08-09    before
2023-08-10      same
2023-08-10      same
2023-08-10      same
2023-08-11    before
2023-08-11    before
dtype: object

In [31]:
df_test['in_operation_check'] = df_test[['DatumLetzteAktualisierung', 'Nettonennleistung', 'area_blades']].apply(date_compare_bool, axis=1)
df_test

Unnamed: 0_level_0,Nettonennleistung,area_blades,DatumLetzteAktualisierung,in_operation_check
Inbetriebnahmedatum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-08-04,4200.0,14526,2023-08-08,before
2023-08-04,4200.0,14957,2023-08-07,before
2023-08-04,4000.0,12667,2023-08-07,before
2023-08-07,1.0,2,2023-08-07,same
2023-08-09,5700.0,17436,2023-08-15,before
2023-08-10,1.0,1,2023-08-10,same
2023-08-10,1.0,1,2023-08-10,same
2023-08-10,5000.0,13684,2023-08-10,same
2023-08-11,5000.0,13684,2023-08-15,before
2023-08-11,4200.0,14957,2023-08-14,before


In [32]:
datetime_index

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',
               '2023-01-09', '2023-01-10',
               ...
               '2023-08-02', '2023-08-03', '2023-08-04', '2023-08-05',
               '2023-08-06', '2023-08-07', '2023-08-08', '2023-08-09',
               '2023-08-10', '2023-08-11'],
              dtype='datetime64[ns]', name='Inbetriebnahmedatum', length=223, freq='D')

In [33]:
# This inner joins the df index on the absolute datetime index
datetime_index.join(df_test.index, how='inner')

DatetimeIndex(['2023-08-04', '2023-08-04', '2023-08-04', '2023-08-07',
               '2023-08-09', '2023-08-10', '2023-08-10', '2023-08-10',
               '2023-08-11', '2023-08-11'],
              dtype='datetime64[ns]', name='Inbetriebnahmedatum', freq=None)

In [57]:
datetime_index.to_frame(index=False)

Unnamed: 0,0
0,2023-01-01
1,2023-01-02
2,2023-01-03
3,2023-01-04
4,2023-01-05
...,...
218,2023-08-07
219,2023-08-08
220,2023-08-09
221,2023-08-10


In [60]:
datetime_index.to_series()

2023-01-01   2023-01-01
2023-01-02   2023-01-02
2023-01-03   2023-01-03
2023-01-04   2023-01-04
2023-01-05   2023-01-05
                ...    
2023-08-07   2023-08-07
2023-08-08   2023-08-08
2023-08-09   2023-08-09
2023-08-10   2023-08-10
2023-08-11   2023-08-11
Freq: D, Length: 223, dtype: datetime64[ns]

In [53]:
df_test.index[0]

Timestamp('2023-08-04 00:00:00')

In [51]:
datetime_index[0]

Timestamp('2023-01-01 00:00:00')

In [179]:
df_test.groupby(by='Inbetriebnahmedatum')['Nettonennleistung'].sum()

Inbetriebnahmedatum
2023-08-04    12400.0
2023-08-07        1.0
2023-08-09     5700.0
2023-08-10     5002.0
2023-08-11     9200.0
Name: Nettonennleistung, dtype: float64

In [180]:
df_test

Unnamed: 0_level_0,Nettonennleistung,area_blades,DatumLetzteAktualisierung,in_operation_check
Inbetriebnahmedatum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-08-04,4200.0,14526,2023-08-08,before
2023-08-04,4200.0,14957,2023-08-07,before
2023-08-04,4000.0,12667,2023-08-07,before
2023-08-07,1.0,2,2023-08-07,same
2023-08-09,5700.0,17436,2023-08-15,before
2023-08-10,1.0,1,2023-08-10,same
2023-08-10,1.0,1,2023-08-10,same
2023-08-10,5000.0,13684,2023-08-10,same
2023-08-11,5000.0,13684,2023-08-15,before
2023-08-11,4200.0,14957,2023-08-14,before


------

## This works but why the duplicate index as a column?
- This is not what i want... I want a row for every single day, not skipping days. Is concat what I want?
- Idea: do groupby FIRST and then join to or merge on datetimeindex
    - Do multiple groupby using various agg and join each to the datetimeindex df and grow out from there?

In [224]:
df_test

Unnamed: 0_level_0,Nettonennleistung,area_blades,DatumLetzteAktualisierung,in_operation_check
Inbetriebnahmedatum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-08-04,4200.0,14526,2023-08-08,before
2023-08-04,4200.0,14957,2023-08-07,before
2023-08-04,4000.0,12667,2023-08-07,before
2023-08-07,1.0,2,2023-08-07,same
2023-08-09,5700.0,17436,2023-08-15,before
2023-08-10,1.0,1,2023-08-10,same
2023-08-10,1.0,1,2023-08-10,same
2023-08-10,5000.0,13684,2023-08-10,same
2023-08-11,5000.0,13684,2023-08-15,before
2023-08-11,4200.0,14957,2023-08-14,before


In [225]:
group = df_test.groupby(by='Inbetriebnahmedatum')['Nettonennleistung']
group

<pandas.core.groupby.generic.SeriesGroupBy object at 0x1469fabf0>

In [226]:
df_test.groupby(by='Inbetriebnahmedatum')['Nettonennleistung'].sum().to_frame()

Unnamed: 0_level_0,Nettonennleistung
Inbetriebnahmedatum,Unnamed: 1_level_1
2023-08-04,12400.0
2023-08-07,1.0
2023-08-09,5700.0
2023-08-10,5002.0
2023-08-11,9200.0


In [239]:
total_powerdf_test.groupby(by='Inbetriebnahmedatum')['Nettonennleistung'].sum()

Inbetriebnahmedatum
2023-08-04    12400.0
2023-08-07        1.0
2023-08-09     5700.0
2023-08-10     5002.0
2023-08-11     9200.0
Name: Nettonennleistung, dtype: float64

In [227]:
type(group.sum())

pandas.core.series.Series

In [228]:
type(datetime_index)

pandas.core.indexes.datetimes.DatetimeIndex

In [229]:
pd.concat([datetime_index.to_frame(), group.sum()], join='inner', axis=1)

Unnamed: 0,Date,Nettonennleistung
2023-08-04,2023-08-04,12400.0
2023-08-07,2023-08-07,1.0
2023-08-09,2023-08-09,5700.0
2023-08-10,2023-08-10,5002.0
2023-08-11,2023-08-11,9200.0


In [238]:
datetime_index.to_frame().join(group.sum(), how='left')

Unnamed: 0_level_0,Date,Nettonennleistung
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-01,2023-01-01,
2023-01-02,2023-01-02,
2023-01-03,2023-01-03,
2023-01-04,2023-01-04,
2023-01-05,2023-01-05,
...,...,...
2023-08-07,2023-08-07,1.0
2023-08-08,2023-08-08,
2023-08-09,2023-08-09,5700.0
2023-08-10,2023-08-10,5002.0


In [232]:
datetime_index.to_frame().join(df_test, how='left')

Unnamed: 0,Date,Nettonennleistung,area_blades,DatumLetzteAktualisierung,in_operation_check
2023-01-01,2023-01-01,,,NaT,
2023-01-02,2023-01-02,,,NaT,
2023-01-03,2023-01-03,,,NaT,
2023-01-04,2023-01-04,,,NaT,
2023-01-05,2023-01-05,,,NaT,
...,...,...,...,...,...
2023-08-10,2023-08-10,1.0,1.0,2023-08-10,same
2023-08-10,2023-08-10,1.0,1.0,2023-08-10,same
2023-08-10,2023-08-10,5000.0,13684.0,2023-08-10,same
2023-08-11,2023-08-11,5000.0,13684.0,2023-08-15,before


In [183]:
df_test.join(datetime_index.to_frame(), how='inner')

Unnamed: 0,Nettonennleistung,area_blades,DatumLetzteAktualisierung,in_operation_check,0
2023-08-04,4200.0,14526,2023-08-08,before,2023-08-04
2023-08-04,4200.0,14957,2023-08-07,before,2023-08-04
2023-08-04,4000.0,12667,2023-08-07,before,2023-08-04
2023-08-07,1.0,2,2023-08-07,same,2023-08-07
2023-08-09,5700.0,17436,2023-08-15,before,2023-08-09
2023-08-10,1.0,1,2023-08-10,same,2023-08-10
2023-08-10,1.0,1,2023-08-10,same,2023-08-10
2023-08-10,5000.0,13684,2023-08-10,same,2023-08-10
2023-08-11,5000.0,13684,2023-08-15,before,2023-08-11
2023-08-11,4200.0,14957,2023-08-14,before,2023-08-11


In [233]:
pd.merge(df_test, datetime_index.to_frame(), left_index=True, right_index=True, how='inner')

Unnamed: 0,Nettonennleistung,area_blades,DatumLetzteAktualisierung,in_operation_check,Date
2023-08-04,4200.0,14526,2023-08-08,before,2023-08-04
2023-08-04,4200.0,14957,2023-08-07,before,2023-08-04
2023-08-04,4000.0,12667,2023-08-07,before,2023-08-04
2023-08-07,1.0,2,2023-08-07,same,2023-08-07
2023-08-09,5700.0,17436,2023-08-15,before,2023-08-09
2023-08-10,1.0,1,2023-08-10,same,2023-08-10
2023-08-10,1.0,1,2023-08-10,same,2023-08-10
2023-08-10,5000.0,13684,2023-08-10,same,2023-08-10
2023-08-11,5000.0,13684,2023-08-15,before,2023-08-11
2023-08-11,4200.0,14957,2023-08-14,before,2023-08-11


In [234]:
datetime_index

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',
               '2023-01-09', '2023-01-10',
               ...
               '2023-08-02', '2023-08-03', '2023-08-04', '2023-08-05',
               '2023-08-06', '2023-08-07', '2023-08-08', '2023-08-09',
               '2023-08-10', '2023-08-11'],
              dtype='datetime64[ns]', name='Date', length=223, freq='D')

In [235]:
df_test

Unnamed: 0_level_0,Nettonennleistung,area_blades,DatumLetzteAktualisierung,in_operation_check
Inbetriebnahmedatum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-08-04,4200.0,14526,2023-08-08,before
2023-08-04,4200.0,14957,2023-08-07,before
2023-08-04,4000.0,12667,2023-08-07,before
2023-08-07,1.0,2,2023-08-07,same
2023-08-09,5700.0,17436,2023-08-15,before
2023-08-10,1.0,1,2023-08-10,same
2023-08-10,1.0,1,2023-08-10,same
2023-08-10,5000.0,13684,2023-08-10,same
2023-08-11,5000.0,13684,2023-08-15,before
2023-08-11,4200.0,14957,2023-08-14,before


In [236]:
datetime_index.to_frame().merge(df_test, left_index=True, right_index=True, how='outer')

Unnamed: 0,Date,Nettonennleistung,area_blades,DatumLetzteAktualisierung,in_operation_check
2023-01-01,2023-01-01,,,NaT,
2023-01-02,2023-01-02,,,NaT,
2023-01-03,2023-01-03,,,NaT,
2023-01-04,2023-01-04,,,NaT,
2023-01-05,2023-01-05,,,NaT,
...,...,...,...,...,...
2023-08-10,2023-08-10,1.0,1.0,2023-08-10,same
2023-08-10,2023-08-10,1.0,1.0,2023-08-10,same
2023-08-10,2023-08-10,5000.0,13684.0,2023-08-10,same
2023-08-11,2023-08-11,5000.0,13684.0,2023-08-15,before


In [185]:
df_test

Unnamed: 0_level_0,Nettonennleistung,area_blades,DatumLetzteAktualisierung,in_operation_check
Inbetriebnahmedatum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-08-04,4200.0,14526,2023-08-08,before
2023-08-04,4200.0,14957,2023-08-07,before
2023-08-04,4000.0,12667,2023-08-07,before
2023-08-07,1.0,2,2023-08-07,same
2023-08-09,5700.0,17436,2023-08-15,before
2023-08-10,1.0,1,2023-08-10,same
2023-08-10,1.0,1,2023-08-10,same
2023-08-10,5000.0,13684,2023-08-10,same
2023-08-11,5000.0,13684,2023-08-15,before
2023-08-11,4200.0,14957,2023-08-14,before


In [186]:
datetime_index

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',
               '2023-01-09', '2023-01-10',
               ...
               '2023-08-02', '2023-08-03', '2023-08-04', '2023-08-05',
               '2023-08-06', '2023-08-07', '2023-08-08', '2023-08-09',
               '2023-08-10', '2023-08-11'],
              dtype='datetime64[ns]', length=223, freq='D')

-----

### Pandas join example

In [187]:
df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
                   'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})

other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
                      'B': ['B0', 'B1', 'B2']})

In [188]:
df1

Unnamed: 0,key,A
0,K0,A0
1,K1,A1
2,K2,A2
3,K3,A3
4,K4,A4
5,K5,A5


In [189]:
other

Unnamed: 0,key,B
0,K0,B0
1,K1,B1
2,K2,B2


In [190]:
df1.set_index('key').join(other.set_index('key'))

Unnamed: 0_level_0,A,B
key,Unnamed: 1_level_1,Unnamed: 2_level_1
K0,A0,B0
K1,A1,B1
K2,A2,B2
K3,A3,
K4,A4,
K5,A5,


----

In [191]:
# This works! But how do i drop the duplicated index of dates with an unnamed column?
# Something to do with the datetime index series not having a name?
# Nope I gave it the same name and still throws it into the columns
pd.merge(datetime_index.to_frame(), df_test, left_index=True, right_index=True, how='inner')

Unnamed: 0,0,Nettonennleistung,area_blades,DatumLetzteAktualisierung,in_operation_check
2023-08-04,2023-08-04,4200.0,14526,2023-08-08,before
2023-08-04,2023-08-04,4200.0,14957,2023-08-07,before
2023-08-04,2023-08-04,4000.0,12667,2023-08-07,before
2023-08-07,2023-08-07,1.0,2,2023-08-07,same
2023-08-09,2023-08-09,5700.0,17436,2023-08-15,before
2023-08-10,2023-08-10,1.0,1,2023-08-10,same
2023-08-10,2023-08-10,1.0,1,2023-08-10,same
2023-08-10,2023-08-10,5000.0,13684,2023-08-10,same
2023-08-11,2023-08-11,5000.0,13684,2023-08-15,before
2023-08-11,2023-08-11,4200.0,14957,2023-08-14,before


In [193]:
# pd.merge(datetime_index.to_frame(), df_test.reset_index(), left_index=True, right_on='Inbetriebnahmedatum', how='inner')

In [194]:
df_test.reset_index()

Unnamed: 0,Inbetriebnahmedatum,Nettonennleistung,area_blades,DatumLetzteAktualisierung,in_operation_check
0,2023-08-04,4200.0,14526,2023-08-08,before
1,2023-08-04,4200.0,14957,2023-08-07,before
2,2023-08-04,4000.0,12667,2023-08-07,before
3,2023-08-07,1.0,2,2023-08-07,same
4,2023-08-09,5700.0,17436,2023-08-15,before
5,2023-08-10,1.0,1,2023-08-10,same
6,2023-08-10,1.0,1,2023-08-10,same
7,2023-08-10,5000.0,13684,2023-08-10,same
8,2023-08-11,5000.0,13684,2023-08-15,before
9,2023-08-11,4200.0,14957,2023-08-14,before


------

------

# ChatGPT example

In [131]:
# Sample dataframe
data = {
    'first_time_operational': ['2023-08-01', '2023-08-05'],
    'decommissioned': ['2023-08-30', '2023-08-20'],
    'shutdown': ['2023-08-10', '2023-08-15'],
    'operational_again': ['2023-08-15', '2023-08-18']
}
df_gpt = pd.DataFrame(data)

# Convert strings to datetime64 format
for column in df_gpt.columns:
    df_gpt[column] = pd.to_datetime(df_gpt[column])

In [132]:
df_gpt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   first_time_operational  2 non-null      datetime64[ns]
 1   decommissioned          2 non-null      datetime64[ns]
 2   shutdown                2 non-null      datetime64[ns]
 3   operational_again       2 non-null      datetime64[ns]
dtypes: datetime64[ns](4)
memory usage: 192.0 bytes


In [133]:
# Create a DatetimeIndex for August 2023
date_range = pd.date_range(start='2023-08-01', end='2023-08-31', freq='D')

In [134]:
date_range

DatetimeIndex(['2023-08-01', '2023-08-02', '2023-08-03', '2023-08-04',
               '2023-08-05', '2023-08-06', '2023-08-07', '2023-08-08',
               '2023-08-09', '2023-08-10', '2023-08-11', '2023-08-12',
               '2023-08-13', '2023-08-14', '2023-08-15', '2023-08-16',
               '2023-08-17', '2023-08-18', '2023-08-19', '2023-08-20',
               '2023-08-21', '2023-08-22', '2023-08-23', '2023-08-24',
               '2023-08-25', '2023-08-26', '2023-08-27', '2023-08-28',
               '2023-08-29', '2023-08-30', '2023-08-31'],
              dtype='datetime64[ns]', freq='D')

In [84]:
# Initialize a Series for the results
results = pd.Series(True, index=date_range)
results

2023-08-01    True
2023-08-02    True
2023-08-03    True
2023-08-04    True
2023-08-05    True
2023-08-06    True
2023-08-07    True
2023-08-08    True
2023-08-09    True
2023-08-10    True
2023-08-11    True
2023-08-12    True
2023-08-13    True
2023-08-14    True
2023-08-15    True
2023-08-16    True
2023-08-17    True
2023-08-18    True
2023-08-19    True
2023-08-20    True
2023-08-21    True
2023-08-22    True
2023-08-23    True
2023-08-24    True
2023-08-25    True
2023-08-26    True
2023-08-27    True
2023-08-28    True
2023-08-29    True
2023-08-30    True
2023-08-31    True
Freq: D, dtype: bool

In [85]:
# Loop through each machine and apply conditions
for _, row in df_gpt.iterrows():
    # Check the operational dates
    results.loc[:row['first_time_operational'] - pd.Timedelta(days=1)] = False
    results.loc[row['decommissioned']:] = False
    
    # Check the shutdown dates
    results.loc[row['shutdown']:row['operational_again'] - pd.Timedelta(days=1)] = False

print(results)

2023-08-01    False
2023-08-02    False
2023-08-03    False
2023-08-04    False
2023-08-05     True
2023-08-06     True
2023-08-07     True
2023-08-08     True
2023-08-09     True
2023-08-10    False
2023-08-11    False
2023-08-12    False
2023-08-13    False
2023-08-14    False
2023-08-15    False
2023-08-16    False
2023-08-17    False
2023-08-18     True
2023-08-19     True
2023-08-20    False
2023-08-21    False
2023-08-22    False
2023-08-23    False
2023-08-24    False
2023-08-25    False
2023-08-26    False
2023-08-27    False
2023-08-28    False
2023-08-29    False
2023-08-30    False
2023-08-31    False
Freq: D, dtype: bool


In [86]:
df_gpt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   first_time_operational  2 non-null      datetime64[ns]
 1   decommissioned          2 non-null      datetime64[ns]
 2   shutdown                2 non-null      datetime64[ns]
 3   operational_again       2 non-null      datetime64[ns]
dtypes: datetime64[ns](4)
memory usage: 192.0 bytes


In [135]:
df_gpt

Unnamed: 0,first_time_operational,decommissioned,shutdown,operational_again
0,2023-08-01,2023-08-30,2023-08-10,2023-08-15
1,2023-08-05,2023-08-20,2023-08-15,2023-08-18


## Trying `df.itertuples()`

In [136]:
mylist = []
for row in df_gpt.itertuples(name='Test'):
    row['last_day'] = row.decommissioned.day
    mylist.append(row)

mylist

TypeError: 'Test' object does not support item assignment

In [89]:
type(mylist[0])

pandas.core.frame.Test

In [98]:
mylist[0]

Test(Index=0, first_time_operational=Timestamp('2023-08-01 00:00:00'), decommissioned=Timestamp('2023-08-30 00:00:00'), shutdown=Timestamp('2023-08-10 00:00:00'), operational_again=Timestamp('2023-08-15 00:00:00'), last_day=Timestamp('2023-08-20 00:00:00'))

In [91]:
mylist[0].decommissioned

Timestamp('2023-08-30 00:00:00')

In [99]:
for tup in mylist:
    print(tup.Index)

0
1


In [101]:
df_test

Unnamed: 0_level_0,Nettonennleistung,area_blades,DatumLetzteAktualisierung,in_operation_check
Inbetriebnahmedatum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-08-04,4200.0,14526,2023-08-08,before
2023-08-04,4200.0,14957,2023-08-07,before
2023-08-04,4000.0,12667,2023-08-07,before
2023-08-07,1.0,2,2023-08-07,same
2023-08-09,5700.0,17436,2023-08-15,before
2023-08-10,1.0,1,2023-08-10,same
2023-08-10,1.0,1,2023-08-10,same
2023-08-10,5000.0,13684,2023-08-10,same
2023-08-11,5000.0,13684,2023-08-15,before
2023-08-11,4200.0,14957,2023-08-14,before


In [120]:
def test_itertuples(dt_idx, df):
    # work on copies so I don't modify original
    dt_idx = dt_idx.copy()
    df = df.copy()
    
    
