In [321]:
#Importing all the packages and the CSV file
import pandas as pd
import numpy as np
from pandasql import sqldf
# Load the raw customer switch table (path is relative to the notebook's working directory).
# NOTE(review): the preview below prints `value` with decimal commas (e.g. 99,9), so the
# column is loaded as text, not as a number - confirm this is intended before doing arithmetic on it.
df = pd.read_csv(filepath_or_buffer='Customer table.csv')

In [322]:
#Removing the timestamp values from the switch columns
# Each column is parsed once and formatted through the vectorised .dt accessor instead of
# building a pd.Timestamp per row. Missing dates stay missing (NaN) rather than raising,
# because NaT does not support strftime.
# NOTE(review): assumes every value in a column uses the same date format - confirm against the CSV.
df['switch_start'] = pd.to_datetime(df['switch_start']).dt.strftime('%Y-%m-%d')
df['switch_end'] = pd.to_datetime(df['switch_end']).dt.strftime('%Y-%m-%d')

#CSV Preview
print(df)

      id  customer_id  switches   value switch_start  switch_end
0      1         1111         5    99,9   2022-01-10  2022-12-31
1      2         1111         5  101,99   2023-01-01  2023-01-31
2      3         1111         5  103,98   2023-02-01  2023-02-28
3      4         1111         5  105,97   2023-03-01  2023-03-31
4      5         1111         5  107,96   2023-04-01  2023-05-01
..   ...          ...       ...     ...          ...         ...
145  146         1239         1    99,9   2023-01-01  2024-01-01
146  147         1240         1    99,9   2023-02-01  2024-02-01
147  148         1241         1    99,9   2023-03-01  2024-02-29
148  149         1242         1    99,9   2023-01-01  2024-01-01
149  150         1243         1    99,9   2023-02-01  2024-02-01

[150 rows x 6 columns]


In [332]:
#Importing datetime to generate a dataframe with all dates from the beginning until today
from datetime import datetime

# start and end date
# start_date is a datetime and end_date a 'yyyy-mm-dd' string; pd.date_range accepts both.
# NOTE(review): the start date is hard-coded; it matches the earliest switch_start shown in
# the CSV preview - confirm it should not be derived from the data instead.
now = datetime.now()
start_date = datetime.strptime("2022-01-10", "%Y-%m-%d")
end_date = now.strftime("%Y-%m-%d")

#Preview
# One entry per calendar day, both bounds inclusive.
date_list = pd.date_range(start_date, end_date, freq='D')
# Format start_date explicitly so both bounds are reported as yyyy-mm-dd
# (previously the start was printed with a trailing 00:00:00).
range_message = f"Creating list of dates starting from {start_date:%Y-%m-%d} to {end_date}"
print(range_message)
print(date_list)

Creating list of dates starting from 2022-01-10 00:00:00 to 2023-05-02
DatetimeIndex(['2022-01-10', '2022-01-11', '2022-01-12', '2022-01-13',
               '2022-01-14', '2022-01-15', '2022-01-16', '2022-01-17',
               '2022-01-18', '2022-01-19',
               ...
               '2023-04-23', '2023-04-24', '2023-04-25', '2023-04-26',
               '2023-04-27', '2023-04-28', '2023-04-29', '2023-04-30',
               '2023-05-01', '2023-05-02'],
              dtype='datetime64[ns]', length=478, freq='D')


In [370]:
#Creating the date dataframe
# date_list is a DatetimeIndex, so it is formatted to yyyy-mm-dd strings in a single
# vectorised call instead of wrapping every row in pd.Timestamp through apply.
# The dates are kept as text because the SQL joins compare them with the text switch dates.
dates_df = pd.DataFrame({'Datetime': date_list.strftime('%Y-%m-%d')})

#Overview of dates
print(dates_df)

       Datetime
0    2022-01-10
1    2022-01-11
2    2022-01-12
3    2022-01-13
4    2022-01-14
..          ...
473  2023-04-28
474  2023-04-29
475  2023-04-30
476  2023-05-01
477  2023-05-02

[478 rows x 1 columns]


In [345]:
#Transforming all the dates in yyyy-mm-dd from the CSV file
# This repeats the normalisation done right after loading the CSV; it is idempotent, so
# re-running it on already formatted values leaves them unchanged.
# Vectorised parse + format; missing dates stay missing (NaN) instead of raising, because
# NaT does not support strftime.
df['switch_start'] = pd.to_datetime(df['switch_start']).dt.strftime('%Y-%m-%d')
df['switch_end'] = pd.to_datetime(df['switch_end']).dt.strftime('%Y-%m-%d')

In [346]:
#Testing SQL environment 
def pysqldf(q):
    """Run the SQL query string `q` through pandasql's sqldf and return its result."""
    return sqldf(q)


# Smoke test: select every column of df through the SQL layer.
pysqldf("SELECT df.* FROM df")

Unnamed: 0,id,customer_id,switches,value,switch_start,switch_end
0,1,1111,5,999,2022-01-10,2022-12-31
1,2,1111,5,10199,2023-01-01,2023-01-31
2,3,1111,5,10398,2023-02-01,2023-02-28
3,4,1111,5,10597,2023-03-01,2023-03-31
4,5,1111,5,10796,2023-04-01,2023-05-01
...,...,...,...,...,...,...
145,146,1239,1,999,2023-01-01,2024-01-01
146,147,1240,1,999,2023-02-01,2024-02-01
147,148,1241,1,999,2023-03-01,2024-02-29
148,149,1242,1,999,2023-01-01,2024-01-01


In [351]:
# Selecting one customer to test the join
# Each calendar day is matched to the switch period of customer 1111 that covers it
# (switch_start <= date <= switch_end, compared as yyyy-mm-dd text).
# The join is written as INNER JOIN because the customer filter in WHERE already discarded
# every unmatched date; the former LEFT JOIN returned exactly the same rows but suggested
# that uncovered dates were kept. To keep uncovered dates, filter df inside the
# customer_table CTE instead and switch back to LEFT JOIN.
customer_1111 = pysqldf("""
            WITH 
            dates AS 
            (
            SELECT Datetime as date
            FROM dates_df 
            )
            ,
            customer_table AS 
            (
            SELECT   switch_start,
             switch_end,
             customer_id,
            value as switch_value
            FROM df
            )
           select dd.date, 
            ct.*
           from dates dd
               inner join customer_table ct
                   on dd.date >= ct.switch_start 
                       and dd.date <= ct.switch_end
           where ct.customer_id = 1111
           order by dd.date asc, ct.switch_start asc
        """)

print(customer_1111)

           date switch_start  switch_end  customer_id switch_value
0    2022-01-10   2022-01-10  2022-12-31         1111         99,9
1    2022-01-11   2022-01-10  2022-12-31         1111         99,9
2    2022-01-12   2022-01-10  2022-12-31         1111         99,9
3    2022-01-13   2022-01-10  2022-12-31         1111         99,9
4    2022-01-14   2022-01-10  2022-12-31         1111         99,9
..          ...          ...         ...          ...          ...
472  2023-04-27   2023-04-01  2023-05-01         1111       107,96
473  2023-04-28   2023-04-01  2023-05-01         1111       107,96
474  2023-04-29   2023-04-01  2023-05-01         1111       107,96
475  2023-04-30   2023-04-01  2023-05-01         1111       107,96
476  2023-05-01   2023-04-01  2023-05-01         1111       107,96

[477 rows x 5 columns]


In [368]:
#Checking all customer ids and the number of switches, rows and min/max dates
# One row per (customer_id, switches) pair with: the number of distinct switch values,
# the number of joined day rows, and the first/last calendar day covered.
# GROUP BY already yields one row per group, so the former DISTINCT was redundant.
# Columns are qualified and aliased explicitly, and the result is ordered so the
# output is stable between runs.
# The LEFT JOIN is kept on purpose: a day that no switch period covers would show up
# as a group with a NULL customer_id.
customer_check = pysqldf("""
            WITH 
            dates AS 
            (
            SELECT 
                Datetime as date
            FROM dates_df 
            )
            ,
            customer_table AS 
            (
            SELECT 
                customer_id,
                switch_start,
                switch_end,
                switches, 
                value as switch_value
            FROM df
            )
            select 
                ct.customer_id as customer_id, 
                ct.switches as switches,
                count(distinct ct.switch_value) as distinct_switch_value,
                count(*) as row_count,
                min(dd.date) as min_date, 
                max(dd.date) as max_date
            from dates dd
                left join customer_table ct
                    on dd.date >= ct.switch_start 
                    and dd.date <= ct.switch_end
            group by ct.customer_id, ct.switches
            order by ct.customer_id, ct.switches
        """)

print(customer_check)

     customer_id  switches  distinct_switch_value  row_count    min_date  \
0           1111         5                      5        477  2022-01-10   
1           1112         2                      2        122  2023-01-01   
2           1113         3                      3        122  2023-01-01   
3           1114         1                      1        153  2022-12-01   
4           1115         1                      1        153  2022-12-01   
..           ...       ...                    ...        ...         ...   
128         1239         1                      1        122  2023-01-01   
129         1240         1                      1         91  2023-02-01   
130         1241         1                      1         63  2023-03-01   
131         1242         1                      1        122  2023-01-01   
132         1243         1                      1         91  2023-02-01   

       max_date  
0    2023-05-01  
1    2023-05-02  
2    2023-05-02  
3    2023-05-02

In [361]:
#Final output
# One row per calendar day and switch period covering that day
# (switch_start <= date <= switch_end, compared as yyyy-mm-dd text).
# last_idx numbers each customer's switch periods from the most recent start date
# backwards, so last_idx = 1 is the customer's latest switch.
# The ordering has explicit tie-break keys: sorting by date alone left the order of
# rows within a day undefined, which made the exported file unstable between runs.
customer_final = pysqldf("""
            WITH 
            dates AS 
            (
            SELECT 
                Datetime as date
            FROM dates_df 
            )
            ,
            customer_table AS 
            (
            SELECT 
                switch_start,
                switch_end,
                customer_id,
                value as switch_value,
                row_number() over (partition by customer_id order by switch_start desc) as last_idx
            FROM df
            )
           select 
            dd.date, 
            ct.*
           from dates dd
               left join customer_table ct
                   on dd.date >= ct.switch_start 
                       and dd.date <= ct.switch_end
           order by dd.date asc, ct.customer_id asc, ct.switch_start asc
        """)

print(customer_final)

             date switch_start  switch_end  customer_id switch_value  last_idx
0      2022-01-10   2022-01-10  2022-12-31         1111         99,9         5
1      2022-01-10   2022-01-10  2022-12-31         1120         99,9         5
2      2022-01-10   2022-01-10  2022-12-31         1166          101         4
3      2022-01-11   2022-01-10  2022-12-31         1111         99,9         5
4      2022-01-11   2022-01-10  2022-12-31         1120         99,9         5
...           ...          ...         ...          ...          ...       ...
17151  2023-05-02   2023-01-01  2024-01-01         1239         99,9         1
17152  2023-05-02   2023-02-01  2024-02-01         1240         99,9         1
17153  2023-05-02   2023-03-01  2024-02-29         1241         99,9         1
17154  2023-05-02   2023-01-01  2024-01-01         1242         99,9         1
17155  2023-05-02   2023-02-01  2024-02-01         1243         99,9         1

[17156 rows x 6 columns]


In [362]:
#Exporting CSV
# Write the joined daily table next to the notebook; the DataFrame index is left out of the file.
customer_final.to_csv(path_or_buf='customer_final.csv', index=False)