### Import libraries

In [2]:
import pandas as pd
import time

### Loading and preprocessing data

In [3]:
countries = ['FIN','DEN', 'NOR', 'SWE']

In [4]:
# Input path and filename
path = '../5. Master_thesis/Datasets/'
filename = path + 'Retails_ALL_CY06_CY22.xlsx'
external_file = path + 'External_indicators_extended.xlsx'

start_time = time.time()

for country in countries:

    print("Loading " + country + '...')

    # Load file into pandas dataframe
    data_int = pd.read_excel(filename, sheet_name='Retails')
    data_ext = pd.read_excel(external_file, sheet_name=country)

    df = data_int.copy()
    df_ext = data_ext.copy()

    # Remove potential extra spaces in cells using strip() function
    df['RetailCountry'] = df['RetailCountry'].str.strip()
    df['orderNumber'] = df['orderNumber'].str.strip()

    # Sort by orderNumber and RetailDate
    df.sort_values(by=['orderNumber', 'RetailDate'], inplace=True)

    # Remove duplicates by orderNumber
    df.drop_duplicates(subset=['orderNumber'], keep='last', inplace=True)

    # Get only Finland orders
    df = df[(df['RetailCountry'] == country)]    

    # Convert RetailDate to date
    df['Date'] = pd.to_datetime(df['RetailDate'], dayfirst=True)
    df['Date'] = df['Date'].dt.to_period('M').dt.to_timestamp()    

    # Generating index by date, grouping by and counting
    df = df.groupby(['Date']).size().reset_index(name='Orders') 

    # Set index in both dataframes
    df = df.set_index("Date")
    df_ext = df_ext.set_index("Date")

    # Merge both dataframes
    data = pd.concat([df,df_ext], axis=1)       

    # Save data to Excel  
    with pd.ExcelWriter("Datasets/Output_files/0.xlsx", mode="a", engine="openpyxl", if_sheet_exists="replace") as writer:
        data.to_excel(writer, sheet_name=country) 

end_time = time.time()
time_elapsed = end_time - start_time 

print('Completed in {:0.2f} minutes'.format(time_elapsed/60))

Loading FIN...
Loading DEN...
Loading NOR...
Loading SWE...
Completed in 6.36 minutes
