In [2]:
import numpy as np
import pandas as pd
import sys, os, warnings, glob

warnings.filterwarnings('ignore')

In [3]:
# Root folder of the sell-in data. The trailing slash matters: every other path
# in the notebook is built from it by plain string concatenation.
dir_ = 'D:/Github/knowledge/time-series/data/daikin/sell-in/'

raw_data_dir = dir_
processed_data_dir = f'{dir_}processed/'

# Make sure both folders exist; nothing happens if they are already there.
for _data_dir in (raw_data_dir, processed_data_dir):
    os.makedirs(_data_dir, exist_ok=True)

In [3]:
def read_multiple_excel_files(file_paths):
    """Read several Excel workbooks and stack them into a single DataFrame.

    Args:
        file_paths: Iterable of paths; each one is passed to ``pd.read_excel``.

    Returns:
        A DataFrame holding the rows of every file, in the order the paths
        were given, with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``file_paths`` contains no paths.
    """
    file_paths = list(file_paths)
    if not file_paths:
        # pd.concat([]) would fail with a bare "No objects to concatenate";
        # report the actual cause (usually a wrong folder or glob pattern).
        raise ValueError('No Excel files to read: file_paths is empty')

    dataframes = [pd.read_excel(file_path) for file_path in file_paths]
    return pd.concat(dataframes, ignore_index=True)

# Collect every workbook directly inside the raw data folder.
# - glob returns matches in arbitrary order, so sort to keep the row order of
#   the combined frame reproducible between runs.
# - Names starting with "~$" are the lock files Excel creates next to an open
#   workbook; they match *.xlsx but cannot be parsed, so skip them.
file_paths = sorted(
    path
    for path in glob.glob(f"{raw_data_dir}*.xlsx")  # Adjust the path as needed
    if not os.path.basename(path).startswith("~$")
)
df = read_multiple_excel_files(file_paths)

In [4]:
# Preview the first rows of the frame combined from the Excel files.
df.head()

Unnamed: 0,Date,Customer account,Department,Sales order,VAT Invoice No,Serial,Customer name,Search name,Item,SBU,...,Rebate Y,Discount code,Notes,Description,Pool,Credit limit control,LC Code,The invoice amount in the accounting currency,Cash %,Rebate Q
0,2016-03-01,C110187,S2D,F030001-SO,24000,SA/15T,Công ty TNHH Toàn Châu Á,TOAN CHAU A,FBQ125EVE,SP,...,0,,,Dàn lạnh máy ĐHKK Daikin FBQ125EVE,DEALER,,,79808906,0,0
1,2016-03-01,C110187,S2D,F030001-SO,24000,SA/15T,Công ty TNHH Toàn Châu Á,TOAN CHAU A,RZR125MVM,SP,...,0,,,Dàn nóng máy ĐHKK Daikin RZR125MVM,DEALER,,,79808906,0,0
2,2016-03-01,C110187,S2D,F030001-SO,24000,SA/15T,Công ty TNHH Toàn Châu Á,TOAN CHAU A,BRC1E62,SP,...,0,,,Bộ điều khiển Daikin BRC1E62,DEALER,,,79808906,0,0
3,2016-03-01,C110187,S2D,F030001-SO,24000,SA/15T,Công ty TNHH Toàn Châu Á,TOAN CHAU A,FTNE50MV1V,RA,...,0,,,Dàn lạnh máy ĐHKK Daikin FTNE50MV1V,DEALER,,,79808906,0,0
4,2016-03-01,C110187,S2D,F030001-SO,24000,SA/15T,Công ty TNHH Toàn Châu Á,TOAN CHAU A,RNE50MV1V,RA,...,0,,,Dàn nóng máy ĐHKK Daikin RNE50MV1V,DEALER,,,79808906,0,0


In [5]:
# Keep only the rows whose customer name contains 'thế giới di động'
# (case-insensitive); rows with a missing name are treated as non-matches.
# Alternative filter by account: df = df[df['Customer account'] == 'C150079']
dmx_index = df['Customer name'].str.contains('thế giới di động', na=False, case=False) #& df['department'].isin(['O2X'])
dmx_df = df.loc[dmx_index]

In [6]:
# Summarise the filtered frame: row count, column dtypes and non-null counts.
dmx_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 175146 entries, 134 to 2564792
Data columns (total 31 columns):
 #   Column                                         Non-Null Count   Dtype         
---  ------                                         --------------   -----         
 0   Date                                           175146 non-null  datetime64[ns]
 1   Customer account                               175146 non-null  object        
 2   Department                                     175146 non-null  object        
 3   Sales order                                    175146 non-null  object        
 4   VAT Invoice No                                 175146 non-null  object        
 5   Serial                                         175100 non-null  object        
 6   Customer name                                  175146 non-null  object        
 7   Search name                                    175146 non-null  object        
 8   Item                                          

In [13]:
# dmx_df['Customer account'].value_counts().to_csv('dmx_customer.csv', header=True)
# dmx_df[dmx_df['Customer account'] == 'C171137']

In [7]:
# Total quantity per (date, department, item).
# Select 'Quantity' BEFORE aggregating: calling .sum() on the whole grouped
# frame would also try to sum every other column (including the text ones),
# which is slow and can raise TypeError on columns that cannot be added.
sellin_df = (
    dmx_df
    .groupby(['Date', 'Department', 'Item'])['Quantity']
    .sum()
    .reset_index(drop=False)
)

# Normalise the column names used by the rest of the pipeline.
rename_cols = {
    "Item":'item',
    "Date":'date',
    "Quantity":'qty',
    "Department":'department',
}
sellin_df = sellin_df.rename(rename_cols, axis=1)

In [8]:
# Discard every aggregated row that has a missing value, then show (rows, columns).
sellin_df = sellin_df.dropna()
sellin_df.shape

(17085, 4)

In [9]:
# Persist the aggregated sell-in table as Parquet in the processed folder.
sellin_df.to_parquet(f'{processed_data_dir}sellin.parquet')

In [11]:
# Reload the sell-in table from the workbook in the processed folder, keep the
# untouched quantities in `qty_actual`, and replace negative `qty` values by 0.
df = pd.read_excel(f'{processed_data_dir}dmx_sellin.xlsx').assign(
    qty_actual=lambda frame: frame['qty'],
    qty=lambda frame: frame['qty'].mask(frame['qty'] < 0, 0),
)

In [12]:
# Write the reloaded table (with `qty_actual` and the non-negative `qty`) as Parquet.
# NOTE(review): this is the same path the aggregated `sellin_df` was written to
# earlier in the notebook, so that file is replaced here — confirm this is intended.
df.to_parquet(f'{processed_data_dir}sellin.parquet')

In [9]:
from geopy.geocoders import Nominatim

# Shared geocoder client used by find_lat_long below.
# NOTE(review): "geopy_example" looks like a placeholder user agent — consider
# an application-specific value.
geolocator = Nominatim(user_agent="geopy_example")
def find_lat_long(name, country = 'Việt Nam'):
    """Geocode a place name and return its coordinates.

    Args:
        name: Place to look up. When falsy (``''`` or ``None``) only the
            country itself is geocoded.
        country: Appended to ``name`` as ``'<name>, <country>'`` to build the
            query.

    Returns:
        A ``(latitude, longitude)`` tuple taken from the geocoder's result.

    Raises:
        ValueError: If the geocoder finds no match for the query.
    """
    location = f'{name}, {country}' if name else country
    address = geolocator.geocode(location)
    if address is None:
        # geocode() returns None when nothing matches; without this check the
        # failure shows up as an AttributeError that hides the offending query.
        raise ValueError(f'Could not geocode location: {location!r}')
    return (address.latitude, address.longitude)
# Quick check that the geocoder responds. The name is prefixed to the default
# country, so the query actually sent is 'Việt Nam, Việt Nam'.
find_lat_long('Việt Nam')


# One row per distinct location, most frequent first.
# NOTE(review): `location_name` is not among the columns produced by the
# aggregation cell earlier in this notebook (date/department/item/qty) —
# confirm which `sellin_df` this cell expects.
# Only the index of value_counts() is needed; building the frame from it avoids
# relying on the name pandas gives the counts column (it differs by version).
location_names = sellin_df['location_name'].value_counts().index.tolist()
province_df = pd.DataFrame({'location_name': location_names})

# Geocode each location once; `location` holds the (latitude, longitude) tuple.
province_df['location'] = province_df['location_name'].apply(find_lat_long)
province_df['latitude'] = [lat for lat, _ in province_df['location']]
province_df['longitude'] = [lon for _, lon in province_df['location']]
province_df['latitude_longitude'] = [f'{lat}_{lon}' for lat, lon in province_df['location']]

province_df.to_parquet(f'{processed_data_dir}provinces.parquet')

In [10]:
from meteostat import Point, Daily, Stations
from datetime import datetime

# Period for which daily weather is downloaded (both bounds passed to Daily).
start = datetime(2016, 1, 1)
end = datetime(2024, 12, 31)

dataframes = []
for location_name, location in zip(province_df['location_name'], province_df['location']):
    # Take the 5 closest stations and keep those with a recorded `daily_end`.
    station = Stations().nearby(*location).fetch(5).dropna(subset=['daily_end'])

    if station.empty:
        # Previously this case crashed with IndexError on `.values[0]`.
        # Fall back to querying the geocoded coordinates directly.
        print(f'No nearby station with daily data for {location_name!r}; using the geocoded point')
        _location = Point(*location)
    else:
        # Query at the coordinates of the closest remaining station.
        _location = Point(station.latitude.values[0], station.longitude.values[0])

    # Turn the date index into a regular column and tag rows with their location.
    data = Daily(_location, start, end).fetch().reset_index()
    data['location_name'] = location_name
    dataframes.append(data)

weather_df = pd.concat(dataframes, ignore_index=True)
weather_df.to_parquet(f'{processed_data_dir}weather.parquet')

In [10]:
import holidays
from datetime import datetime

start_date = datetime(2016, 1, 1)
end_date = datetime(2024, 12, 31)

# One row per calendar day in the range (both ends included).
date_list = pd.date_range(start=start_date, end=end_date).tolist()
date_df = pd.DataFrame(date_list, columns=['date'])

# Country code and the years covered by the date range.
country = 'VN'
years = range(start_date.year, end_date.year+1)

# country_holidays() is the supported entry point; CountryHoliday is its
# deprecated name.
vietnam_holidays = holidays.country_holidays(country, years=years)

# One record per holiday date.
holiday_list = [{"date": date, "holiday": name} for date, name in vietnam_holidays.items()]

# Naming the columns explicitly keeps 'date' present even if no holiday is returned.
holiday_df = pd.DataFrame(holiday_list, columns=['date', 'holiday'])
holiday_df['date'] = pd.to_datetime(holiday_df['date'])

# Left join: every calendar day is kept, `holiday` is missing on ordinary days.
date_df = date_df.merge(holiday_df, how='left', on='date')
date_df.to_parquet(f'{processed_data_dir}date.parquet')