In [1]:
import numpy as np
import pandas as pd
import sys, os, warnings, glob

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# pandas and the other libraries used below — consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
# Root folder holding the raw sell-out Excel exports. The historical location
# stays the default, but it can be overridden with the SELLOUT_DATA_DIR
# environment variable so the notebook is not tied to one machine's drive layout.
# `or` (rather than a plain default) also covers the variable being set but empty.
dir_ = os.environ.get('SELLOUT_DATA_DIR') or 'D:/Github/knowledge/time-series/data/daikin/sell-out/'

# Later cells build file paths by plain string concatenation, so the folder
# must always end with a separator.
if not dir_.endswith(('/', '\\')):
    dir_ += '/'

raw_data_dir = dir_                       # where the *.xlsx exports are read from
processed_data_dir = dir_ + 'processed/'  # where the parquet outputs are written

# Create both folders up front; exist_ok makes the cell safe to re-run.
for _folder in (raw_data_dir, processed_data_dir):
    os.makedirs(_folder, exist_ok=True)

In [3]:
def read_multiple_excel_files(file_paths):
    """Load several Excel workbooks and stack them into a single DataFrame.

    Parameters
    ----------
    file_paths : iterable of str
        Paths passed one at a time to ``pd.read_excel`` with its default options.

    Returns
    -------
    pandas.DataFrame
        The rows of every workbook, in the order the paths were given, with a
        fresh 0..n-1 index (``ignore_index=True``).

    Raises
    ------
    ValueError
        Propagated from ``pd.concat`` when ``file_paths`` yields no files.
    """
    frames = [pd.read_excel(path) for path in file_paths]
    return pd.concat(frames, ignore_index=True)

# Collect every workbook in the raw folder and load them into one frame.
# - sorted(): glob returns matches in arbitrary, OS-dependent order, which made
#   the row order of `df` differ between runs and machines.
# - "~$" filter: while a workbook is open, Excel keeps a "~$<name>.xlsx" lock file
#   next to it; it matches the pattern but is not a readable workbook.
file_paths = sorted(
    path
    for path in glob.glob(f"{raw_data_dir}*.xlsx")
    if not os.path.basename(path).startswith("~$")
)
df = read_multiple_excel_files(file_paths)

In [4]:
# Preview the first rows of the combined raw frame.
# NOTE(review): the rendered preview includes the Phone column (customer phone
# numbers) — consider clearing this output before sharing the notebook.
df.head()

Unnamed: 0,Model,SerialNr,InvoiceNr,DealerName,InvoiceDate,ActivatedDate,ActivatedBy,ActivatedByDealerID,DealerTechAgent,CustOrganization,CustCity,Phone
0,ARF25UV1V,E000112,SA/19E_0049678,Công ty Cổ Phần Thế Giới Di Động,2020-03-12,2020-11-23,Công ty Cổ Phần Thế Giới Di Động,C150079,Dealer,Cá nhân,Kiên Giang,342756666
1,ARF25UV1V,E000260,SA/19E_0049585,Công ty Cổ Phần Thế Giới Di Động,2020-03-11,2021-03-09,Công ty Cổ Phần Thế Giới Di Động,C150079,Dealer,Cá nhân,Tây Ninh,908061016
2,ARF25UV1V,E000262,SA/19E_0049585,Công ty Cổ Phần Thế Giới Di Động,2020-03-11,2021-02-17,Công ty Cổ Phần Thế Giới Di Động,C150079,Dealer,Cá nhân,Tây Ninh,981714450
3,ARF25UV1V,E000297,SA/19E_0049695,Công ty Cổ Phần Thế Giới Di Động,2020-03-12,2020-05-16,Công ty Cổ Phần Thế Giới Di Động,C150079,Dealer,Cá nhân,Cần Thơ,367200220
4,ARF25UV1V,E000347,SA/19E_0049590,Công ty Cổ Phần Thế Giới Di Động,2020-03-11,2020-12-08,Công ty Cổ Phần Thế Giới Di Động,C150079,Dealer,Cá nhân,Bến Tre,944609659


In [5]:
# Column dtypes and non-null counts of the combined frame. In the output below,
# CustOrganization, CustCity and Phone each have 13 missing values
# (1172692 non-null out of 1172705 rows).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1172705 entries, 0 to 1172704
Data columns (total 12 columns):
 #   Column               Non-Null Count    Dtype         
---  ------               --------------    -----         
 0   Model                1172705 non-null  object        
 1   SerialNr             1172705 non-null  object        
 2   InvoiceNr            1172705 non-null  object        
 3   DealerName           1172705 non-null  object        
 4   InvoiceDate          1172705 non-null  datetime64[ns]
 5   ActivatedDate        1172705 non-null  datetime64[ns]
 6   ActivatedBy          1172705 non-null  object        
 7   ActivatedByDealerID  1172705 non-null  object        
 8   DealerTechAgent      1172705 non-null  object        
 9   CustOrganization     1172692 non-null  object        
 10  CustCity             1172692 non-null  object        
 11  Phone                1172692 non-null  object        
dtypes: datetime64[ns](2), object(10)
memory usage: 107.4+ MB

In [6]:
# Drop rows with a missing value in any column (same rule as before), but bind
# the result to a new name instead of mutating `df` in place, so the raw frame
# shown by the earlier cells stays intact when cells are re-run.
# NOTE(review): this also discards rows whose only gap is in a column the
# aggregation never reads (e.g. Phone) — confirm that is intended.
sellout_source = df.dropna()

# Raw column name -> column name used in the aggregated table.
rename_cols = {
    "Model":'item',
    "ActivatedDate":'date',
    "SerialNr":'qty',
    "CustCity":'location_name',
}

# Units per (model, activation date, city) = number of serial numbers in the group.
# SerialNr is selected *before* counting so only that one column is aggregated,
# rather than counting every column and discarding all but one.
sellout_df = (
    sellout_source
    .groupby(['Model', 'ActivatedDate', 'CustCity'])['SerialNr']
    .count()
    .reset_index()
    .rename(columns=rename_cols)
)

In [7]:
# Preview the aggregated table (columns: item, date, location_name, qty).
# Use `sellout_df.dtypes` instead to inspect the column types.
sellout_df.head()

Unnamed: 0,item,date,location_name,qty
0,ARF25UV1V,2020-03-15,Hồ Chí Minh,1
1,ARF25UV1V,2020-04-25,Hồ Chí Minh,1
2,ARF25UV1V,2020-04-26,Bình Thuận,1
3,ARF25UV1V,2020-04-26,Bắc Ninh,1
4,ARF25UV1V,2020-04-26,Hà Nội,4


In [8]:
# Persist the aggregated sell-out table in the processed-data folder.
sellout_df.to_parquet(processed_data_dir + 'sellout.parquet')

In [9]:
from geopy.geocoders import Nominatim

# RateLimiter ships with geopy; it spaces out successive calls and retries
# failed ones, which is needed when geocoding a whole column of names.
from geopy.extra.rate_limiter import RateLimiter

geolocator = Nominatim(user_agent="geopy_example")
# The public Nominatim service asks for at most one request per second.
_geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)


def find_lat_long(name, country = 'Việt Nam'):
    """Geocode a place name and return its coordinates.

    Parameters
    ----------
    name : str
        Place to look up. When empty/falsy, the country itself is geocoded.
    country : str, optional
        Appended to ``name`` as ``"<name>, <country>"`` to disambiguate the query.

    Returns
    -------
    tuple of (float, float)
        ``(latitude, longitude)`` of the geocoder's result.

    Raises
    ------
    ValueError
        If the geocoder returns no result for the query. Previously this
        surfaced as an ``AttributeError`` on ``None`` without naming the place.
    """
    location = f'{name}, {country}' if name else country
    address = _geocode(location)
    if address is None:
        # geocode() yields None for unknown places, and the rate limiter also
        # yields None once its retries are exhausted.
        raise ValueError(f'Could not geocode location: {location!r}')
    return (address.latitude, address.longitude)


# One row per distinct location, ordered from most to least frequent in sellout_df.
# The names are read straight from the value_counts() index: the column labels
# produced by value_counts().to_frame().reset_index() differ between pandas 1.x
# ('index' + counts under 'location_name') and 2.x ('location_name' + 'count'),
# so dropping a column called 'count' only worked on pandas >= 2.0.
province_df = pd.DataFrame(
    {'location_name': sellout_df['location_name'].value_counts().index}
)

# Geocode each location once; `location` holds the (latitude, longitude) tuple
# returned by find_lat_long and is reused by the weather cell.
province_df['location'] = province_df['location_name'].apply(find_lat_long)
province_df['latitude'] = province_df['location'].apply(lambda coords: coords[0])
province_df['longitude'] = province_df['location'].apply(lambda coords: coords[1])
# String key of the form "<lat>_<lon>".
province_df['latitude_longitude'] = province_df['location'].apply(
    lambda coords: f'{coords[0]}_{coords[1]}'
)
province_df.to_parquet(f'{processed_data_dir}provinces.parquet')

In [10]:
from meteostat import Point, Daily, Stations
from datetime import datetime

# Date window requested from Meteostat for every location.
start = datetime(2016, 1, 1)
end = datetime(2024, 12, 31)

dataframes = []
for location_name, location in zip(province_df['location_name'], province_df['location']):
    # Fetch the five stations nearest to the location's (lat, lon) and keep
    # those with a non-null `daily_end`.
    nearby = Stations().nearby(*location).fetch(5).dropna(subset=['daily_end'])
    if nearby.empty:
        # None of the nearest stations qualifies: query the location's own
        # coordinates instead of failing with an IndexError on an empty frame.
        _location = Point(*location)
    else:
        # Use the coordinates of the closest qualifying station.
        _location = Point(nearby.latitude.values[0], nearby.longitude.values[0])

    # Daily series for the window; reset_index() turns the index into a regular
    # column before the frames are stacked.
    data = Daily(_location, start, end).fetch().reset_index()
    data['location_name'] = location_name
    dataframes.append(data)

# Stack all locations into one long table and persist it.
weather_df = pd.concat(dataframes, ignore_index=True)
weather_df.to_parquet(f'{processed_data_dir}weather.parquet')

In [11]:
import holidays
from datetime import datetime

# Calendar window covered by the date table.
start_date = datetime(2016, 1, 1)
end_date = datetime(2024, 12, 31)

# One row per calendar day between start_date and end_date.
date_list = pd.date_range(start_date, end_date, freq='D').tolist()
date_df = pd.DataFrame({'date': date_list})

# Country code and the years for which holidays are generated.
country = 'VN'
years = range(start_date.year, end_date.year + 1)

# Holiday calendar for those years (maps date -> holiday name).
# NOTE(review): newer `holidays` releases document `country_holidays` as the
# replacement for `CountryHoliday` — confirm against the installed version.
vietnam_holidays = holidays.CountryHoliday(country, years=years)

# Flatten the calendar into records, then into a frame with a datetime `date` column.
holiday_list = [dict(date=day, holiday=label) for day, label in vietnam_holidays.items()]
holiday_df = pd.DataFrame(holiday_list).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Left join keeps every calendar day; `holiday` is missing on non-holidays.
date_df = pd.merge(date_df, holiday_df, on='date', how='left')
date_df.to_parquet(processed_data_dir + 'date.parquet')