In [260]:
import pandas as pd

In [261]:
# Lower-case three-letter month abbreviations in calendar order.
# Position i holds month number i + 1, so `months[m - 1]` names month m.
months = [
    'jan', 'feb', 'mar',
    'apr', 'may', 'jun',
    'jul', 'aug', 'sep',
    'oct', 'nov', 'dec',
]

In [262]:
# Province lookup table, with the first CSV row used as the header.
# Later cells read its `thName` and `enName` columns to translate province names.
provinces = pd.read_csv(
    filepath_or_buffer='datasets/map/provinces.csv',
    header=0,
)

In [263]:
# Load the rainfall dataset (one row per province / month observation);
# the first CSV column is used as the index.
rainfall = pd.read_csv('datasets/rainfall/rainfall_province_monthly.csv', index_col=0)

# Build the Thai -> English province lookup once instead of re-scanning the
# whole `provinces` table for every rainfall row. `drop_duplicates` keeps the
# first occurrence of a Thai name, so a duplicated name resolves to its first row.
_th_to_en = provinces.drop_duplicates(subset='thName').set_index('thName')['enName']

# Fail with a readable message when a province cannot be translated, rather
# than letting it turn into a missing key that `groupby` would silently drop.
_unknown_rainfall_provinces = rainfall.loc[
    ~rainfall['PROV_T'].isin(_th_to_en.index), 'PROV_T'
].unique()
if len(_unknown_rainfall_provinces) > 0:
    raise KeyError(
        f"rainfall provinces missing from the provinces table: {list(_unknown_rainfall_provinces)}"
    )

# Add columns whose values match the keys used by the other datasets.
rainfall['PROV_E'] = rainfall['PROV_T'].map(_th_to_en)
# MONTH is a 1-based month number; `months` is 0-based.
rainfall['MONTH_NAME'] = rainfall['MONTH'].map(lambda x: months[x - 1])

# Per-month extremes of MaxRain over every row of that month (used to
# normalise scores), and the mean MaxRain per (month, province) pair.
_max_rain_by_month = rainfall.groupby(['MONTH_NAME'])['MaxRain']
min_rainfall_per_month = _max_rain_by_month.min()
max_rainfall_per_month = _max_rain_by_month.max()
max_rainfall_per_month_per_province = rainfall.groupby(['MONTH_NAME', 'PROV_E'])['MaxRain'].mean()

In [264]:
# Load the tourism dataset in long format: each row carries a `variable`
# name, its `value`, a `date` and the Thai province name.
tourism = pd.read_csv('datasets/tourism_count/thailand_domestic_tourism_2019_2023_ver2.csv')

# Build the Thai -> English province lookup once instead of re-scanning the
# whole `provinces` table for every tourism row. `drop_duplicates` keeps the
# first occurrence of a Thai name, so a duplicated name resolves to its first row.
_th_to_en = provinces.drop_duplicates(subset='thName').set_index('thName')['enName']


def _prepare_tourism_variable(frame, variable, th_to_en):
    """Return an independent copy of the rows of `frame` for one `variable`.

    Two columns are added so the result lines up with the other datasets:
      - 'PROV_E': English province name looked up from 'province_thai'
      - 'month':  lower-case month abbreviation derived from 'date'

    Raises:
        KeyError: if a Thai province name has no entry in `th_to_en`.
    """
    subset = frame[frame['variable'] == variable].copy()

    unknown = subset.loc[~subset['province_thai'].isin(th_to_en.index), 'province_thai'].unique()
    if len(unknown) > 0:
        raise KeyError(
            f"tourism provinces missing from the provinces table ({variable}): {list(unknown)}"
        )

    subset['PROV_E'] = subset['province_thai'].map(th_to_en)
    # `.dt.month` is 1-based; the module-level `months` list is 0-based.
    subset['month'] = pd.to_datetime(subset['date']).dt.month.map(lambda x: months[x - 1])
    return subset


# One frame per variable of interest, both transformed the same way.
domestic_tourists = _prepare_tourism_variable(tourism, 'no_tourist_thai', _th_to_en)
tourism_ratio = _prepare_tourism_variable(tourism, 'ratio_tourist_stay', _th_to_en)

In [265]:
# Aggregate the tourism frames for scoring.

# Smallest and largest domestic tourist `value` among all rows of each month.
min_domestic_tourist_per_month = (
    domestic_tourists
    .groupby(by=['month'])['value']
    .min()
)
max_domestic_tourist_per_month = (
    domestic_tourists
    .groupby(by=['month'])['value']
    .max()
)

# Mean `value` for every (month, province) pair, for each of the two variables.
tourism_per_month_per_province = (
    domestic_tourists
    .groupby(by=['month', 'PROV_E'])['value']
    .mean()
)
tourism_ratio_per_month_per_province = (
    tourism_ratio
    .groupby(by=['month', 'PROV_E'])['value']
    .mean()
)

In [284]:
# Weight of each component in the overall score.
# The weights add up to 1, so a weighted sum of components that each lie in
# [0, 1] also lies in [0, 1].
rainfall_weight = 0.4
domestic_tourist_weight = 0.35
tourism_ratio_weight = 0.25


def _inverse_min_max(value, lower, upper):
    """Return 1 minus the min-max normalised position of `value` in [lower, upper].

    `lower` maps to 1 and `upper` maps to 0. When `lower == upper` there is no
    spread to normalise against, so the value is treated as sitting at the
    minimum and 1.0 is returned instead of dividing by zero.
    """
    span = upper - lower
    if span == 0:
        return 1.0
    return 1 - ((value - lower) / span)


def calculate_score(month, province_rainfall, num_domestic_tourist, tourism_ratio):
    """Score a province for a given month as a weighted sum of three components.

    Inputs:
      - maximum historical rainfall in millimeters for the given month
      - number of domestic tourists that visit the province for the given month
      - tourist ratio for the province (described by the author as the ratio of
        tourists to locals; presumably a percentage between 0 and 100 -- TODO confirm)

    The higher the rainfall amount, the more penalty is applied to the final score.
    The number of domestic tourists is used to determine destinations/events
    known only to locals: fewer domestic tourists gives a higher score.
    The ratio component is highest when the ratio is closest to 50%.

    Args:
        month: key into the module-level per-month min/max lookups
            (`min_rainfall_per_month`, `max_rainfall_per_month`,
            `min_domestic_tourist_per_month`, `max_domestic_tourist_per_month`).
        province_rainfall: rainfall figure for the province in that month.
        num_domestic_tourist: domestic tourist figure for the province in that month.
        tourism_ratio: ratio figure for the province in that month.

    Returns:
        The weighted score. It lies in [0, 1] when the rainfall and tourist
        inputs fall inside that month's min/max range and the ratio is within
        [0, 100]; a ratio outside [0, 100] makes its component negative.
    """
    # Position of the province inside that month's rainfall range, inverted so
    # that less rain scores higher.
    rain_score = _inverse_min_max(
        province_rainfall,
        min_rainfall_per_month[month],
        max_rainfall_per_month[month],
    )

    # Same normalisation for the domestic tourist count.
    domestic_tourism_score = _inverse_min_max(
        num_domestic_tourist,
        min_domestic_tourist_per_month[month],
        max_domestic_tourist_per_month[month],
    )

    # Downward parabola that penalises the low and high ends:
    # 0 at ratio 0, 1 at ratio 50, 0 at ratio 100.
    tourism_ratio_score = -0.0004*(tourism_ratio**2) + (0.04*tourism_ratio)

    overall_score = (
        (rain_score * rainfall_weight)
        + (domestic_tourism_score * domestic_tourist_weight)
        + (tourism_ratio_score * tourism_ratio_weight)
    )

    return overall_score

In [287]:
'''
Create output which will be used for UI. Structure of output JSON is:
{
    "jan": {
        "Bangkok": {
            "score": 0.72,
            "max_rainfall_mm": 11.0,
            "tourism_ratio": 38.4
        },
        "Phuket": { ... },
        ...
    },
    "feb": {
        ...
    },
    ...
}
'''
output = {}

# The province list does not change between months, so read it once.
province_names = provinces['enName'].to_list()

for month in months:
    # create an entry in output for month if it doesn't already exist
    output.setdefault(month, {})
    for province in province_names:
        # A (month, province) pair missing from any of the aggregates raises KeyError.
        max_rainfall_mm = max_rainfall_per_month_per_province.loc[(month, province)]
        num_domestic_tourists = tourism_per_month_per_province.loc[(month, province)]
        # Deliberately not named `tourism_ratio`: that name is the DataFrame the
        # aggregates were built from, and overwriting it would break re-running
        # the earlier cells.
        province_tourism_ratio = tourism_ratio_per_month_per_province.loc[(month, province)]
        output[month][province] = {
            "score": calculate_score(month, max_rainfall_mm, num_domestic_tourists, province_tourism_ratio),
            "max_rainfall_mm": max_rainfall_mm,
            "tourism_ratio": province_tourism_ratio
        }

In [289]:
# The final output is saved to a JSON file to be read and visualized by the UI
import json

# allow_nan=False makes serialisation raise ValueError on NaN/Infinity instead of
# emitting the non-standard `NaN` / `Infinity` tokens, which strict JSON parsers
# reject. Serialising before opening the file also means a failure here leaves
# any existing output file untouched.
output_json = json.dumps(output, allow_nan=False)

# Explicit encoding so the written bytes do not depend on the platform default.
with open("data-driven-travel/static/output.json", "w", encoding="utf-8") as outfile:
    outfile.write(output_json)