# 1. Load the JSON data

In [75]:
import json

import pandas as pd

In [76]:
# Read the raw multi-city weather JSON. The encoding is pinned to UTF-8 so the
# file is decoded the same way on every platform instead of using the locale default.
with open('../data/multicity_historical.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Top-level keys are the city names
data.keys()

dict_keys(['London', 'Singapore', 'Cairo', 'Buenos Aires', 'Mumbai'])

Let's make an empty list to hold dataframes for each city.

In [77]:
# Accumulator that will collect one DataFrame per city
city_dfs = list()

Since we have 5 cities, we can use a for loop to structure each city's data into a dataframe.

In [78]:
# Start from an empty list inside this cell so that re-running it cannot
# append every city a second time (which would double the rows after concat).
city_dfs = []

for city, values in data.items():
    # One DataFrame per city, tagged with a 'city' column holding the city name
    city_df = pd.DataFrame(values).assign(city=city)
    city_dfs.append(city_df)


Let's concatenate all the city dataframes into a single dataframe.

In [79]:
# Stack the per-city frames on top of each other and renumber the rows 0..n-1
df = pd.concat(objs=city_dfs, ignore_index=True)

Display the first few rows to check the structure.

I've used the display() function I searched on Copilot that displays the output in the same table format as W03 Lab NB02 notebooks. 

In [60]:
# Rich-display the first five rows of the combined frame
display(df.head(n=5))

Unnamed: 0,time,precipitation_sum,precipitation_hours,city
0,2023-01-01,4.0,12.0,London
1,2023-01-02,0.2,2.0,London
2,2023-01-03,3.2,14.0,London
3,2023-01-04,0.9,5.0,London
4,2023-01-05,0.1,1.0,London


Display the last few rows to check the structure.

In [61]:
# Rich-display the last five rows of the combined frame
display(df.tail(n=5))

Unnamed: 0,time,precipitation_sum,precipitation_hours,city
1820,2023-12-27,0.0,0.0,Mumbai
1821,2023-12-28,0.0,0.0,Mumbai
1822,2023-12-29,0.0,0.0,Mumbai
1823,2023-12-30,0.0,0.0,Mumbai
1824,2023-12-31,0.0,0.0,Mumbai


I also want to check that there are 1825 entries in total (365 days x 5 cities).

In [62]:
# Row count of the combined frame (first element of its shape)
entries = df.shape[0]
print(entries)

1825


# 2. Calculate Key Metrics for Raininess Analysis

Now that we have obtained the relevant data and sorted it into organised tables, we can calculate the following required metrics. 

We will be using the groupby() function to obtain the data from the corresponding columns.

- Total Rainfall: Sum of precipitation_sum for each city.

- Number of Rainy Days: Days with precipitation_sum > 0

- Average Rain Intensity: Total Rainfall / Number of Rainy Days

- Average Rain Duration: Total precipitation_hours / Number of Rainy Days



## 2.1 Total Rainfall

In [63]:
# Group by 'city' and sum 'precipitation_sum' for each group.
# The result is a Series indexed by city, not a DataFrame, so it has no
# column headers to rename at this point.
total_rainfall = df.groupby('city')['precipitation_sum'].sum()

# Display the result
display(total_rainfall)


city
Buenos Aires     916.3
Cairo             28.3
London           780.7
Mumbai          2048.1
Singapore       2364.7
Name: precipitation_sum, dtype: float64

Above, we can see there's no column header for total_rainfall. I want to add the column header, and based on Copilot, I can use reset_index()

In [64]:
# Yearly precipitation total per city, as a two-column DataFrame:
# reset_index() moves the city index into a regular column, then both
# columns get human-readable headers.
total_rainfall = (
    df.groupby('city')['precipitation_sum']
    .sum()
    .reset_index()
    .rename(columns={'city': 'City', 'precipitation_sum': 'Total Rainfall'})
)

display(total_rainfall)


Unnamed: 0,City,Total Rainfall
0,Buenos Aires,916.3
1,Cairo,28.3
2,London,780.7
3,Mumbai,2048.1
4,Singapore,2364.7


## 2.2 Number of Rainy Days

We introduce the size() function to count the number of occurrences in each group. In this case, we want to count the number of days on which precipitation_sum is greater than 0.

In [65]:
# Keep only the rows whose precipitation_sum is above zero
rainy_days_df = df.loc[df['precipitation_sum'].gt(0)]

# Count the remaining rows per city and label the two resulting columns
rainy_days = (
    rainy_days_df
    .groupby('city')
    .size()
    .reset_index(name='Rainy Days')
    .rename(columns={'city': 'City'})
)

display(rainy_days)

Unnamed: 0,City,Rainy Days
0,Buenos Aires,150
1,Cairo,29
2,London,228
3,Mumbai,157
4,Singapore,350


## 2.3 Average Rain Intensity

Now, we can use the variables we created in 2.1 and 2.2 to calculate the average rain intensity for each city.

In [66]:
# Join the per-city totals with the per-city rainy-day counts
rain_data = total_rainfall.merge(rainy_days, on='City')

# Average rain intensity = total rainfall divided by the number of rainy days
rain_data['Average Rain Intensity'] = rain_data['Total Rainfall'].div(rain_data['Rainy Days'])

display(rain_data)


Unnamed: 0,City,Total Rainfall,Rainy Days,Average Rain Intensity
0,Buenos Aires,916.3,150,6.108667
1,Cairo,28.3,29,0.975862
2,London,780.7,228,3.424123
3,Mumbai,2048.1,157,13.045223
4,Singapore,2364.7,350,6.756286


Neat! Now we have 3/4 metrics represented in the table.

## 2.4 Average Rain Duration

We have to first calculate the total rain duration for each city. Then, calculate the average rain duration per rainy day for each city.

In [67]:
# Total hours of precipitation per city
total_rain_duration = (
    df.groupby('city')['precipitation_hours']
    .sum()
    .reset_index()
)
total_rain_duration.columns = ['City', 'Total Rain Duration']

# Attach the totals to rain_data. Any duration columns left over from a
# previous run of this cell are dropped first; otherwise the merge would
# produce 'Total Rain Duration_x' / '_y' and the lookup below would raise KeyError.
rain_data = (
    rain_data
    .drop(columns=['Total Rain Duration', 'Average Rain Duration'], errors='ignore')
    .merge(total_rain_duration, on='City')
)

# Average rain duration = total precipitation hours divided by the number of rainy days
rain_data['Average Rain Duration'] = rain_data['Total Rain Duration'] / rain_data['Rainy Days']

# Display the result
display(rain_data[['City', 'Average Rain Duration']])

Unnamed: 0,City,Average Rain Duration
0,Buenos Aires,5.7
1,Cairo,2.827586
2,London,6.741228
3,Mumbai,14.050955
4,Singapore,8.522857


## 2.5 Combining all metrics together

Now that we have all our relevant metrics, let's put them all together in one table to help our analysis.

In [68]:
# Rebuild rain_data from the three per-city tables in one chain, then derive
# both per-rainy-day averages from the merged columns.
rain_data = (
    total_rainfall
    .merge(rainy_days, on='City')
    .merge(total_rain_duration, on='City')
    .assign(**{
        'Average Rain Intensity': lambda t: t['Total Rainfall'] / t['Rainy Days'],
        'Average Rain Duration': lambda t: t['Total Rain Duration'] / t['Rainy Days'],
    })
)

# Show the four metrics side by side (the raw 'Total Rain Duration' is left out)
display(
    rain_data[
        ['City', 'Total Rainfall', 'Rainy Days', 'Average Rain Intensity', 'Average Rain Duration']
    ]
)


Unnamed: 0,City,Total Rainfall,Rainy Days,Average Rain Intensity,Average Rain Duration
0,Buenos Aires,916.3,150,6.108667,5.7
1,Cairo,28.3,29,0.975862,2.827586
2,London,780.7,228,3.424123,6.741228
3,Mumbai,2048.1,157,13.045223,14.050955
4,Singapore,2364.7,350,6.756286,8.522857


# 3. Visualisations

In [69]:
# Star import: the plotting cells below call ggplot, aes, geom_*, theme, element_*, labs, etc. unqualified
from lets_plot import *
# NOTE(review): presumably enables HTML rendering of lets-plot figures in the notebook — confirm in the lets-plot docs
LetsPlot.setup_html()

## 3.1 Total Rainfall by City (Bar Chart)

This will allow us to compare the total rainfall for each city over the entire year.

In [70]:
# Two-column view of rain_data with City cast to a categorical
total_rainfall_data = (
    rain_data[['City', 'Total Rainfall']]
    .assign(City=lambda frame: frame['City'].astype('category'))
)

# Bar chart of yearly total rainfall per city, one colour per city, with the
# value printed just above each bar (nudged up by 30 units on the y axis).
# The subtitle names Mumbai and Buenos Aires as runners-up, matching the
# computed totals (Buenos Aires 916.3 mm is ahead of London 780.7 mm).
plot = (
    ggplot(total_rainfall_data, aes(x='City', y='Total Rainfall', fill='City'))
    + geom_bar(stat='identity')
    + geom_text(aes(label='Total Rainfall'), position=position_nudge(y=30), color='black', size=6.5)
    + scale_fill_manual(values=['#92C5F9', '#AFDC8F', '#B6A6E9', '#F8AE54', '#A3A3A3'])
    + ggtitle(
        'Singapore Leads in Total Rainfall Among Major Cities in 2023',
        subtitle='Mumbai and Buenos Aires follow, with Cairo experiencing the least rainfall',
    )
    + theme_minimal()
    # theme() comes after theme_minimal() so these settings are applied on top of it
    + theme(
        axis_title_x=element_text(size=12),
        axis_title_y=element_text(size=12),
        plot_title=element_text(size=18, face="bold", color='#333333'),
        plot_subtitle=element_text(size=14, color="gray"),
        axis_text_x=element_text(size=11),
        axis_text_y=element_text(size=11),
        panel_grid_major_y=element_line(color="lightgray", size=0.5, linetype="dotted"),
        panel_grid_major_x=element_blank(),
        legend_position='none',
    )
    + labs(x='City', y='Total Rainfall (mm)')
)

# Display the plot
plot

## 3.2 Number of Rainy Days by City (Bar Chart)

This bar chart shows the number of days each city experienced measurable rain, providing insight into the frequency of rainy days.


In [71]:
# Two-column view of rain_data with City cast to a categorical
rainy_days_data = rain_data[['City', 'Rainy Days']].astype({'City': 'category'})

# Bar chart of rainy-day counts per city, with the count printed above each bar
plot = (
    ggplot(rainy_days_data, aes(x='City', y='Rainy Days', fill='City'))
    + geom_bar(stat='identity', show_legend=False)
    + geom_text(
        aes(label='Rainy Days'),
        position=position_nudge(y=5.5),
        color='black',
        size=6.5,
    )
    + scale_fill_manual(values=['#92C5F9', '#AFDC8F', '#B6A6E9', '#F8AE54', '#A3A3A3'])
    + ggtitle(
        'Rain or shine? In 2023, Singapore was mostly rain.',
        subtitle='London experienced ~35% less rainy days in comparison to SG.',
    )
    + theme_minimal()
    + theme(
        plot_title=element_text(size=18, face="bold", color='#333333'),
        plot_subtitle=element_text(size=11, color="black"),
        axis_title_x=element_text(size=12),
        axis_title_y=element_text(size=12),
        axis_text_x=element_text(size=11),
        axis_text_y=element_text(size=11),
        panel_grid_major_x=element_blank(),
        panel_grid_major_y=element_line(color="grey", size=0.3, linetype="dotted"),
        legend_position='none',
    )
    + labs(x='City', y='Number of Rainy Days')
)

# Render the figure as the cell output
plot

## 3.3 Monthly Total Rainfall by City (Line Graph)

This line graph shows the monthly rainfall for each city, revealing seasonal trends and allowing comparison of rain patterns over time.

In [72]:
# Make 'time' a datetime column, then derive a monthly period column from it.
# Both columns are written back onto df itself.
df['time'] = pd.to_datetime(df['time'])
df['month'] = df['time'].dt.to_period('M')

# Sum precipitation per (city, month); the period is converted to its string
# form before being handed to the plot as the x value.
monthly_rainfall_data = df.groupby(['city', 'month'])['precipitation_sum'].sum().reset_index()
monthly_rainfall_data['month'] = monthly_rainfall_data['month'].astype(str)

# One line per city (explicit group='city'), with a marker at every month
plot = (
    ggplot(
        monthly_rainfall_data,
        aes(x='month', y='precipitation_sum', color='city', group='city'),
    )
    + geom_line(size=1.5)
    + geom_point(size=3)
    + scale_color_manual(values=['#377eb8', '#ff7f0e', '#4daf4a', '#e41a1c', '#984ea3'])
    + ggtitle(
        'Rainfall Roller Coaster: Peaks and Patterns Across Cities',
        subtitle='Mumbai leads with a sharp monsoon spike while other cities show consistent trend',
    )
    + theme_minimal()
    + theme(
        plot_title=element_text(size=18, face="bold", color='#333333'),
        plot_subtitle=element_text(size=14, color="gray"),
        axis_title_x=element_text(size=12, face="bold"),
        axis_title_y=element_text(size=12, face="bold"),
        axis_text_x=element_text(size=11, angle=45, hjust=1, face="bold"),
        axis_text_y=element_text(size=11, face="bold"),
        panel_grid_major_x=element_blank(),
        panel_grid_major_y=element_line(color="lightgray", size=0.5, linetype="dotted"),
        legend_position='right',
    )
    + labs(x='Month', y='Total Rainfall (mm)', color='City')
)

# Render the figure as the cell output
plot

## 3.4 Average Rain Intensity by City (Bar Chart)

This bar chart compares the average rainfall intensity on rainy days across cities. It helps differentiate between cities that experience frequent light rain versus those with occasional heavy rain.

In [73]:
# Two-column view of rain_data with City cast to a categorical
average_rain_intensity_data = (
    rain_data[['City', 'Average Rain Intensity']].astype({'City': 'category'})
)

# Bar chart of average rainfall per rainy day, with the value printed above each bar
plot = (
    ggplot(average_rain_intensity_data, aes(x='City', y='Average Rain Intensity', fill='City'))
    + geom_bar(stat='identity', show_legend=False)
    + geom_text(
        aes(label='Average Rain Intensity'),
        position=position_nudge(y=0.2),
        color='black',
        size=6.5,
    )
    + scale_fill_manual(values=['#92C5F9', '#AFDC8F', '#B6A6E9', '#F8AE54', '#A3A3A3'])
    + ggtitle(
        'Average Rain Intensity by City (2023)',
        subtitle='A comparative analysis of rain intensity across cities',
    )
    + theme_minimal()
    + theme(
        plot_title=element_text(size=18, face="bold", color='#333333'),
        plot_subtitle=element_text(size=14, color="gray"),
        axis_title_x=element_text(size=12),
        axis_title_y=element_text(size=12),
        axis_text_x=element_text(size=11, hjust=1),
        axis_text_y=element_text(size=11),
        panel_grid_major_x=element_blank(),
        panel_grid_major_y=element_line(color="lightgray", size=0.5, linetype="dotted"),
        legend_position='none',
    )
    + labs(x='City', y='Average Rain Intensity (mm/day)')
)

# Render the figure as the cell output
plot

## 3.5 Rain Intensity vs. Rain Duration by City (Scatter Plot)
This scatter plot explores the relationship between average rain intensity and average rain duration per rainy day, in order to see patterns in rain behavior across cities.

In [80]:
# Three-column view of rain_data with City cast to a categorical
rain_intensity_duration_data = (
    rain_data[['City', 'Average Rain Intensity', 'Average Rain Duration']]
    .assign(City=lambda frame: frame['City'].astype('category'))
)

# Scatter of average intensity against average duration, one point per city.
# The colour mapping lives on the point layer only, so the trend-line layer
# inherits just x and y and carries no per-city aesthetic that could split
# the linear fit into one-point groups.
plot = (
    ggplot(
        rain_intensity_duration_data,
        aes(x='Average Rain Intensity', y='Average Rain Duration'),
    )
    + geom_point(aes(color='City'), size=5, alpha=0.7)
    + geom_smooth(method='lm', se=False, linetype='dashed', color='gray')
    + scale_color_manual(values=['#92C5F9', '#AFDC8F', '#B6A6E9', '#F8AE54', '#A3A3A3'])
    + ggtitle(
        'Rain Intensity vs Rain Duration by City (2023)',
        subtitle='A comparative analysis of rain intensity and duration across cities',
    )
    + theme_minimal()
    # theme() comes after theme_minimal() so these settings are applied on top of it
    + theme(
        axis_title_x=element_text(size=12),
        axis_title_y=element_text(size=12),
        plot_title=element_text(size=18, face="bold", color='#333333'),
        plot_subtitle=element_text(size=14, color="gray"),
        axis_text_x=element_text(size=11),
        axis_text_y=element_text(size=11),
        panel_grid_major_y=element_line(color="lightgray", size=0.5, linetype="dotted"),
        panel_grid_major_x=element_line(color="lightgray", size=0.5, linetype="dotted"),
        legend_position='right',
    )
    + labs(x='Average Rain Intensity (mm/day)', y='Average Rain Duration (hours)', color='City')
)

# Display the plot
plot