# 1. Load the JSON data

In [1]:
import json

import pandas as pd

  from pandas.core import (


In [2]:
# Read the multi-city JSON file. The encoding is pinned to UTF-8 so that
# decoding does not depend on the platform's default locale encoding.
with open('../data/multicity_historical.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Show the top-level keys of the loaded dictionary (one key per city)
data.keys()

dict_keys(['London', 'Singapore', 'Cairo', 'Buenos Aires', 'Mumbai'])

Let's make an empty list to hold dataframes for each city.

In [3]:
city_dfs = list()  # accumulator for the per-city DataFrames built in the next cell

Since we have 5 cities, we can use a `for` loop to structure each city's data into its own dataframe.

In [4]:
# Start from an empty list every time this cell runs, so that re-running it
# does not append the same cities a second time and duplicate rows later on.
city_dfs = []

for city, values in data.items():
    # Build one DataFrame from this city's values
    city_df = pd.DataFrame(values)
    city_df['city'] = city  # Add a column for the city name
    city_dfs.append(city_df)

Let's concatenate all the city dataframes into a single dataframe.

In [5]:
# Stack the per-city frames vertically and renumber the rows from 0
df = pd.concat(objs=city_dfs, axis=0, ignore_index=True)

In [6]:
# Check the type of the concatenated object
type(df)

pandas.core.frame.DataFrame

Display the first few rows to check the structure.

I've used the `display()` function, which I found through Copilot; it renders the output in the same table format as the W03 Lab NB02 notebooks.

In [7]:
# Show the first five rows
display(df.head(n=5))

Unnamed: 0,time,precipitation_sum,precipitation_hours,city
0,2023-01-01,4.0,12.0,London
1,2023-01-02,0.2,2.0,London
2,2023-01-03,3.2,14.0,London
3,2023-01-04,0.9,5.0,London
4,2023-01-05,0.1,1.0,London


Display the last few rows to check the structure.

In [8]:
# Show the last five rows
display(df.tail(n=5))

Unnamed: 0,time,precipitation_sum,precipitation_hours,city
1820,2023-12-27,0.0,0.0,Mumbai
1821,2023-12-28,0.0,0.0,Mumbai
1822,2023-12-29,0.0,0.0,Mumbai
1823,2023-12-30,0.0,0.0,Mumbai
1824,2023-12-31,0.0,0.0,Mumbai


I also want to check that there are 1825 entries in total (365 days x 5 cities).

In [9]:
# Number of rows in the combined frame (first element of its shape)
entries = df.shape[0]
print(entries)

1825


# 2. Calculate Key Metrics for Raininess Analysis

Now that we have obtained the relevant data and sorted it into organised tables, we can calculate the following required metrics. 

We will be using the groupby() function to aggregate the data in the relevant columns for each city.

- Total Rainfall: Sum of precipitation_sum for each city.

- Number of Rainy Days: Days with precipitation_sum > 0

- Average Rain Intensity: Total Rainfall / Number of Rainy Days




## 2.1 Total Rainfall

This metric gives us a sense of the absolute amount of rainfall occurring in each city which will be helpful to analyse the **rainfall volume** in our plot for Monthly Total Rainfall.

In [10]:
# Sum 'precipitation_sum' within each city, then give the two resulting
# columns presentation-friendly names (city name, summed precipitation).
total_rainfall = (
    df.groupby('city')['precipitation_sum']
    .sum()
    .reset_index()
    .rename(columns={'city': 'City', 'precipitation_sum': 'Total Rainfall'})
)

# Show the per-city totals
display(total_rainfall)


Unnamed: 0,City,Total Rainfall
0,Buenos Aires,916.3
1,Cairo,28.3
2,London,780.7
3,Mumbai,2048.1
4,Singapore,2364.7


## 2.2 Number of Rainy Days

We introduce the size() function to count the number of occurrences in each group. In this case, we want to count the number of days on which precipitation_sum is greater than 0.

This metric will give us a sense of the **frequency** and helps put London’s raininess in perspective compared to other cities.

In [11]:
# Filter the DataFrame to include only rows where precipitation_sum > 0
rainy_days_df = df[df['precipitation_sum'] > 0]

# Every city present in the data, in the same alphabetical order groupby uses.
# A city with no rainy day at all has no rows left after the filter above, so
# without this list it would silently disappear from the count (and from any
# later merge on 'City').
all_cities = sorted(df['city'].unique())

# Group by 'city', count the rainy days, and report 0 for cities with none
rainy_days = (
    rainy_days_df.groupby('city')
    .size()
    .reindex(all_cities, fill_value=0)
    .reset_index()
)

# Rename the columns for clarity
rainy_days.columns = ['City', 'Rainy Days']

# Display the result
display(rainy_days)

Unnamed: 0,City,Rainy Days
0,Buenos Aires,150
1,Cairo,29
2,London,228
3,Mumbai,157
4,Singapore,350


## 2.3 Average Rain Intensity

Now, we can use the variables we computed in 2.1 and 2.2 to calculate the average rain intensity for each city.

This metric will give us a sense of the **lightness/heaviness** of the rain in London as compared to other cities.

In [12]:
# Join the two per-city summaries on 'City', derive the average rainfall per
# rainy day, and attach units to the column names.
rain_data = (
    total_rainfall
    .merge(rainy_days, on='City')
    .assign(
        average_rain_intensity=lambda merged: merged['Total Rainfall'] / merged['Rainy Days']
    )
    .rename(columns={
        'Total Rainfall': 'Total Rainfall (mm)',
        'average_rain_intensity': 'Average Rain Intensity (mm/day)',
    })
)

# Show the combined table
display(rain_data)


Unnamed: 0,City,Total Rainfall (mm),Rainy Days,Average Rain Intensity (mm/day)
0,Buenos Aires,916.3,150,6.108667
1,Cairo,28.3,29,0.975862
2,London,780.7,228,3.424123
3,Mumbai,2048.1,157,13.045223
4,Singapore,2364.7,350,6.756286


# 3. Visualisations

In [13]:
# Wildcard import: the plotting names used in the cells below (ggplot, aes,
# geom_*, scale_*, theme, element_*, labs, ggsize, ...) all come from here.
from lets_plot import *
# Configure lets-plot for HTML output in the notebook (presumably must run
# before any figure is displayed -- confirm against the lets-plot docs).
LetsPlot.setup_html()

# NOTE(review): `datetime` is not referenced in any cell visible in this notebook.
from datetime import datetime

## 3.1 Number of Rainy Days (Bar Chart)

This metric shows the total number of days with rainfall throughout the year in each city.
 
**Rationale: A higher number of rainy days can contribute to the perception of a 'rainy' city, even if the rain is light.**



In [47]:
# Keep only the plotted columns, make 'City' categorical, and sort the rows
# in ascending order of rainy-day count.
rainy_days_data = (
    rain_data[['City', 'Rainy Days']]
    .assign(City=lambda frame: frame['City'].astype('category'))
    .sort_values('Rainy Days')
)

# Figure 1: one bar per city, labelled with its number of rainy days
plot = (
    ggplot(rainy_days_data, aes(x='City', y='Rainy Days', fill='Rainy Days'))
    + geom_bar(stat='identity', show_legend=False)
    # Value labels, nudged upwards by 5.5 units on the y axis
    + geom_text(
        aes(label='Rainy Days'),
        position=position_nudge(y=5.5),
        color='black',
        size=10,
    )
    + scale_fill_manual(values=['#92C5F9', '#4394E5', '#0066CC', '#004D99', '#003366'])
    + ggtitle(
        'Figure 1: Singapore Had More Rainy Days Than London',
        subtitle='London experienced 35% fewer rainy days compared to Singapore.',
    )
    + theme_minimal()
    + theme(
        # Titles
        plot_title=element_text(size=30, face='bold', color='#333333'),
        plot_subtitle=element_text(size=20, color='grey'),
        # Axes
        axis_title_x=element_text(size=20),
        axis_title_y=element_text(size=20),
        axis_text_x=element_text(size=20),
        axis_text_y=element_text(size=20),
        # Grid: dotted horizontal lines only
        panel_grid_major_y=element_line(color='grey', size=0.3, linetype='dotted'),
        panel_grid_major_x=element_blank(),
        legend_position='none',
    )
    + labs(x='City', y='Number of Rainy Days')
    + ggsize(1200, 800)
)

# Render the figure as the cell output
plot


## 3.2 Monthly Total Rainfall by City (Line Graph)

This metric displays the total amount of rainfall each month, highlighting seasonal patterns. 

**Rationale: It helps compare the actual volume of rain London receives compared to other cities, providing context to its 'rainy' reputation.**

In [46]:
# Parse the 'time' strings into datetimes and derive a monthly period column
# (both columns are written back onto df).
df['time'] = pd.to_datetime(df['time'])
df['month'] = df['time'].dt.to_period('M')

# Total rainfall per city and month; the period is turned into a string so it
# can be used as a discrete x value.
monthly_rainfall_data = (
    df.groupby(['city', 'month'])['precipitation_sum']
    .sum()
    .reset_index()
    .assign(month=lambda monthly: monthly['month'].astype(str))
)

# Figure 2: one line per city, distinguished by both colour and line type
plot = (
    ggplot(
        monthly_rainfall_data,
        aes(x='month', y='precipitation_sum', color='city', group='city', linetype='city'),
    )
    # Thick lines so they stay readable in the plot and in the legend
    + geom_line(size=2.5)
    # Colours: black, pink, orange, green, red
    + scale_color_manual(values=['#000000', '#e7298a', '#d95f02', '#1b9e77', '#FF0000'])
    # A different line type for each city
    + scale_linetype_manual(values=['solid', 'dashed', 'dotted', 'dotdash', 'twodash'])
    + ggtitle(
        'Figure 2: Mumbai exhibits extreme rainfall peaks',
        subtitle='Steady, lower rainfall trends observed in London.',
    )
    + theme_minimal()
    + theme(
        # Titles
        plot_title=element_text(size=30, face='bold', color='#333333'),
        plot_subtitle=element_text(size=20, color='gray'),
        # Axes
        axis_title_x=element_text(size=20, face='bold'),
        axis_title_y=element_text(size=20, face='bold'),
        axis_text_x=element_text(size=20, hjust=1, face='bold'),
        axis_text_y=element_text(size=20, face='bold'),
        # Grid: dotted horizontal lines only
        panel_grid_major_y=element_line(color='lightgray', size=0.5, linetype='dotted'),
        panel_grid_major_x=element_blank(),
        # Legend on the right with enlarged title and text
        legend_position='right',
        legend_title=element_text(size=24),
        legend_text=element_text(size=24),
    )
    + scale_x_discrete(
        name='Month',
        labels=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    )
    + labs(x='Month', y='Total Rainfall (mm)', color='City', linetype='City')
    + ggsize(1700, 1000)
)

# Render the figure as the cell output
plot







## 3.3 Average Rain Intensity by City (Bar Graph)

This metric shows the average amount of rain per rainy day, indicating the intensity of rainfall. 

**Rationale: A lower intensity suggests lighter, more frequent rain, whereas higher intensity indicates heavier downpours on rainy days.**

In [48]:
# Keep only the plotted columns, make 'City' categorical, and sort the rows
# in ascending order of average rain intensity.
average_rain_intensity_data = (
    rain_data[['City', 'Average Rain Intensity (mm/day)']]
    .assign(City=lambda frame: frame['City'].astype('category'))
    .sort_values('Average Rain Intensity (mm/day)')
)

# Figure 3: one bar per city, labelled with its intensity rounded to 2 decimals
plot = (
    ggplot(
        average_rain_intensity_data,
        aes(x='City', y='Average Rain Intensity (mm/day)', fill='Average Rain Intensity (mm/day)'),
    )
    + geom_bar(stat='identity', show_legend=False)
    # Value labels, nudged upwards by 0.25 units so they sit close to the bars
    + geom_text(
        aes(label=average_rain_intensity_data['Average Rain Intensity (mm/day)'].round(2)),
        position=position_nudge(y=0.25),
        color='black',
        size=10,
    )
    + scale_fill_manual(values=['#92C5F9', '#4394E5', '#0066CC', '#004D99', '#003366'])
    + ggtitle(
        'Figure 3: Mumbai had the Highest Average Rain Intensity',
        subtitle='London’s average rain intensity is 74% lower than Mumbai, highlighting a significant difference.',
    )
    + theme_minimal()
    + theme(
        # Titles
        plot_title=element_text(size=30, face='bold', color='#333333'),
        plot_subtitle=element_text(size=20, color='grey'),
        # Axes
        axis_title_x=element_text(size=20),
        axis_title_y=element_text(size=20),
        axis_text_x=element_text(size=20),
        axis_text_y=element_text(size=20),
        # Grid: dotted horizontal lines only
        panel_grid_major_y=element_line(color='grey', size=0.3, linetype='dotted'),
        panel_grid_major_x=element_blank(),
        legend_position='none',
    )
    + labs(x='City', y='Average Rain Intensity (mm/day)')
    + ggsize(1200, 800)
)

# Render the figure as the cell output
plot


# 4. Overall Conclusion 

Null and Alternative Hypothesis. Reject or fail to reject null based on conclusions.

Fig 1: 

Fig 2:

Fig 3: 