# Data Analysis

In [2]:
import json 
import pandas as pd
from lets_plot import *
LetsPlot.setup_html()


In [4]:
all_data = pd.read_csv("rainfall_data.csv")

### Has London been getting rainier since 1990 (compared to other countries)

# WARNINGS HELP

In [73]:
# Calculating the annual rainfall per city per year
all_data['Date'] = pd.to_datetime(all_data['Date'])
all_data['Year'] = all_data['Date'].dt.year
annual_rainfall = all_data.groupby(['City', 'Year']).agg(
    annual_rainfall=('Rain Sum', 'sum')
).reset_index()

# Filter data for the years 1970 and 2020
rainfall_1970 = annual_rainfall[annual_rainfall['Year'] == 1970][['City', 'annual_rainfall']].rename(columns={'annual_rainfall': 'Rainfall_1970'})
rainfall_2020 = annual_rainfall[annual_rainfall['Year'] == 2020][['City', 'annual_rainfall']].rename(columns={'annual_rainfall': 'Rainfall_2020'})

# Merge the two years' data on the 'City' column
rainfall_diff = pd.merge(rainfall_1970, rainfall_2020, on='City')

# Calculate the difference in rainfall between 2020 and 1970
rainfall_diff['Rainfall_Difference'] = rainfall_diff['Rainfall_2020'] - rainfall_diff['Rainfall_1970']

# Plot the data
ggplot(rainfall_diff, aes(x='City', y='Rainfall_Difference')) + \
geom_bar(stat='identity', fill='#1F627D') + \
ggsize(800,500) + \
labs(y='Change in Annual Rainfall (mm)',
     fill='Change',
     title= "London's change in Annual Rainfall between 1970 and 2020 is minimal",
     subtitle= 'Cities like Mumbai may now be rainier than London and historically cities like Bogota may have been rainier') + \
 theme(
        plot_title=element_text(size=16, hjust=0.5, margin=margin(b=10)),  # Margin for title
        plot_subtitle=element_text(size=14, color='grey', face='italic', hjust=0.5, margin=margin(b=80)),  # Margin for subtitle
        plot_margin=margin(t=30, r=40, b=30, l=3)  # Overall margins
    ) 




WARN: The margin() is deprecated and will be removed in future releases.
      Please, use a number or list of numbers to specify margins (see description of the parameter used).
WARN: The margin() is deprecated and will be removed in future releases.
      Please, use a number or list of numbers to specify margins (see description of the parameter used).
WARN: The margin() is deprecated and will be removed in future releases.
      Please, use a number or list of numbers to specify margins (see description of the parameter used).


In [69]:
# Filter data for London in 1970 and 2020
london_1970 = all_data[(all_data['City'] == 'London') & (all_data['Date'].dt.year == 1970)].copy()
london_2020 = all_data[(all_data['City'] == 'London') & (all_data['Date'].dt.year == 2020)].copy()

# Add a 'Year' column to distinguish each dataset in the plot
london_1970['Year'] = '1970'
london_2020['Year'] = '2020'

# Combine data and extract month from Date
combined_data = pd.concat([london_1970, london_2020])
combined_data['Month'] = combined_data['Date'].dt.strftime('%B')  # Gets month name

# Group by month and year, then sum rainfall for each month
monthly_data = combined_data.groupby(['Year', 'Month'], as_index=False)['Rain Sum'].sum()

# Sort months in calendar order for consistent plotting
months_order = ["January", "February", "March", "April", "May", "June", 
                "July", "August", "September", "October", "November", "December"]
monthly_data['Month'] = pd.Categorical(monthly_data['Month'], categories=months_order, ordered=True)
monthly_data = monthly_data.sort_values('Month')

# Initialize Let's Plot
LetsPlot.setup_html()

# Create the plot
plot = (ggplot(monthly_data) +
        geom_line(aes(x='Month', y='Rain Sum', color='Year', group='Year'), size=1) +
        geom_point(aes(x='Month', y='Rain Sum', color='Year'), size=4) +
        labs(title='Monthly Rainfall in London for 1970 and 2020',
             x='Month',
             y='Total Rainfall (mm)',
             color='Year') +
        theme(legend_position='top'))

plot


In [24]:

# Sample data structure: replace this with your actual data
df = pd.DataFrame(all_data)

# Convert the 'Date' column to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Filter the DataFrame for the years 2010 to 2020
rec_rainfall_df = df[(df['Date'].dt.year >= 2010) & (df['Date'].dt.year <= 2020)]

# Group by country and year, summing daily rainfall
annual_rainfall = rec_rainfall_df.groupby(['City', rec_rainfall_df['Date'].dt.year]).agg(
    Annual_Rainfall=('Rain Sum', 'sum')
).reset_index()

# Rename columns for clarity
annual_rainfall.rename(columns={'Date': 'Year'}, inplace=True)

# Calculate average annual rainfall
average_annual_rainfall = annual_rainfall.groupby('City')['Annual_Rainfall'].mean().reset_index()
average_annual_rainfall.rename(columns={'Annual_Rainfall': 'Average_Annual_Rainfall'}, inplace=True)

city_coords = [
    ("London", 51.5072, -0.1276),  
    ("Manchester", 53.4808, -2.2426),  
    ("Edinburgh", 55.9533, -3.1883),
    ("Cork", 51.8985, -8.4756),
    ("Paris", 48.8575, 2.3514),
    ("Rome", 41.8967, 12.4822),
    ("Seattle", 47.6061, -122.3328),    
    ("Bogota", 4.7110, -74.0721),  
    ("Cairo", 30.0444, 31.2357),
    ("Cape Town", -33.9221, 18.4231),
    ("Mumbai", 19.0760, 72.8777)
]

# Convert city_coords to DataFrame
coords_df = pd.DataFrame(city_coords, columns=['City', 'Latitude', 'Longitude'])

# Merge the average annual rainfall DataFrame with city coordinates
merged_rainfall_df = average_annual_rainfall.merge(coords_df)


# Display the average annual rainfall
print(merged_rainfall_df)


          City  Average_Annual_Rainfall  Latitude  Longitude
0       Bogota              1103.236364    4.7110   -74.0721
1        Cairo                24.172727   30.0444    31.2357
2    Cape Town               529.809091  -33.9221    18.4231
3         Cork               942.654545   51.8985    -8.4756
4    Edinburgh               831.218182   55.9533    -3.1883
5       London               660.145455   51.5072    -0.1276
6   Manchester              1034.272727   53.4808    -2.2426
7       Mumbai              2092.990909   19.0760    72.8777
8        Paris               662.209091   48.8575     2.3514
9         Rome               900.027273   41.8967    12.4822
10     Seattle              1272.590909   47.6061  -122.3328


# ADD SUBTITLE HELP / change legend name without underscores


In [173]:
def plot_city_map(city_data, point_colour='blue', 
                  title="London's mean rainfall between 2010 and 2020 is less than the majority of cities", 
                  title_size=14, 
                  subtitle="London ranks 7th in terms of most annual rainfall out of the chosen cities", 
                  subtitle_size=10):
   
    # Plotting the points on a map
    plot = (
        ggplot() +
        geom_livemap() +
        geom_point(aes(x='Longitude', y='Latitude', size='Average_Annual_Rainfall'), 
                   colour=point_colour, 
                   show_legend=True, 
                   data=city_data) +
        ggtitle(title) +
        theme_minimal() + 
        theme(plot_title=element_text(size=title_size, hjust=0.5))
    )
    
    return plot

city_map_plot = plot_city_map(merged_rainfall_df, point_colour='#1F627D')
city_map_plot

# Call the function to plot the city map with average rainfall



In [171]:
# Step 1: Filter the data for the two time periods
data_1970_1980 = all_data[(all_data['Date'] >= '1970-01-01') & (all_data['Date'] < '1980-01-01')]
data_2010_2020 = all_data[(all_data['Date'] >= '2010-01-01') & (all_data['Date'] < '2020-01-01')]

# Step 2: Calculate the mean annual rainfall for each city for both periods
mean_rainfall_1970_1980 = data_1970_1980.groupby('City')['Precipitation Sum'].mean().reset_index()
mean_rainfall_2010_2020 = data_2010_2020.groupby('City')['Precipitation Sum'].mean().reset_index()

# Step 3: Rename columns for clarity and add a 'Time Period' column
mean_rainfall_1970_1980.rename(columns={'Precipitation Sum': 'Mean Rainfall'}, inplace=True)
mean_rainfall_1970_1980['Time Period'] = '1970-1980'

mean_rainfall_2010_2020.rename(columns={'Precipitation Sum': 'Mean Rainfall'}, inplace=True)
mean_rainfall_2010_2020['Time Period'] = '2010-2020'

# Step 4: Combine the two DataFrames
mean_rainfall_combined = pd.concat([mean_rainfall_1970_1980, mean_rainfall_2010_2020])
print(mean_rainfall_combined)

# Plot directly

# Create the bar chart
bar_chart = (
    ggplot(mean_rainfall_combined, aes(x='City', y='Mean Rainfall', fill='Time Period')) +
    geom_bar(stat='identity', position='dodge') +
    labs(title='Mean Average Rainfall for Two Time Periods',
         x='City',
         y='Mean Average Rainfall (mm)') +
    theme_minimal() +
    scale_fill_manual(values={'1970-1980': '#9EBECF', '2010-2020': '#0875AE'})
)

bar_chart




          City  Mean Rainfall Time Period
0       Bogota       7.000986   1970-1980
1        Cairo       0.078094   1970-1980
2    Cape Town       1.216347   1970-1980
3         Cork       2.256216   1970-1980
4    Edinburgh       1.970071   1970-1980
5       London       1.789540   1970-1980
6   Manchester       2.600219   1970-1980
7       Mumbai       4.787486   1970-1980
8        Paris       1.971276   1970-1980
9         Rome       2.961720   1970-1980
10     Seattle       4.078094   1970-1980
0       Bogota       3.105038   2010-2020
1        Cairo       0.053012   2010-2020
2    Cape Town       1.447645   2010-2020
3         Cork       2.509830   2010-2020
4    Edinburgh       2.350137   2010-2020
5       London       1.824233   2010-2020
6   Manchester       2.852711   2010-2020
7       Mumbai       5.544715   2010-2020
8        Paris       1.871550   2010-2020
9         Rome       2.485706   2010-2020
10     Seattle       3.586008   2010-2020


In [172]:
# Assuming 'Precipitation Hours' is the name of the column you want
# Select the relevant columns including 'Precipitation Hours'
selected_data = all_data[['City', 'Precipitation Hours']]

# Group by 'City' and sum the 'Precipitation Hours'
Total_Precipitation_Hours = selected_data.groupby('City', as_index=False).agg(Total_Precipitation_Hours=('Precipitation Hours', 'sum'))

# Create the bar chart
ggplot(Total_Precipitation_Hours, aes(x='City', y='Total_Precipitation_Hours')) + \
    geom_bar(stat='identity', fill='#1F627D') + \
    labs(title='London does not have a significant amount of hours of precipitation',
         y='Total Precipitation Hours',
         x='City') 


# THOUGHTS ON THE GAP BETWEEN THE TWO TIME PERIODS BEING SHOWN ON THE GRAPH 


In [57]:
# Melt data for distinct color and legend handling
melted_data = london_data.melt(id_vars=['Date'], value_vars=['Cumulative_Rain_Sum', 'Cumulative_Precipitation_Sum'],
                               var_name='Line_Type', value_name='Cumulative_Sum')

# Plot with legend
plot = (ggplot(melted_data) +
        geom_line(aes(x='Date', y='Cumulative_Sum', color='Line_Type'), size=1.5) +
        labs(title='Cumulative Sum of Rain and Precipitation in London',
             x='Date',
             y='Cumulative Sum (mm)',
             color='Legend') +
        theme(legend_position='bottom') +
        scale_color_manual(values={'Cumulative_Rain_Sum': '#5C9CDA', 'Cumulative_Precipitation_Sum': '#6C658E'},
        ))

plot


In [64]:
# Filter data for London in 2020
london_2020 = all_data[(all_data['City'] == 'London') & (all_data['Date'].dt.year == 2020)].copy()

# Calculate cumulative sums
london_2020['Cumulative_Rain_Sum'] = london_2020['Rain Sum'].cumsum()
london_2020['Cumulative_Precipitation_Sum'] = london_2020['Precipitation Sum'].cumsum()

# Melt the DataFrame for plotting
melted_data = london_2020.melt(id_vars=['Date'], value_vars=['Cumulative_Rain_Sum', 'Cumulative_Precipitation_Sum'],
                                var_name='Line_Type', value_name='Cumulative_Sum')

print(melted_data)

# Plot the cumulative rain sum with a legend
plot = (ggplot(melted_data) +
        geom_line(aes(x='Date', y='Cumulative_Sum', color='Line_Type'), size=1) +
        ggsize(500, 400) +
        labs(title='The Majority of Precipitation in London is from Rainfall (2020)',
             x='Date',
             y='Cumulative Sum of Rain/Precipitation in London (mm)',
             color='Legend') +
        theme(legend_position='top') +
        scale_color_manual(values={'Cumulative_Rain_Sum': '#5C9CDA', 'Cumulative_Precipitation_Sum': '#6C658E'}))

plot


          Date                     Line_Type  Cumulative_Sum
0   2020-01-01           Cumulative_Rain_Sum             0.0
1   2020-01-02           Cumulative_Rain_Sum             0.0
2   2020-01-03           Cumulative_Rain_Sum             3.1
3   2020-01-04           Cumulative_Rain_Sum             3.1
4   2020-01-05           Cumulative_Rain_Sum             3.1
..         ...                           ...             ...
727 2020-12-27  Cumulative_Precipitation_Sum           764.2
728 2020-12-28  Cumulative_Precipitation_Sum           764.5
729 2020-12-29  Cumulative_Precipitation_Sum           764.8
730 2020-12-30  Cumulative_Precipitation_Sum           764.8
731 2020-12-31  Cumulative_Precipitation_Sum           764.8

[732 rows x 3 columns]


In [136]:
# Create a new column for Year and Month-Day for grouping
all_data['Year'] = all_data['Date'].dt.year
all_data['Month_Day'] = all_data['Date'].dt.strftime('%m-%d')

# Filter and aggregate for the period 1970-1980
rain_1970_1980 = all_data[(all_data['Year'] >= 1970) & (all_data['Year'] < 1980)]
rain_1970_1980_daily = rain_1970_1980.groupby(['City', 'Month_Day'])['Rain Sum'].sum().reset_index()
rain_1970_1980_daily['Cumulative_Rain'] = rain_1970_1980_daily.groupby('City')['Rain Sum'].cumsum()
rain_1970_1980_daily['Period'] = '1970-1980'

# Filter and aggregate for the period 2010-2020
rain_2010_2020 = all_data[(all_data['Year'] >= 2010) & (all_data['Year'] < 2020)]
rain_2010_2020_daily = rain_2010_2020.groupby(['City', 'Month_Day'])['Rain Sum'].sum().reset_index()
rain_2010_2020_daily['Cumulative_Rain'] = rain_2010_2020_daily.groupby('City')['Rain Sum'].cumsum()
rain_2010_2020_daily['Period'] = '2010-2020'

# Combine the two DataFrames
combined_data = pd.concat([rain_1970_1980_daily, rain_2010_2020_daily])

custom_colors = {
    'Bogota': '#8AB58B',  # Replace 'City1' with the actual city name
    'Cairo': '#35B560',  # Replace 'City2' with the actual city name
    'Cape Town': '#114E13',
    'Cork': '#4A685D',
    'Edinburgh': '#04B18C',
    'London': '#FF0004',
    'Manchester': '#BFDCD5',
    'Mumbai': '#0AD4D4',
    'Paris': '#61BBBB',
    'Rome': '#2D5671',
    'Seattle': '#04395C' # Add more cities and colors as needed
}
# Create the plot with facets and separate lines for each city
(
    ggplot(data=combined_data, mapping=aes(x='Month_Day', y='Cumulative_Rain', color='City')) +
    geom_line(size=1) +  # Only lines
    geom_point(size=0) +      # Points at each data point
    ggsize(1200,500) +
    labs(title='Cumulative Rainfall from 1970-1980 and 2010-2020',
         x='Day-Month',
         y='Cumulative Rain (mm)') +
    facet_wrap('Period', ncol=2, dir='v') +
    theme(legend_position='top',
         legend_text=element_text(size=10),
         plot_title=element_text(hjust=0.5)) +          
    scale_color_manual(values=custom_colors)  
)


In [193]:
# Ensure 'Date' is in datetime format
all_data['Date'] = pd.to_datetime(all_data['Date'])

# Create 'Month' column from the 'Date' column
all_data['Month'] = all_data['Date'].dt.month

# Now filter and group by City and Month for 2024
data_2024 = all_data[all_data['Year'] == 2020]

# Group by City and Month to sum rain
monthly_rain = data_2024.groupby(['City', 'Month'])['Rain Sum'].sum().reset_index()

# Ensure the 'Month' column is treated as a categorical variable
monthly_rain['Month'] = monthly_rain['Month'].astype('category')

# Create the stacked bar chart with custom colors for each month
ggplot(monthly_rain, aes(x='City', y='Rain Sum', fill='Month'))   # Use 'Month' directly
geom_bar(stat='identity', position='stack')  # Stacked bars
ggsize(600,800)
scale_fill_manual(values=month_colors, name="Month")  # Assign colors based on month
labs(title="Monthly Rain Sum by City for 2024",
     x="City",
     y="Total Rain Sum (mm)") 
theme_minimal() 
theme(axis_text_x=element_text( hjust=1),
     plot_title=element_text(size=20, face='bold'),
     axis_title=element_text(size=14))





<lets_plot.plot.core.FeatureSpec at 0x7f1f866da010>