# Data Analysis

In [2]:
import json 
import pandas as pd
from lets_plot import *
LetsPlot.setup_html()


### Has London been getting rainier since 1990 (compared to other countries)

In [3]:
import pandas as pd
from lets_plot import *
LetsPlot.setup_html()




In [4]:
all_data = pd.read_csv("rainfall_data.csv")

# WARNINGS HELP

In [23]:
# Calculating the annual rainfall per city per year
all_data['Date'] = pd.to_datetime(all_data['Date'])
all_data['Year'] = all_data['Date'].dt.year
annual_rainfall = all_data.groupby(['City', 'Year']).agg(
    annual_rainfall=('Rain Sum', 'sum')
).reset_index()
# Filtering for just 1970 and 2020


# Filter data for the years 1970 and 2020
rainfall_1970 = annual_rainfall[annual_rainfall['Year'] == 1970][['City', 'annual_rainfall']].rename(columns={'annual_rainfall': 'Rainfall_1970'})
rainfall_2020 = annual_rainfall[annual_rainfall['Year'] == 2020][['City', 'annual_rainfall']].rename(columns={'annual_rainfall': 'Rainfall_2020'})

# Merge the two years' data on the 'City' column
rainfall_diff = pd.merge(rainfall_1970, rainfall_2020, on='City')

# Calculate the difference in rainfall between 2020 and 1970
rainfall_diff['Rainfall_Difference'] = rainfall_diff['Rainfall_2020'] - rainfall_diff['Rainfall_1970']

# Plot the data
ggplot(rainfall_diff, aes(x='City', y='Rainfall_Difference')) + \
geom_bar(stat='identity', fill='#1F627D') + \
ggsize(800,500) + \
labs(y='Change in Annual Rainfall (mm)',
     fill='Change',
     title= "London's change in Annual Rainfall between 1970 and 2020 is minimal",
     subtitle= 'Cities like Mumbai may now be rainier than London and historically cities like Bogota may have been rainier') + \
 theme(
        plot_title=element_text(size=14, hjust=0.5, margin=margin(b=10)),  # Margin for title
        plot_subtitle=element_text(size=12, color='grey', face='italic', hjust=0.5, margin=margin(b=80)),  # Margin for subtitle
        plot_margin=margin(t=30, r=40, b=30, l=3)  # Overall margins
    ) 




WARN: The margin() is deprecated and will be removed in future releases.
      Please, use a number or list of numbers to specify margins (see description of the parameter used).
WARN: The margin() is deprecated and will be removed in future releases.
      Please, use a number or list of numbers to specify margins (see description of the parameter used).
WARN: The margin() is deprecated and will be removed in future releases.
      Please, use a number or list of numbers to specify margins (see description of the parameter used).


In [24]:

# Sample data structure: replace this with your actual data
df = pd.DataFrame(all_data)

# Convert the 'Date' column to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Filter the DataFrame for the years 2010 to 2020
rec_rainfall_df = df[(df['Date'].dt.year >= 2010) & (df['Date'].dt.year <= 2020)]

# Group by country and year, summing daily rainfall
annual_rainfall = rec_rainfall_df.groupby(['City', rec_rainfall_df['Date'].dt.year]).agg(
    Annual_Rainfall=('Rain Sum', 'sum')
).reset_index()

# Rename columns for clarity
annual_rainfall.rename(columns={'Date': 'Year'}, inplace=True)

# Calculate average annual rainfall
average_annual_rainfall = annual_rainfall.groupby('City')['Annual_Rainfall'].mean().reset_index()
average_annual_rainfall.rename(columns={'Annual_Rainfall': 'Average_Annual_Rainfall'}, inplace=True)

city_coords = [
    ("London", 51.5072, -0.1276),  
    ("Manchester", 53.4808, -2.2426),  
    ("Edinburgh", 55.9533, -3.1883),
    ("Cork", 51.8985, -8.4756),
    ("Paris", 48.8575, 2.3514),
    ("Rome", 41.8967, 12.4822),
    ("Seattle", 47.6061, -122.3328),    
    ("Bogota", 4.7110, -74.0721),  
    ("Cairo", 30.0444, 31.2357),
    ("Cape Town", -33.9221, 18.4231),
    ("Mumbai", 19.0760, 72.8777)
]

# Convert city_coords to DataFrame
coords_df = pd.DataFrame(city_coords, columns=['City', 'Latitude', 'Longitude'])

# Merge the average annual rainfall DataFrame with city coordinates
merged_rainfall_df = average_annual_rainfall.merge(coords_df)


# Display the average annual rainfall
print(merged_rainfall_df)


          City  Average_Annual_Rainfall  Latitude  Longitude
0       Bogota              1103.236364    4.7110   -74.0721
1        Cairo                24.172727   30.0444    31.2357
2    Cape Town               529.809091  -33.9221    18.4231
3         Cork               942.654545   51.8985    -8.4756
4    Edinburgh               831.218182   55.9533    -3.1883
5       London               660.145455   51.5072    -0.1276
6   Manchester              1034.272727   53.4808    -2.2426
7       Mumbai              2092.990909   19.0760    72.8777
8        Paris               662.209091   48.8575     2.3514
9         Rome               900.027273   41.8967    12.4822
10     Seattle              1272.590909   47.6061  -122.3328


# ADD SUBTITLE HELP / change legend name without underscores


In [25]:
import pandas as pd
from lets_plot import *

def plot_city_map(city_data, point_colour='blue', 
                  title="London's mean rainfall between 2010 and 2020 is less than the majority of cities", 
                  title_size=14, 
                  subtitle="London ranks 7th in terms of most annual rainfall out of the chosen cities", 
                  subtitle_size=10):
   
    # Plotting the points on a map
    plot = (
        ggplot() +
        geom_livemap() +
        geom_point(aes(x='Longitude', y='Latitude', size='Average_Annual_Rainfall'), 
                   colour=point_colour, 
                   show_legend=True, 
                   data=city_data) +
        ggtitle(title) +
        theme_minimal() + 
        theme(plot_title=element_text(size=title_size, hjust=0.5))
    )
    
    return plot

city_map_plot = plot_city_map(merged_rainfall_df)
city_map_plot

# Call the function to plot the city map with average rainfall



In [38]:
import pandas as pd


# Load your JSON data (assuming you've already created `all_data` from your previous code)

# Step 1: Filter the data for the two time periods
data_1970_1980 = all_data[(all_data['Date'] >= '1970-01-01') & (all_data['Date'] < '1980-01-01')]
data_2010_2020 = all_data[(all_data['Date'] >= '2010-01-01') & (all_data['Date'] < '2020-01-01')]

# Step 2: Calculate the mean annual rainfall for each city for both periods
mean_rainfall_1970_1980 = data_1970_1980.groupby('City')['Precipitation Sum'].mean().reset_index()
mean_rainfall_2010_2020 = data_2010_2020.groupby('City')['Precipitation Sum'].mean().reset_index()

# Step 3: Rename columns for clarity and add a 'Time Period' column
mean_rainfall_1970_1980.rename(columns={'Precipitation Sum': 'Mean Rainfall'}, inplace=True)
mean_rainfall_1970_1980['Time Period'] = '1970-1980'

mean_rainfall_2010_2020.rename(columns={'Precipitation Sum': 'Mean Rainfall'}, inplace=True)
mean_rainfall_2010_2020['Time Period'] = '2010-2020'

# Step 4: Combine the two DataFrames
mean_rainfall_combined = pd.concat([mean_rainfall_1970_1980, mean_rainfall_2010_2020])
print(mean_rainfall_combined)

# Plot directly

# Create the bar chart
bar_chart = (
    ggplot(mean_rainfall_combined, aes(x='City', y='Mean Rainfall', fill='Time Period')) +
    geom_bar(stat='identity', position='dodge') +
    labs(title='Mean Average Rainfall for Two Time Periods',
         x='City',
         y='Mean Average Rainfall (mm)') +
    theme_minimal() +
    scale_fill_manual(values={'1970-1980': '#1f77b4', '2010-2020': '#ff7f0e'})
)

bar_chart




          City  Mean Rainfall Time Period
0       Bogota       7.000986   1970-1980
1        Cairo       0.078094   1970-1980
2    Cape Town       1.216347   1970-1980
3         Cork       2.256216   1970-1980
4    Edinburgh       1.970071   1970-1980
5       London       1.789540   1970-1980
6   Manchester       2.600219   1970-1980
7       Mumbai       4.787486   1970-1980
8        Paris       1.971276   1970-1980
9         Rome       2.961720   1970-1980
10     Seattle       4.078094   1970-1980
0       Bogota       3.105038   2010-2020
1        Cairo       0.053012   2010-2020
2    Cape Town       1.447645   2010-2020
3         Cork       2.509830   2010-2020
4    Edinburgh       2.350137   2010-2020
5       London       1.824233   2010-2020
6   Manchester       2.852711   2010-2020
7       Mumbai       5.544715   2010-2020
8        Paris       1.871550   2010-2020
9         Rome       2.485706   2010-2020
10     Seattle       3.586008   2010-2020


## next one

In [None]:
# Assuming 'Precipitation Hours' is the name of the column you want
# Select the relevant columns including 'Precipitation Hours'
selected_data = all_data[['City', 'Precipitation Hours']]

# Group by 'City' and sum the 'Precipitation Hours'
Total_Precipitation_Hours = selected_data.groupby('City', as_index=False).agg(Total_Precipitation_Hours=('Precipitation Hours', 'sum'))

# Create the bar chart
ggplot(Total_Precipitation_Hours, aes(x='City', y='Total_Precipitation_Hours')) + \
    geom_bar(stat='identity', fill='blue') + \
    labs(title='London does not have a significant amount of hours of precipitation',
         y='Total Precipitation Hours',
         x='City') 


# THOUGHTS ON SIZE OF BLUE VS ORANGE 


In [16]:
london_data.loc[:, 'Cumulative_Rain_Sum'] = london_data['Rain Sum'].cumsum()
london_data.loc[:, 'Cumulative_Precipitation_Sum'] = london_data['Precipitation Sum'].cumsum()


# Initialize Let's Plot
LetsPlot.setup_html()

# Plot the cumulative rain sum
plot = (ggplot(london_data) +
        geom_line(aes(x='Date', y='Cumulative_Rain_Sum'), color='blue', size=2, label='Cumulative Rain Sum') +
        geom_line(aes(x='Date', y='Cumulative_Precipitation_Sum'), color='orange', size=1, label='Cumulative Precipitation Sum') +
        labs(title='Cumulative Sum of Rain and Precipitation in London',
             x='Date',
             y='Cumulative Sum of Rain/Precipitation in London (mm)',
             color='Legend') +
        theme(legend_position='bottom') +
        scale_color_manual(values=['blue', 'orange'], 
                           labels=['Cumulative Rain Sum', 'Cumulative Precipitation Sum']))
plot

NameError: name 'london_data' is not defined

In [17]:
london_data.loc[:, 'Cumulative_Rain_Sum'] = london_data['Rain Sum'].cumsum()
london_data.loc[:, 'Cumulative_Precipitation_Sum'] = london_data['Precipitation Sum'].cumsum()


# Initialize Let's Plot
LetsPlot.setup_html()

# Plot the cumulative rain sum
# Initialize Let's Plot
LetsPlot.setup_html()

# Plot the cumulative rain sum with a legend
plot = (ggplot(london_data) +
        geom_line(aes(x='Date', y='Cumulative_Rain_Sum', color='"Cumulative Rain Sum"'), size=2) +
        geom_line(aes(x='Date', y='Cumulative_Precipitation_Sum', color='"Cumulative Precipitation Sum"'), size=1) +
        labs(title='Cumulative Sum of Rain and Precipitation in London',
             x='Date',
             y='Cumulative Sum of Rain/Precipitation in London (mm)',
             color='Legend') +
        theme(legend_position='bottom')) 

plot


plot

NameError: name 'london_data' is not defined

In [None]:
# Melt the DataFrame to long format for plotting

london_data.loc[:, 'Cumulative_Rain_Sum'] = london_data['Rain Sum'].cumsum()
london_data.loc[:, 'Cumulative_Precipitation_Sum'] = london_data['Precipitation Sum'].cumsum()

# Plot the cumulative rain sum with a legend
plot = (ggplot(melted_data) +
        geom_line(aes(x='Date', y='Cumulative_Sum', color='Line_Type'), size=1) +
        ggsize(500,400) +
        labs(title='The majority of precipitation in the UK is rainfall',
             x='Date',
             y='Cumulative Sum of Rain/Precipitation in London (mm)',
             color='Legend') +
        theme(legend_position='top')) 

plot


In [18]:
import pandas as pd

# Create a DataFrame with sample data for five days
data = {
    'City': ['London'] * 5,
    'Date': pd.date_range(start='2020-01-01', periods=365, freq='D'),  # Only five days
    'Rain Sum': [10, 20, 30, 5, 15],
    'Precipitation Sum': [12, 22, 32, 6, 16]
}
london_data = pd.DataFrame(data)

# Calculate cumulative sums
london_data['Cumulative_Rain_Sum'] = london_data['Rain Sum'].cumsum()
london_data['Cumulative_Precipitation_Sum'] = london_data['Precipitation Sum'].cumsum()


ValueError: All arrays must be of the same length

In [None]:
import pandas as pd
from lets_plot import *

# Calculate cumulative sums
london_data['Cumulative_Rain_Sum'] = london_data['Rain Sum'].cumsum()
london_data['Cumulative_Precipitation_Sum'] = london_data['Precipitation Sum'].cumsum()

# Melt the DataFrame to long format for plotting
melted_data = london_data.melt(id_vars=['Date'], 
                                value_vars=['Cumulative_Rain_Sum', 'Cumulative_Precipitation_Sum'],
                                var_name='Line_Type', 
                                value_name='Cumulative_Sum')

# Initialize Let's Plot
LetsPlot.setup_html()

# Plot the cumulative rain sum and cumulative precipitation sum
plot = (ggplot(melted_data) +
        geom_line(aes(x='Date', y='Cumulative_Sum', color='Line_Type'), size=1) +
        ggsize(500, 700) +
        labs(title='Cumulative Sum of Rain and Precipitation in London',
             x='Date',
             y='Cumulative Sum of Rain/Precipitation in London (mm)',
             color='Legend') +
        theme(legend_position='top') +
        scale_color_manual(values=['#1F627D', '#FFC107'])  # Customize colors if needed
)

# Display the plot
print(plot)


plot
data:         Date                     Line_Type  Cumulative_Sum
0 2020-01-01           Cumulative_Rain_Sum              10
1 2020-01-02           Cumulative_Rain_Sum              30
2 2020-01-03           Cumulative_Rain_Sum              60
3 2020-01-04           Cumulative_Rain_Sum              65
4 2020-01-05           Cumulative_Rain_Sum              80
5 2020-01-01  Cumulative_Precipitation_Sum              12
6 2020-01-02  Cumulative_Precipitation_Sum              34
7 2020-01-03  Cumulative_Precipitation_Sum              66
8 2020-01-04  Cumulative_Precipitation_Sum              72
9 2020-01-05  Cumulative_Precipitation_Sum              88
mapping: {}
data_meta: {'series_annotations': [{'type': 'datetime', 'column': 'Date'}, {'type': 'str', 'column': 'Line_Type'}, {'type': 'int', 'column': 'Cumulative_Sum'}]}
ggsize: {
  "width": 500,
  "height": 700
}
ggtitle: {
  "text": "Cumulative Sum of Rain and Precipitation in London"
}
guides: {'x': {'title': 'Date'}, 'y': {'title':

In [19]:
import pandas as pd

# Ensure that 'Date' is in datetime format
all_data['Date'] = pd.to_datetime(all_data['Date'])

# Filter data for the year 2020
data_2020 = all_data[all_data['Date'].dt.year == 2020].copy()

# Calculate cumulative sums
data_2020['Cumulative_Rain_Sum'] = data_2020['Rain Sum'].cumsum()
data_2020['Cumulative_Precipitation_Sum'] = data_2020['Precipitation Sum'].cumsum()

# Initialize Let's Plot
LetsPlot.setup_html()

# Plot the cumulative sums
plot = (ggplot(data_2020) +
        geom_line(aes(x='Date', y='Cumulative_Rain_Sum'), color='blue', size=2, label='Cumulative Rain Sum') +
        geom_line(aes(x='Date', y='Cumulative_Precipitation_Sum'), color='orange', size=1, label='Cumulative Precipitation Sum') +
        labs(title='Cumulative Sum of Rain and Precipitation in London for 2020',
             x='Date',
             y='Cumulative Sum of Rain/Precipitation in London (mm)',
             color='Legend') +
        theme(legend_position='bottom'))

plot


In [20]:
# Filter for London data
london_data = all_data[all_data['City'] == 'London'].copy()
print (london_data)
# Calculate cumulative sums
london_data['Cumulative_Rain_Sum'] = london_data['Rain_Sum'].cumsum()
london_data['Cumulative_Precipitation_Sum'] = london_data['Precipitation_Sum'].cumsum()

# Initialize Let's Plot
LetsPlot.setup_html()

# Melt the DataFrame to long format for plotting
melted_data = london_data.melt(id_vars=['Date'], 
                                value_vars=['Cumulative_Rain_Sum', 'Cumulative_Precipitation_Sum'],
                                var_name='Line_Type', 
                                value_name='Cumulative_Sum')

# Plot the cumulative sums with a legend
plot = (ggplot(melted_data) +
        geom_line(aes(x='Date', y='Cumulative_Sum', color='Line_Type'), size=1) +
        ggsize(500, 400) +
        labs(title='Cumulative Sum of Rain and Precipitation in London',
             x='Date',
             y='Cumulative Sum of Rain/Precipitation in London (mm)',
             color='Legend') +
        theme(legend_position='top')) 

plot


         City       Date  Precipitation Sum  Rain Sum  Precipitation Hours  \
0      London 1970-01-01                0.0       0.0                  0.0   
1      London 1970-01-02                0.3       0.3                  3.0   
2      London 1970-01-03                0.3       0.3                  3.0   
3      London 1970-01-04                0.0       0.0                  0.0   
4      London 1970-01-05                0.0       0.0                  0.0   
...       ...        ...                ...       ...                  ...   
48211  London 2020-12-27               13.3      13.3                 13.0   
48212  London 2020-12-28                0.3       0.3                  1.0   
48213  London 2020-12-29                0.3       0.0                  2.0   
48214  London 2020-12-30                0.0       0.0                  0.0   
48215  London 2020-12-31                0.0       0.0                  0.0   

       Year  
0      1970  
1      1970  
2      1970  
3      

KeyError: 'Rain_Sum'