# COMP90024 - Cluster and Cloud Computing Assignment 2 - Big Data Analytics on the Cloud
* **Team**
    * Team 53
*  **Team Members**
    * Niket Singla (1288512)
    * Jason Phan (1180106)
    * Patipan Rochanapon (1117537)
    * Liam Brennan (1269948)
    * Parsa Babadi Noroozi (1271605)

In [None]:
# Install the necessary libraries

!pip install elasticsearch
!pip install ipywidgets
!pip install matplotlib
!pip install folium
!pip install geopandas
!pip install shapely
!pip install vincent
!pip install seaborn
!pip install plotly

In [None]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and library warnings
# raised by later cells are hidden as well.
warnings.filterwarnings("ignore")

## Initialise the necessary global variables below

In [None]:
# Base URL that `api_request` prepends (followed by "/") to every endpoint path.
API_BASE_URL = "http://router.fission.svc.cluster.local"

In [None]:
import pandas as pd
import numpy as np
import folium
import geopandas as gpd
from shapely.geometry import shape
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:


# Create a map centred on (-37.8136, 144.9631) and attach the Plotly bundle from its CDN
# so that JavaScript embedded in the rendered page can call `Plotly.newPlot`.
# NOTE(review): every plotting helper below builds its own map; this module-level `m`
# does not appear to be used elsewhere in the visible notebook.
m = folium.Map(location=[-37.8136, 144.9631], zoom_start=7.5)
m.get_root().html.add_child(folium.JavascriptLink('https://cdn.plot.ly/plotly-2.32.0.min.js'))

In [None]:
# JavaScript injected into the folium page by `plot_colored_points_with_vega`.
# It defines the two click handlers used by the marker popups:
#   * mfData(lt, lng)         - fetches the mortality/fertility record for a coordinate
#                               and renders it as a Plotly bar chart inside #myDiv.
#   * exploreData(dict, yval) - renders the per-site daily series as a Plotly scatter
#                               plot inside #myDiv. `dict` holds the dates under the
#                               key 'x' and the values under any other key.
# NOTE(review): the API host is hard-coded to http://localhost:9090 here, whereas the
# Python side uses API_BASE_URL - confirm this matches the deployment.
js = """
        function mfData(lt, lng){

            // Request path
            // {baseurl} + /mortalityfertilitydata/lat/{lat:[0-9.-]*}/long/{long:[0-9.-]*}
            const apiUrl = 'http://localhost:9090/mortalityfertilitydata/lat/' + lt + '/long/' + lng;

            // Raw attribute code -> label shown on the x axis
            const attributeMapping = {
                'M0_tfr_184': 'Total Births',
                'dth_bst_00': 'Breast Cancer',
                'dth_cop_05': 'Chronic pulmonory disease',
                'dth_dia_10': 'Diabetes',
                'dths_can15': 'Cancer',
                'dths_cer20': 'Cerebrovascular disease',
                'dths_cir25': 'Circulatory disease',
                'dths_col30': 'Colorectal cancer',
                'dths_ext35': 'External causes',
                'dths_f_040': 'Female deaths',
                'dths_isc45': 'Ischaemic heart disease',
                'dths_lun50': 'Lung Cancer',
                'dths_m_055': 'Male deaths',
                'dths_res60': 'Respiratory disease',
                'dths_rti65': 'Road Traffic Injuries',
                'dths_sui70': 'Suicide',
                'dths_tot75': 'Total death'
            };

            // Fields that describe the area itself and are therefore not plotted
            const skippedKeys = ['area_code', 'area_name', 'geometry'];

            // Make a GET request
            fetch(apiUrl)
              .then(response => {
                if (!response.ok) {
                  throw new Error('Network response was not ok');
                }
                return response.json();
              })
              .then(hits => {
                // Fail with a readable message instead of a TypeError on an empty result
                if (!Array.isArray(hits) || hits.length === 0 || !hits[0]._source) {
                  throw new Error('No mortality/fertility record returned for ' + lt + ', ' + lng);
                }
                const source = hits[0]._source;
                const attributes = [];
                const values = [];

                Object.keys(source).forEach(key => {
                    if (!skippedKeys.includes(key)) {
                        attributes.push(attributeMapping[key] || key);
                        values.push(source[key]);
                    }
                });
                const plotData = [{
                    x: attributes,
                    y: values,
                    type: 'bar',
                }];
                const layout = {
                    xaxis: {title: 'Attributes', tickangle: -45},
                    yaxis: {title: 'Value'},
                    height: 600,
                    width: 600
                };
                Plotly.newPlot('myDiv', plotData, layout);
              })
              .catch(error => {
                console.error('Error:', error);
              });
        }
        function exploreData(dict, yval) {
            var values = [];
            var attributes = [];

            var container = document.getElementById('myDiv');
            var availableWidth = container.clientWidth;
            var availableHeight = container.clientHeight;

            // The 'x' entry carries the dates; every other entry carries the measurements
            Object.keys(dict).forEach(key => {
                if (key === 'x') {
                    Object.keys(dict[key]).forEach(k => {
                        attributes.push(dict[key][k]);
                    });
                }
                else {
                    Object.keys(dict[key]).forEach(k => {
                        values.push(dict[key][k]);
                    });
                }
            });
            var plotData = [{
                x: attributes,
                y: values,
                mode: 'markers',
                type: 'scatter',
            }];
            var layout = {
                xaxis: {
                    title: 'Dates',
                    automargin: true,
                    range: [0, Math.max(...attributes)+5]
                    },
                yaxis: {
                    title: yval.val,
                    range: [0, Math.max(...values)+5]
                    },
                height: availableHeight,
                width: availableWidth,
                margin: {t: 50},
                autosize: true
            };
            Plotly.newPlot('myDiv', plotData, layout);
        }
        """

In [None]:
def plot_geojson(gdf: gpd.GeoDataFrame, join_col: str, value_col: str, site_name: str):
    """Draw every feature of ``gdf`` as its own GeoJson layer, shaded by ``value_col``.

    Args:
        gdf: Features to draw; must contain ``join_col``, ``value_col``, ``site_name``
            and a geometry column.
        join_col: Column whose value is used to build each layer's name.
        value_col: Column whose value is shown in the tooltip and drives the fill colour.
        site_name: Column whose value is shown in the tooltip as the feature's label.

    Returns:
        The folium map holding one GeoJson layer per row plus a LayerControl.
    """
    # Local import: branca ships with folium and provides the YlOrRd palette.
    from branca.colormap import linear

    # Create a Folium map centered at a specific location
    m = folium.Map(location=[-37.8136, 144.9631], zoom_start=9.5)

    # 'YlOrRd' is a palette name, not a colour the browser understands, so build a real
    # colour scale from the data. Non-numeric entries become NaN and use the fallback.
    values = pd.to_numeric(gdf[value_col], errors='coerce')
    lowest, highest = values.min(), values.max()
    colormap = None
    if pd.notna(lowest) and pd.notna(highest) and lowest < highest:
        colormap = linear.YlOrRd_09.scale(lowest, highest)
        colormap.caption = value_col
        colormap.add_to(m)
    fallback_fill = '#fd8d3c'  # mid-range YlOrRd tone used when no scale can be built

    # Iterate over each feature in the GeoDataFrame and add it to the map as a GeoJson layer
    for (_, row), value in zip(gdf.iterrows(), values):
        # Extract the properties for the tooltip
        tooltip_text = f"{row[site_name]}: {row[value_col]}"

        fill = colormap(value) if colormap is not None and pd.notna(value) else fallback_fill
        # Leaflet's stroke option is 'opacity'; 'lineOpacity' is not a path option.
        style = {'fillColor': fill, 'fillOpacity': 0.7, 'opacity': 0.2}

        # Create a GeoJson object for the feature, with custom tooltip.
        # The style is bound as a default argument so each layer keeps its own colour.
        folium.GeoJson(
            row.geometry,
            tooltip=tooltip_text,
            style_function=lambda _feature, _style=style: _style,
            name=f'GeoJson_{row[join_col]}'  # Use the join column as the layer name
        ).add_to(m)

    # Add a LayerControl to the map to toggle the layer visibility
    folium.LayerControl().add_to(m)

    # Return the map
    return m


In [None]:
# Plot Choropleth

def plot_choropleth(gdf: gpd.GeoDataFrame, join_col: str, value_col: str):
    """Build a folium map with a single choropleth layer coloured by ``value_col``.

    ``gdf`` serves as both the geometry source and the data table; features are
    matched to rows through ``join_col``. Returns the folium map.
    """
    choropleth_map = folium.Map(location=[-37.8136, 144.9631], zoom_start=6.5)

    # All choropleth settings gathered in one place before the layer is created.
    layer_settings = dict(
        geo_data=gdf,
        name='choropleth',
        data=gdf,
        columns=[join_col, value_col],
        # GeoJSON property that is joined against the first entry of `columns`
        key_on=f'feature.properties.{join_col}',
        fill_color='YlOrRd',
        fill_opacity=0.7,
        line_opacity=0.2,
        legend_name=f'Legend value: {value_col}',
    )
    folium.Choropleth(**layer_settings).add_to(choropleth_map)

    # The layer control lets the viewer switch the choropleth layer on and off.
    folium.LayerControl().add_to(choropleth_map)

    return choropleth_map

In [None]:
# Plot colored points

def plot_colored_points(gdf: gpd.GeoDataFrame, site_name: str, value_col: str, 
                        med_thresh: int, high_thresh: int):
    """Plot each point of ``gdf`` as a circle marker coloured by its ``value_col``.

    Values above ``high_thresh`` are red, values above ``med_thresh`` are orange and
    everything else is green. Each marker gets an HTML tooltip and a text popup
    showing the ``site_name`` and ``value_col`` entries. Returns the folium map.
    """
    def pick_colour(value):
        # Thresholds are checked from the highest band downwards.
        if value > high_thresh:
            return 'red'
        if value > med_thresh:
            return 'orange'
        return 'green'

    points_map = folium.Map(location=[-37.8136, 144.9631], zoom_start=6.5)

    for _, record in gdf.iterrows():
        marker_colour = pick_colour(record[value_col])

        # Styled tooltip card shown on hover
        tooltip_html = f"""
            <div style="background-color: white; padding: 10px; 
            border-radius: 5px; box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.3);">
                <div style="font-weight: bold;">{site_name}: {record[site_name]}</div>
                <div>{value_col}: {record[value_col]}</div>
            </div>
        """

        # Plain-text popup shown on click
        popup_text = f"{site_name}: {record[site_name]} {value_col}: {record[value_col]}"
        point = record.geometry

        folium.CircleMarker(
            location=[point.y, point.x],  # folium expects (lat, lon)
            radius=5,
            color=marker_colour,
            fill=True,
            fill_color=marker_colour,
            fill_opacity=0.7,
            tooltip=tooltip_html,
            popup=folium.Popup(popup_text, parse_html=True)
        ).add_to(points_map)

    return points_map


In [None]:
# Plot colored points with vega

import vincent
import json
import numpy as np
from branca.element import Element


def plot_colored_points_with_vega(gdf: gpd.GeoDataFrame, site_name: str, value_col: str,
                                  data, y_val: str, med_thresh: int, high_thresh: int):
    """Plot coloured site markers whose popups can draw Plotly charts on demand.

    Despite the name, the charts are rendered by the Plotly handlers defined in the
    module-level ``js`` string (``exploreData`` and ``mfData``), not by Vega.

    Args:
        gdf: One row per site with a point geometry, ``site_name`` and ``value_col``.
        site_name: Column of ``gdf`` holding the site label.
        value_col: Column holding the value used for colouring; the same column name
            is read from ``data`` to obtain the per-site series.
        data: Frame with ``'site_name'``, ``'dates'`` and ``value_col`` columns, where
            the first matching row's ``dates``/``value_col`` entries are passed to the
            chart (presumably lists of equal length - confirm against the caller).
        y_val: Y-axis title handed to ``exploreData``.
        med_thresh: Values above this are drawn orange.
        high_thresh: Values above this are drawn red.

    Returns:
        The folium map with one CircleMarker (and popup) per row of ``gdf``.
    """
    import json
    from html import escape as html_escape

    def as_js_argument(payload):
        # JSON is valid JavaScript, and escaping makes it safe inside the double-quoted
        # onclick attribute even when a site name contains quotes. `default=str` keeps
        # unexpected non-JSON scalars from aborting the whole map.
        return html_escape(json.dumps(payload, default=str), quote=True)

    m = folium.Map(location=[-37.8136, 144.9631], zoom_start=7.5)
    m.get_root().html.add_child(folium.JavascriptLink('https://cdn.plot.ly/plotly-2.32.0.min.js'))
    m.get_root().script.add_child(Element(js))

    # The y-axis title is the same for every marker.
    y_axis_arg = as_js_argument({"val": y_val})

    # Iterate over the GeoDataFrame rows to add markers with different colors based on the values
    for _, row in gdf.iterrows():
        # Determine marker color based on the value column
        if row[value_col] > high_thresh:
            color = 'red'
        elif row[value_col] > med_thresh:
            color = 'orange'
        else:
            color = 'green'

        # Look the site's series up once instead of filtering the frame twice.
        site_rows = data[data['site_name'] == row[site_name]]
        x = site_rows['dates'].tolist()
        y = site_rows[value_col].tolist()
        series_arg = as_js_argument({
            row[site_name]: y[0] if len(y) > 0 else [0.0],
            "x": x[0] if len(x) > 0 else ["0"],
        })

        # Create a custom tooltip using HTML and CSS
        tooltip_html = f"""
            <div style="background-color: white; padding: 10px; 
            border-radius: 5px; box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.3);">
                <div style="font-weight: bold;">{site_name}: {row[site_name]}</div>
                <div style="font-weight: bold;">{value_col}: {row[value_col]}</div>
            </div>
        """

        lng = row.geometry.x
        lat = row.geometry.y

        # Create a popup for the marker: two buttons plus the div the charts render into
        popup_html = f"""
        <div id style="width: 464px;">
            <button id="dailydata" style="background-color: #4CAF50; /* Green */
            border: none;
            color: white;
            padding: 15px 32px;
            text-align: center;
            text-decoration: none;
            display: inline-block;
            font-size: 16px;
            margin: 4px 2px;
            cursor: pointer;
            border-radius: 12px;
            box-shadow: 0 8px #4CAF50;"
            onclick="exploreData({series_arg}, {y_axis_arg})">Explore this data</button>
            <button id="mfdata" style="background-color: #008CBA; /* Blue */
            border: none;
            color: white;
            padding: 15px 32px;
            text-align: center;
            text-decoration: none;
            display: inline-block;
            font-size: 16px;
            margin: 4px 2px;
            cursor: pointer;
            border-radius: 12px;
            box-shadow: 0 8px #008CBA;"
            onclick="mfData({lat}, {lng})">Explore fertility and mortality</button>
        </div>
        <div id = "myDiv">
        </div>
        """

        # Create a marker with the determined color and add it to the map
        marker = folium.CircleMarker(
            location=[lat, lng],  # Lat, Lon order
            radius=5,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.7,
            tooltip=tooltip_html
        ).add_to(m)

        popup = folium.Popup(popup_html,
                             height=700,
                             width=700)

        marker.add_child(popup)
    # Display the map
    return m


## Base API function for calling an endpoint, so the request code is not repeated in multiple places

In [None]:
# Make the api request

import requests
def api_request(endpoint, hdr = None, timeout = 30):
    """Send a GET request to ``API_BASE_URL/<endpoint>`` and decode the JSON reply.

    Args:
        endpoint: Path below ``API_BASE_URL``; a leading slash is tolerated.
        hdr: Optional dict of HTTP headers.
        timeout: Seconds to wait for the server before ``requests`` raises a timeout
            error, so an unreachable API cannot hang the kernel forever.

    Returns:
        The decoded JSON body when the status is 200, otherwise the integer HTTP
        status code. Callers must check for an ``int`` before indexing the result.
    """
    # Join with exactly one slash regardless of how the two parts were written.
    url = API_BASE_URL.rstrip("/") + "/" + endpoint.lstrip("/")
    # headers=None is what requests uses when no headers are given.
    response = requests.get(url, headers=hdr, timeout=timeout)
    if response.status_code == 200:
        return response.json()
    return response.status_code

### Map data on the Australia map

In [None]:
# Retrieve the api data

def retrieve_data(api: str):
    """Fetch per-site daily aggregates from ``api`` and shape them into two frames.

    Args:
        api: Endpoint name. ``'getepadata'`` and ``'getweatherdata'`` have a daily
            series extracted; for any other endpoint the series stay empty.

    Returns:
        ``(df_geo, df_nogeo)`` where
        * ``df_geo`` is a GeoDataFrame (EPSG:4326) with one row per site that has a
          ``geo`` value; its ``daily_avg`` is the mean of the daily values rounded
          to 2 decimals (``None`` when the site has no daily buckets);
        * ``df_nogeo`` is a DataFrame with one row per site; its ``daily_avg`` is
          the *list* of daily values and ``dates`` the matching days of the month.
        Both keep the raw bucket fields as extra columns.

    Raises:
        RuntimeError: if ``api_request`` did not return a JSON object (it returns
            the HTTP status code on failure).
    """
    from shapely.geometry import Point
    import pandas as pd
    from datetime import datetime

    # endpoint -> (name of the daily aggregation, name of the metric inside a bucket)
    daily_series = {
        'getepadata': ('daily_particle_val', 'avg_value'),
        'getweatherdata': ('daily_avg_temp', 'avg_temp'),
    }

    res = api_request(api)
    if not isinstance(res, dict):
        raise RuntimeError(f"Request to '{api}' failed: api_request returned {res!r}")

    list_geo = []
    list_nogeo = []

    for doc in res['aggregations']['sites']['buckets']:
        daily_val = []
        dates = []
        if api in daily_series:
            aggregation, metric = daily_series[api]
            buckets = doc[aggregation]['buckets']
            # Missing daily averages are counted as 0, as before.
            daily_val = [0 if bucket[metric]['value'] is None else bucket[metric]['value']
                         for bucket in buckets]
            dates = [datetime.strptime(bucket['key_as_string'], "%Y-%m-%dT%H:%M:%S.%fZ").day
                     for bucket in buckets
                     if bucket['key_as_string'] is not None]
        # Every bucket counts towards the average, including the ones mapped to 0.
        count_val = len(daily_val)

        # A site without any hit has no name and no location instead of raising IndexError.
        hits = doc['site_info']['hits']['hits']
        source = hits[0]['_source'] if hits else {}

        site = source.get('sitename')
        site_name = site if site is not None and len(daily_val) > 0 else None

        avg_val = round(sum(daily_val) / count_val, 2) if count_val > 0 else None

        geo_value = source.get('geo')
        if geo_value is not None:
            # "lat,lon" string. A generator is used instead of the builtin `map`
            # because a later cell rebinds the name `map` to a folium map.
            latitude, longitude = (float(part) for part in geo_value.split(','))
            list_geo.append({
                **doc,
                'site_name': site_name,
                'daily_avg': avg_val,
                'geometry': Point(longitude, latitude)
            })
        list_nogeo.append({
            **doc,
            'dates': dates,
            'site_name': site_name,
            'daily_avg': daily_val
        })

    df_nogeo = pd.DataFrame(list_nogeo)
    df_geo = gpd.GeoDataFrame(list_geo, crs='EPSG:4326', geometry='geometry')
    return df_geo, df_nogeo

In [None]:
# Retrieve Sudo data

import json
def retrieve_sudo_data(coordinates=None):
    """Fetch SUDO mortality/fertility data.

    Args:
        coordinates: When not ``None`` the full dataset is requested.
            NOTE(review): the value itself is never sent to the API; it only selects
            the branch. Confirm whether it was meant to drive the lat/long lookup.

    Returns:
        * ``coordinates`` given: a GeoDataFrame (EPSG:4326) with one row per
          document, the ``_source`` fields flattened into columns.
        * ``coordinates`` is ``None``: a dict summarising the record at the fixed
          sample location (-37.846, 144.945), or ``None`` when the API returns no
          record. Previously this summary was built but never returned.

    Raises:
        RuntimeError: if ``api_request`` returned an HTTP status code instead of data.
    """
    def fetch(endpoint):
        # api_request hands back the integer status code when the call fails.
        result = api_request(endpoint)
        if isinstance(result, int):
            raise RuntimeError(f"Request to '{endpoint}' failed with HTTP status {result}")
        return result

    if coordinates is not None:
        ## Request path
        ## {baseurl} + /mortalityfertilitydata

        res = fetch("mortalityfertilitydata")
        lst_res = [{**doc, **doc['_source'], 'geometry': shape(doc['_source']['geometry'])} for doc in res]
        return gpd.GeoDataFrame(lst_res, crs='EPSG:4326', geometry='geometry')

    ## Request path
    ## {baseurl} + /mortalityfertilitydata/lat/{lat:[0-9.-]*}/long/{long:[0-9.-]*}

    res = fetch("mortalityfertilitydata/lat/-37.846/long/144.945")
    if not res:
        return None
    source = res[0]['_source']

    # readable key -> raw attribute code in the document
    summary_fields = {
        'area_name': 'area_name',
        'total_breast_cancer': 'dth_bst_00',
        'total_chronic_pulmonory': 'dth_cop_05',
        'total_diabetes': 'dth_dia_10',
        'total_cancer': 'dths_can15',
        'total_cerebro_vascular': 'dths_cer20',
        'total_circulatory': 'dths_cir25',
        'total_colorectal': 'dths_col30',
        'total_external': 'dths_ext35',
        'total_female': 'dths_f_040',
        'total_ischaemic': 'dths_isc45',
        'total_lung': 'dths_lun50',
        'total_male': 'dths_m_055',
        'total_respiratory': 'dths_res60',
        'total_roadtraffic': 'dths_rti65',
        'total_suicide': 'dths_sui70',
        'total_death': 'dths_tot75',
        'total_fertility': 'M0_tfr_184',
    }
    json_data = {label: source[code] for label, code in summary_fields.items()}
    return json_data

In [None]:
# Retrieve Sudo data

import json
import asyncio

def retrieve_sorted_sudo_data(attribute):
    """Fetch SUDO mortality/fertility documents sorted by ``attribute``.

    Args:
        attribute: Raw attribute code (e.g. ``'dths_tot75'``) appended to the
            ``sortmortalityfertilitydata/attribute/`` endpoint, or ``None``.

    Returns:
        * ``attribute`` given: a GeoDataFrame (EPSG:4326) with one row per returned
          document, the ``_source`` fields flattened into columns.
        * ``attribute`` is ``None``: a dict summarising the record at the fixed
          sample location (-37.846, 144.945), or ``None`` when the API returns no
          record. Previously this summary was built but never returned.

    Raises:
        RuntimeError: if ``api_request`` returned an HTTP status code instead of data.
    """
    def fetch(endpoint):
        # api_request hands back the integer status code when the call fails.
        result = api_request(endpoint)
        if isinstance(result, int):
            raise RuntimeError(f"Request to '{endpoint}' failed with HTTP status {result}")
        return result

    if attribute is not None:
        ## Request path
        ## {baseurl} + /sortmortalityfertilitydata/attribute/{attribute}
        res = fetch(f"sortmortalityfertilitydata/attribute/{attribute}")
        lst_res = [{**doc, **doc['_source'], 'geometry': shape(doc['_source']['geometry'])} for doc in res]
        return gpd.GeoDataFrame(lst_res, crs='EPSG:4326', geometry='geometry')

    ## Request path
    ## {baseurl} + /mortalityfertilitydata/lat/{lat:[0-9.-]*}/long/{long:[0-9.-]*}

    res = fetch("mortalityfertilitydata/lat/-37.846/long/144.945")
    if not res:
        return None
    source = res[0]['_source']

    # readable key -> raw attribute code in the document
    summary_fields = {
        'area_name': 'area_name',
        'total_breast_cancer': 'dth_bst_00',
        'total_chronic_pulmonory': 'dth_cop_05',
        'total_diabetes': 'dth_dia_10',
        'total_cancer': 'dths_can15',
        'total_cerebro_vascular': 'dths_cer20',
        'total_circulatory': 'dths_cir25',
        'total_colorectal': 'dths_col30',
        'total_external': 'dths_ext35',
        'total_female': 'dths_f_040',
        'total_ischaemic': 'dths_isc45',
        'total_lung': 'dths_lun50',
        'total_male': 'dths_m_055',
        'total_respiratory': 'dths_res60',
        'total_roadtraffic': 'dths_rti65',
        'total_suicide': 'dths_sui70',
        'total_death': 'dths_tot75',
        'total_fertility': 'M0_tfr_184',
    }
    json_data = {label: source[code] for label, code in summary_fields.items()}
    return json_data

In [None]:
# Pre-retrieve the api data
# Each call returns (GeoDataFrame of located sites with their rounded mean,
# DataFrame of all sites with their daily value lists); the widget callbacks
# further down read these four module-level frames.

weather_geo, weather_nogeo  = retrieve_data("getweatherdata")
epa_geo, epa_nogeo  = retrieve_data("getepadata")

In [None]:
from ipywidgets import interact

def display_mf_dropdown():
    """Show a fresh mortality/fertility category dropdown wired to ``update_mf_dd``.

    The dropdown starts with no selection; every change of its value is forwarded
    to ``update_mf_dd``.
    """
    categories = [
        'Breast Cancer',
        'Chronic Pulmonory Disease',
        'Diabetes',
        'Total Cancer',
        'Cerebro Vascular Disease',
        'Circulatory obstruction',
        'Colorectal Cancer',
        'Total External Causes',
        'Total Female',
        'Ischaemic Failure',
        'Lung Cancer',
        'Total Male',
        'Respiratory Disease',
        'Road Traffic incidents',
        'Suicide',
        'Total Death',
        'Total Birth',
    ]
    dropdown = widgets.Dropdown(
        options=categories,
        value=None,
        description='Category:',
        disabled=False,
    )
    # Only react to changes of the selected value, not of other traits.
    dropdown.observe(update_mf_dd, names='value')
    display(dropdown)

def on_button_clicked(b):
    """Render the view that belongs to the clicked analysis button.

    The button's ``description`` selects the view:
    * ``'Weather'`` - site map built from the weather frames (thresholds 10 / 13);
    * ``'Air Quality'`` - site map built from the EPA frames (thresholds 5 / 10);
    * ``'Top 7 sites for mortality and fertility'`` - the category dropdown.

    Everything is drawn into ``output_analysis``, which is cleared first.
    """
    with output_analysis:
        clear_output()
        plot = None
        val = b.description
        # The two datasets used to be swapped: "Weather" showed the EPA particle data
        # and "Air Quality" the temperatures. Each label now gets its own dataset,
        # matching `update_site_options` and `update_plot`.
        if val == 'Weather':
            plot = plot_colored_points_with_vega(weather_geo, 'site_name', 'daily_avg', weather_nogeo, "Average daily temperature", 10, 13)
        elif val == "Air Quality":
            plot = plot_colored_points_with_vega(epa_geo, 'site_name', 'daily_avg', epa_nogeo, "Average daily particle value", 5, 10)
        elif val == "Top 7 sites for mortality and fertility":
            display_mf_dropdown()
        if plot is not None:
            display(plot)



# Dropdown label (as listed in `display_mf_dropdown`) -> raw SUDO attribute code.
# 'Total External Causes' was missing from the old if/elif chain, so selecting it
# requested an empty attribute.
MF_ATTRIBUTE_BY_LABEL = {
    'Breast Cancer': 'dth_bst_00',
    'Chronic Pulmonory Disease': 'dth_cop_05',
    'Diabetes': 'dth_dia_10',
    'Total Cancer': 'dths_can15',
    'Cerebro Vascular Disease': 'dths_cer20',
    'Circulatory obstruction': 'dths_cir25',
    'Colorectal Cancer': 'dths_col30',
    'Total External Causes': 'dths_ext35',
    'Total Female': 'dths_f_040',
    'Ischaemic Failure': 'dths_isc45',
    'Lung Cancer': 'dths_lun50',
    'Total Male': 'dths_m_055',
    'Respiratory Disease': 'dths_res60',
    'Road Traffic incidents': 'dths_rti65',
    'Suicide': 'dths_sui70',
    'Total Death': 'dths_tot75',
    'Total Birth': 'M0_tfr_184',
}


# Define a function to execute the sudo call
def update_mf_dd(change):
    """Dropdown observer: map the selected category and redraw the SUDO map.

    Args:
        change: Change dict delivered by the widget; only ``change['new']`` (the
            selected label) is read.

    A selection without a known attribute code (including ``None``) is ignored.
    Otherwise the sorted SUDO data is fetched and ``output_analysis`` is replaced
    with the selected label, a new dropdown and the GeoJson map.
    """
    value = change['new']
    attribute = MF_ATTRIBUTE_BY_LABEL.get(value)
    if attribute is None:
        # Nothing sensible to query; leave the current output untouched.
        return
    with output_analysis:
        geo_df = retrieve_sorted_sudo_data(attribute)
        # wait=True keeps the old content until the new content is ready to show.
        clear_output(wait=True)
        print(value)
        plot = plot_geojson(geo_df, '_id', attribute, 'area_name')
        display_mf_dropdown()
        display(plot)

# Define a function to update site_name dropdown options based on the selected category
def update_site_options(value):
    """Fill the ``site_name`` dropdown with the sites of the chosen category.

    ``'Weather'`` lists the sorted site names of ``weather_geo``, ``'Air Quality'``
    those of ``epa_geo``; any other value empties the dropdown.
    """
    if value not in ('Weather', 'Air Quality'):
        site_name.options = []
        return
    source_frame = weather_geo if value == 'Weather' else epa_geo
    site_name.options = source_frame['site_name'].sort_values()

# Function to update the plot based on dropdown selection
def update_plot(value, value2):
    """Draw the summary figure for the selected category and, optionally, one site.

    Args:
        value: ``'Weather'``, ``'Air Quality'`` or ``None`` (nothing is drawn).
        value2: Selected site name or ``None``.

    The figure has a box plot of the daily values across the top, a bar chart of the
    7 sites with the highest ``daily_avg`` bottom-left and, when a site is selected,
    a box plot restricted to that site bottom-right.
    """
    if value == 'Weather':
        series_frame, site_frame = weather_nogeo, weather_geo
        yval = 'Daily avg temperature'
    elif value == 'Air Quality':
        series_frame, site_frame = epa_nogeo, epa_geo
        yval = 'Avg daily particle value'

    if value is None:
        return

    # Keep only the two columns the figure needs from each source frame.
    df = pd.DataFrame({'site_name': series_frame['site_name'],
                       'daily_avg': series_frame['daily_avg']})
    df_geo = pd.DataFrame({'site_name': site_frame['site_name'],
                           'daily_avg': site_frame['daily_avg']})

    plt.figure(figsize=(20, 12))

    # Top row: distribution of the daily values
    overall_ax = plt.subplot2grid(shape=(2, 2), loc=(0, 0), colspan=2)
    overall_ax.boxplot(df['daily_avg'])
    overall_ax.set(title='Box plot', xlabel=value2, ylabel=yval)

    # Bottom-left: the 7 sites with the highest average
    top_sites = df_geo.sort_values(by='daily_avg', ascending=False).head(7)
    ranking_ax = plt.subplot2grid(shape=(2, 2), loc=(1, 0), colspan=1)
    ranking_ax.bar(top_sites['site_name'], top_sites['daily_avg'], label=top_sites['site_name'])
    ranking_ax.set(title=f'Top 7 sites with highest {yval}', xlabel=value2, ylabel=yval)

    # Bottom-right: only when a site has been picked
    if value2 is not None:
        site_ax = plt.subplot2grid(shape=(2, 2), loc=(1, 1), colspan=1)
        site_ax.boxplot(df['daily_avg'][df['site_name'] == value2])
        site_ax.set(title='Box plot', xlabel=value2, ylabel=yval)

    plt.tight_layout()
    plt.show()


In [None]:
import ipywidgets as widgets
import plotly.graph_objs as go
import chart_studio.plotly as py
import plotly.offline as po
import plotly.express as px
from ipywidgets import Layout, Button, Box, interactive, VBox, HBox
from IPython.display import display, clear_output
import matplotlib.pyplot as plt
import seaborn as sns

# Add header widgets with logo and title
# `svg_html` is the logo tile (an SVG wrapping a remotely hosted image) and
# `header_html` the title banner; both are plain HTML strings rendered by
# `widgets.HTML` and placed side by side in `header_box`.
svg_html = """
            <div style="height: 100px; background-color: #000f46; margin: 25; padding: 5;">
                <svg width="100" height="100" viewBox="0 0 100 100" xmlns="http://www.w3.org/2000/svg">
                    <image xlink:href="https://d2h9b02ioca40d.cloudfront.net/v14.15.1/assets/logo-5abab1a775357bcb43a8.svg" width="100" height="100"/>
                </svg>
            </div>
           """
header_html = """
                <div style="height: 100px; width: 1224px; background-color: #000f46; margin: 25; padding: 25; display: flex; align-items: center; justify-content: center;">
                    <h1 id="title" style="margin: 10; color: white;" >Data analysis on weather, air quality and its effect on mortality and fertility</h1>
                </div>
            """
header_widget = widgets.HTML(value=header_html)
svg_widget = widgets.HTML(value=svg_html)
# Logo first, then the title banner.
header_box = HBox([svg_widget, header_widget])

In [None]:
# Row 1
# Two dropdowns: the data category and the site within that category.

category = widgets.Dropdown(
    options= ['Weather', 'Air Quality'],
    value=None,
    description='Category:',
    disabled=False,)

# Starts empty; `update_site_options` fills it once a category is chosen.
site_name = widgets.Dropdown(
    options= [],
    value=None,
    description='Sites:',
    disabled=False,)

# `w` links the category dropdown to `update_site_options`. It is never displayed
# itself; only `wid` (category + site -> `update_plot`) is placed in the layout.
w = interactive(update_site_options, value=category)
wid = interactive(update_plot, value=category, value2=site_name)
row1_box = HBox([wid])

In [None]:
# Row 2
# Static instruction line shown above the analysis buttons.
description_html = '<b style="font-size:large; text-align:center;">Click on one of the button to explore data</b>'
description_widget = widgets.HTML(value=description_html)

In [None]:
# Row 3
import seaborn as sns
# Pair every EPA site with its nearest weather site. `max_distance=1.0` is expressed
# in the units of the frames' CRS (EPSG:4326, i.e. degrees) and the separation is
# stored in the 'distance' column; how='inner' drops EPA sites without a match.
merged_gdf = gpd.sjoin_nearest(epa_geo, weather_geo,
                               how='inner', max_distance=1.0,
                               distance_col='distance')
# Both inputs have a 'daily_avg' column, so the join suffixes them with
# _left (EPA) and _right (weather); give them readable names for the heatmap.
merged_gdf['Daily Avg Particle Value'] = merged_gdf['daily_avg_left']
merged_gdf['Daily Avg Temp'] = merged_gdf['daily_avg_right']


def calculate_correlation(b):
    """Button callback: show the particle-value / temperature correlation heatmap.

    ``b`` is the clicked button and is not used. The output area is cleared and the
    2x2 correlation matrix of the paired site averages in ``merged_gdf`` is drawn
    as an annotated heatmap.
    """
    with output_analysis:
        clear_output()
        compared_columns = ['Daily Avg Particle Value', 'Daily Avg Temp']
        correlation_matrix = merged_gdf[compared_columns].corr()

        # Explicit figure/axes instead of the implicit pyplot state
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".3f", ax=ax)
        ax.set_title('Correlation Matrix')
        ax.set_xlabel('Variables')
        ax.set_ylabel('Variables')
        plt.show()


In [None]:
#Row 4
# Add button widget
box_layout = Layout(display='flex',
                    flex_flow='row',
                    align_items='stretch',
                    width='70%')


def make_analysis_button(description, tooltip):
    """Create one of the uniformly styled analysis buttons.

    Args:
        description: Button caption; `on_button_clicked` dispatches on this text.
        tooltip: Hover text. Button exposes a single `tooltip` string; the former
            `tooltips=[...]` and `value=None` arguments are not Button traits, so
            the hover text was never shown.
    """
    return widgets.Button(
        description=description,
        disabled=False,
        button_style='info', # 'success', 'info', 'warning', 'danger' or ''
        tooltip=tooltip,
        layout=Layout(flex='1 1 auto', width='auto')
    )


corrbtn = make_analysis_button('Correlation b/w Air quality and weather',
                               'Calculate correlation between weather and air quality')
btn1 = make_analysis_button('Weather', 'Analyse weather data')
btn2 = make_analysis_button('Air Quality', 'Analyse air quality data')
btn3 = make_analysis_button('Top 7 sites for mortality and fertility',
                            'Explore the sites with highest mortality rate')

corrbtn.on_click(calculate_correlation)
btn1.on_click(on_button_clicked)
btn2.on_click(on_button_clicked)
btn3.on_click(on_button_clicked)
analysis = [corrbtn, btn1, btn2, btn3]
button_box = Box(children=analysis, layout=box_layout)
# Shared output area that every button callback draws into.
output_analysis = widgets.Output()

In [None]:
# Assemble the dashboard top to bottom: header, category/site selectors with their
# figure, the instruction line, then the analysis buttons above their output area.
full_layout = VBox([header_box,
                    row1_box,
                    HBox([description_widget]),
                    VBox([button_box, output_analysis])])


display(full_layout)

## Crash and health risks analytics

In [None]:
# Download SA2 region geometry

sa2_geo = api_request("sa2/geometry")
sa2_gpd = gpd.GeoDataFrame.from_features(sa2_geo, crs="EPSG:4283")
# Index by the SA2 code so later `join(..., on="sa2_main11")` calls align on it.
# drop=False keeps the code as a regular column too; that column is then renamed
# to "sa2_code" (the index itself is left as it is).
sa2_gpd = sa2_gpd.set_index("sa2_main11", drop=False)
sa2_gpd = sa2_gpd.rename(columns={"sa2_main11": "sa2_code"})

In [None]:
# Query the number of crashes in each SA2 district

crash_by_sa2 = api_request("crashes/by/sa2")

# Region areas in square kilometres. `sa2_gpd` is stored in a geographic CRS
# (EPSG:4283), where `.area` would be in square degrees and distorted by latitude,
# so project to GDA94 / Australian Albers (EPSG:3577, equal-area, metres) first.
sa2_area_km2 = (sa2_gpd.to_crs(epsg=3577).area / 1e6).rename("area")

# Normalise the number of crashes against the area of the region and set to log scale.
# Both joins align the crash table's "sa2_main11" column with the index of `sa2_gpd`.
crash_by_sa2_df = (
    pd.DataFrame(crash_by_sa2, columns=["sa2.properties.sa2_main11.keyword", "count"])
    .rename(columns={"sa2.properties.sa2_main11.keyword": "sa2_main11"})
    .join(sa2_area_km2, on="sa2_main11")
    .join(sa2_gpd["sa2_name11"], on="sa2_main11")
)
crash_by_sa2_df["normalised_count"] = np.log(crash_by_sa2_df["count"] / crash_by_sa2_df["area"])

In [None]:
# Query the alcohol health risks in each SA2 district

sa2_risks = api_request("crashes/by/sa2/avg/health_risks.alcohol_cons_1_no_3_11_7_13")
# Keep the risk value, the SA2 code and the count, then shorten the code column's name.
sa2_risks_df = pd.DataFrame(sa2_risks, columns=["health_risks.alcohol_cons_1_no_3_11_7_13", "sa2.properties.sa2_main11.keyword", "count"])
sa2_risks_df = sa2_risks_df.rename(columns={"sa2.properties.sa2_main11.keyword": "sa2_main11"})

## Area-normalised number of car crashes per SA2 district

In [None]:
# Choropleth of the log area-normalised crash count per SA2 region.
# `locations` carries the SA2 codes that are matched against the features of
# `sa2_gpd` (presumably via the GeoDataFrame's index-based feature ids - confirm).
fig = px.choropleth_mapbox(
    crash_by_sa2_df,
    geojson=sa2_gpd,
    color="normalised_count",
    locations=crash_by_sa2_df["sa2_main11"],
    center={"lat": -37, "lon": 144.9631},
    mapbox_style="carto-positron",
    hover_name="sa2_name11",
    hover_data=["normalised_count"],
    custom_data=["sa2_name11", "normalised_count"],
    title="<b>Number of Car Crashes in Victorian SA2 Districts per Region Area</b>",
    color_continuous_scale='ylorrd',
    opacity=0.75,
    width=1200,
    height=800,
    zoom=6,
)

# Hover shows the region name (customdata[0]) and the value with two decimals.
fig.update_traces(
    hovertemplate="<b>%{customdata[0]}</b><br>%{customdata[1]:.2f}",
    marker_line_width=0.1,
)

fig.update_layout(
    coloraxis_colorbar={"title": "Log Count<br>per SA2 Area"},
    margin={"r": 50, "t": 50, "l": 50, "b": 50},
)

fig.show()

In [None]:
# Query a subsample of the crashes data

crash_sample = api_request("crashes/sample/5000")
# NOTE(review): no CRS is passed here, unlike the other `from_features` calls.
crash_sample_gdf = gpd.GeoDataFrame.from_features(crash_sample)
# Plain lat/lon columns for plotly; reading .y/.x assumes point geometries.
crash_sample_gdf["lat"] = crash_sample_gdf["geometry"].y
crash_sample_gdf["lon"] = crash_sample_gdf["geometry"].x

## Heatmap of car crash locations

In [None]:
# Density heatmap of the sampled crash locations; `radius` is each point's
# radius of influence on the map.
fig = px.density_mapbox(
    crash_sample_gdf,
    lat='lat',
    lon='lon',
    radius=3,
    center={"lat": -37, "lon": 144.9631},
    mapbox_style="carto-positron",
    title="<b>Heatmap of Car Crashes in Victoria</b>",
    width=1200,
    height=800,
    zoom=6,
)

fig.show()

In [None]:
# Query the average number of fatalities in car crashes vs alcohol health risk per SA2 district
# The scatter cell that follows reads the "properties.FATALITY" and
# "health_risks.alcohol_cons_2_rate_3_11_7_13" columns of this frame.

fatalities_vs_alcohol = api_request("crashes/by/sa2/avg/properties.FATALITY/with/avg/health_risks.alcohol_cons_2_rate_3_11_7_13")
fatalities_vs_alcohol_df = pd.DataFrame(fatalities_vs_alcohol)

## Car crash fatalities vs alcohol consumption rate

In [None]:
# One point per SA2 district with an ordinary-least-squares trend line.
# NOTE(review): plotly's trendline="ols" relies on statsmodels, which the install
# cell at the top does not list - confirm it is available in the environment.
fig = px.scatter(
    fatalities_vs_alcohol_df,
    x="health_risks.alcohol_cons_2_rate_3_11_7_13",
    y="properties.FATALITY",
    width=1200,
    height=800,
    range_y=(-0.02, 0.14),
    trendline="ols",
    trendline_color_override='purple',
    labels={
        "properties.FATALITY": "Average Fatalities",
        "health_risks.alcohol_cons_2_rate_3_11_7_13": "Alcohol Consumption Rate"
    },
    title="Average Crash Fatalities vs Alcohol Consumption Rate by SA2 District"
)
fig.show()

In [None]:
# Query the average SA2 alcohol consumption proportion by the number of serious injuries in each car crash
# NOTE(review): the variables say "pedestrians", but the endpoint groups by
# properties.SERIOUSINJURY - the name looks like a leftover.

alcohol_by_pedestrians = api_request("crashes/by/properties.SERIOUSINJURY/avg/health_risks.alcohol_cons_2_rate_3_11_7_13")
alcohol_by_pedestrians_df = pd.DataFrame(alcohol_by_pedestrians)

## Average alcohol consumption rate vs number of serious injuries

In [None]:
# Bar per serious-injury count; groups whose "count" is 1 or less are left out.
# The y axis is clipped to 2.5-3.5.
fig = px.bar(
    alcohol_by_pedestrians_df[alcohol_by_pedestrians_df["count"] > 1],
    x="properties.SERIOUSINJURY",
    y="health_risks.alcohol_cons_2_rate_3_11_7_13",
    range_y=(2.5, 3.5),
    width=1200,
    height=800,
    title="Average Alcohol Consumption Rate vs Number of Serious Injuries",
    labels={
        "properties.SERIOUSINJURY": "Number of Serious Injuries",
        "health_risks.alcohol_cons_2_rate_3_11_7_13": "Average Alcohol Consumption Rate"
    },
)
fig.show()

In [None]:
# Align the key column name with `sa2_gpd`, then attach the risk values to every
# SA2 region; how="left" keeps regions that have no risk record.
sa2_risks_df = sa2_risks_df.rename(columns={"sa2_main11": "sa2_code"})
health_geo = sa2_gpd.merge(sa2_risks_df, on="sa2_code", how="left")

## Choropleth of alcohol consumption with density of crashes involving injuries or fatalities

In [None]:
from folium.plugins import MarkerCluster

# Base map. Deliberately NOT called `map`: binding that name in a notebook
# shadows the built-in map() for every cell executed afterwards.
crash_map = folium.Map(location=[-37.8136, 144.9631], zoom_start=6)

# Create health risk choropleth: regions shaded by the alcohol health-risk column;
# regions with no value (left-join misses) are filled blue
choropleth = folium.Choropleth(
    geo_data=health_geo,
    data=health_geo,
    columns=['sa2_code', 'health_risks.alcohol_cons_1_no_3_11_7_13'],
    key_on='properties.sa2_code',
    fill_color='Greys',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='health_risks.alcohol_cons_1_no_3_11_7_13 by Region',
    highlight=True,
    nan_fill_color="blue",
).add_to(crash_map)

# Cluster layer that groups nearby crash markers when zoomed out
marker_cluster = MarkerCluster().add_to(crash_map)

# Add crash bubbles: one circle per sampled crash, placed at the point's (lat, lon)
for index, row in crash_sample_gdf.iterrows():
    folium.CircleMarker(
        location=[row['geometry'].y, row['geometry'].x],
        radius=4, # Adjust the size for better visualization
        color='crimson',
        fill=True,
        stroke=False,
        fill_color='crimson',
        fill_opacity=0.4,
        tooltip=f"{row['LGA_NAME']}",
    ).add_to(marker_cluster)

# Hover tooltip on the choropleth regions showing the area code and risk value
tooltip = folium.GeoJsonTooltip(
    fields=["sa2_code", "health_risks.alcohol_cons_1_no_3_11_7_13"],
    aliases=["Area:", "Alc cons:"],
    localize=True,
    sticky=False,
    labels=True,
    max_width=800,
).add_to(choropleth.geojson)

folium.LayerControl().add_to(crash_map)
crash_map

# Tweet Analysis

## Retrieve data from API and load into dataframes

In [None]:
# Retrieve SA3 geometries and build a GeoDataFrame from the GeoJSON features
geo_data = api_request("get-sa3-geojson")
SA3_GDF = gpd.GeoDataFrame.from_features(geo_data, crs="EPSG:4326")

# Retrieve SUDO data (one record per SA3, with nested per-dataset sub-records)
joined_data = pd.DataFrame(api_request('sa3-joined/all'))

# Attach the SUDO records to the geometries; the left join keeps every SA3 geometry
gdf = pd.merge(
    left=SA3_GDF,
    right=joined_data,
    how='left',
    left_on=['SA3_CODE21'],
    right_on=['sa3_code_2021'],
)

# Flatten nested columns: each one is expanded into top-level columns
# (joined back on the row index) and the nested originals are then dropped
nested_columns = ["age_personal_income_sa3", "school_sa3", "person_age_sex_sa3"]
for nested_column in nested_columns:
    gdf = gdf.join(pd.json_normalize(gdf[nested_column]))
gdf = gdf.drop(columns=nested_columns)

## Population data vs Tweet Activity

In [None]:
# Tweets per resident for each SA3 area.
# An area with zero population but a non-zero tweet count divides to +/-inf,
# which fillna() leaves untouched and which would then dominate nlargest() and
# the ratio bar chart. Normalise those to NaN so they are zero-filled like any
# other missing value.
gdf['twitter_count_ratio'] = (
    gdf['twitter_count'] / gdf['tot_persons_c21_p']
).replace([np.inf, -np.inf], np.nan)

# Replace every remaining missing value in the frame with 0
gdf.fillna(0, inplace=True)

In [None]:
# Ten highest-ranked SA3 rows by population, by tweet count and by
# tweets-per-resident ratio, in that order
top_10_pop, top_10_twc, top_10_rt = (
    gdf.nlargest(10, ranking_column)
    for ranking_column in ('tot_persons_c21_p', 'twitter_count', 'twitter_count_ratio')
)

In [None]:
# Lay out all four population / tweet-activity charts on one 2x2 figure
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 8))
(tweet_ax, population_ax), (ratio_ax, scatter_ax) = axes

# Top-left: tweet counts of the ten SA3 areas with the most tweets
tweet_ax.bar(top_10_twc['sa3_code_2021'], top_10_twc['twitter_count'])
tweet_ax.set(
    title='Twitter Count for Top 10 SA3 Codes',
    xlabel='SA3 Code',
    ylabel='Twitter Count',
)
tweet_ax.tick_params(axis='x', rotation=45)

# Top-right: the ten most populous SA3 areas, female bars stacked on the male bars
population_codes = top_10_pop['sa3_code_2021']
male_population = top_10_pop['tot_persons_c21_m']
population_ax.bar(population_codes, male_population, color='blue', label='Male')
population_ax.bar(
    population_codes,
    top_10_pop['tot_persons_c21_f'],
    bottom=male_population,
    color='pink',
    label='Female',
)
population_ax.set(
    title='Male and Female Population for Top 10 SA3 Codes',
    xlabel='SA3 Code',
    ylabel='Total Population',
)
population_ax.tick_params(axis='x', rotation=45)
population_ax.legend()

# Bottom-left: the ten highest tweets-per-resident ratios
ratio_ax.bar(top_10_rt['sa3_code_2021'], top_10_rt['twitter_count_ratio'], color='green')
ratio_ax.set(
    title='Twitter Count Ratio for Top 10 SA3 Codes',
    xlabel='SA3 Code',
    ylabel='Twitter Count Ratio',
)
ratio_ax.tick_params(axis='x', rotation=45)

# Bottom-right: population vs tweet count for those same top-ratio areas.
# One scatter call per area so each gets its own colour and legend entry.
for population, tweet_count, sa3_code in zip(
    top_10_rt['tot_persons_c21_p'],
    top_10_rt['twitter_count'],
    top_10_rt['sa3_code_2021'],
):
    scatter_ax.scatter(population, tweet_count, label=sa3_code)

scatter_ax.set(
    xlabel='Total Population',
    ylabel='Twitter Count',
    title='Total Population vs Twitter Count for Top 10 SA3 Codes with Highest Twitter Count Ratio',
)
scatter_ax.legend(title='SA3 Code', bbox_to_anchor=(1.05, 1), loc='upper left')
scatter_ax.grid(True)

# Tighten the spacing between panels, then render
plt.tight_layout()
plt.show()

## Average Sentiment vs Median Age

In [None]:
import folium
# Map comparing sentiment and age: two choropleth layers on one interactive map,
# switchable through the layer control. Both layers share the same click popup.
popup_fields = ["median_age_persons", "average_sentiment", "SA3_NAME21", "SA3_CODE21"]

# Layer 1: average tweet sentiment (creates the map and carries the legend)
twt = gdf.explore(
    column="average_sentiment",
    scheme="NaturalBreaks",
    legend=True,
    legend_kwds={"label": "Average Twitter Sentiment", "orientation": "horizontal"},
    popup=list(popup_fields),
    tooltip=False,
    name="sentiment",
)

# Layer 2: median age, drawn onto the same map object
gdf.explore(
    m=twt,
    column="median_age_persons",
    scheme="NaturalBreaks",
    popup=list(popup_fields),
    tooltip=False,
    name="median_age",
)

folium.LayerControl().add_to(twt)
twt

## Average Sentiment vs Education

In [None]:
import folium
import pandas as pd
import matplotlib.pyplot as plt
from io import BytesIO
import base64
import json

def create_graph(sa3_code):
    """Draw the education bar chart for one SA3 area and return it as PNG bytes.

    Looks up the first row of the notebook-level ``gdf`` whose ``sa3_code_2021``
    equals ``sa3_code`` and plots its year 12 / 11 / 10 / 9 totals as four bars.

    Args:
        sa3_code: Value to match against ``gdf['sa3_code_2021']``.

    Returns:
        A ``BytesIO`` holding the rendered PNG, rewound to position 0.

    Raises:
        IndexError: If no row of ``gdf`` matches ``sa3_code``.
    """
    # Bar label -> gdf column holding that level's total, in display order
    level_columns = {
        'year12': 'p_y12e_tot',
        'year11': 'p_y11e_tot',
        'year10': 'p_y10e_tot',
        'year9': 'p_y9e_tot',
    }

    plt.figure(figsize=(7, 3))
    region = gdf[gdf['sa3_code_2021'] == sa3_code]

    bar_labels = list(level_columns)
    bar_heights = [region[column].iloc[0] for column in level_columns.values()]
    plt.bar(bar_labels, bar_heights, color=['blue', 'pink', 'red', 'green'])

    region_name = region["SA3_NAME21"].iloc[0]
    plt.title(f'Highest Level of Education Completed for Population in {region_name}')
    plt.xlabel('Highest Level Completed')
    plt.ylabel('Population')
    plt.xticks(rotation=45)
    plt.tight_layout()

    # Render into memory, release the figure, and rewind so callers read from the start
    png_buffer = BytesIO()
    plt.savefig(png_buffer, format='png')
    plt.close()
    png_buffer.seek(0)
    return png_buffer

# Fresh Folium map that will hold one clickable polygon per SA3 row
m = folium.Map(location=[-27.5, 153], zoom_start=4)

# For every SA3 row: render its education chart, embed the PNG as a base64
# data URI inside an <img> tag, and attach that as the polygon's popup
for sa3_code, geometry in zip(gdf['sa3_code_2021'], gdf['geometry']):
    chart_png = create_graph(sa3_code).getvalue()
    encoded_chart = base64.b64encode(chart_png).decode()
    popup_html = f'<img src="data:image/png;base64,{encoded_chart}" alt="Graph">'
    region_popup = folium.Popup(popup_html, max_width=700)
    folium.GeoJson(geometry, popup=region_popup).add_to(m)

# Display the map
m


In [None]:
# Map - Sentiment vs Education
# Share of each area's total population counted in the Year 12 column
gdf['educated_ratio'] = gdf['p_y12e_tot'].div(gdf['tot_persons_c21_p'])

# Popup field -> label shown for it, kept together so the two lists stay aligned
popup_labels = {
    'average_sentiment': 'Average Sentiment',
    'educated_ratio': 'educated_ratio',
    "SA3_NAME21": "SA3 Name",
    "SA3_CODE21": "SA3 Code",
}

m = gdf.explore(
    column='educated_ratio',
    scheme="NaturalBreaks",
    legend=True,
    legend_kwds={"label": "Ratio of people that completed Y12 vs total area population", "orientation": "horizontal"},
    popup=list(popup_labels),
    popup_kwds={'aliases': list(popup_labels.values())},
    tooltip=False,
)
m

In [None]:
# Ten SA3 rows with the highest ratio of Year 12 completions to total population
top_10_educated = gdf.nlargest(10, 'educated_ratio')

# One scatter call per area so each gets its own colour and legend entry
for i, row in top_10_educated.iterrows():
    plt.scatter(row['educated_ratio'], row['average_sentiment'], label=row['sa3_code_2021'])

plt.xlabel('Education Ratio')
plt.ylabel('Average Sentiment')
plt.title('Education Ratio vs Average Sentiment for Top 10 most educated SA3 Codes by Education Ratio')
plt.legend(title='SA3 Code', bbox_to_anchor=(1.05, 1), loc='upper left')

# Render explicitly, as the other matplotlib cells in this notebook do; without
# it the cell's last expression echoes the Legend object's repr as output.
plt.show()

## Average Sentiment vs Income

In [None]:
# One point per row of gdf: average tweet sentiment on the x-axis against
# median weekly personal income on the y-axis
plt.scatter(
    x=gdf['average_sentiment'],
    y=gdf['median_tot_prsnl_inc_weekly'],
)
plt.xlabel(xlabel='average sentiment')
plt.ylabel(ylabel='median personal weekly income')
plt.show()

# Retrieving the IP address of the Elasticsearch endpoint

```bash
kubectl get services --namespace elastic
kubectl describe service elasticsearch-master --namespace elastic
```

Copy one of the IP addresses listed under the `Endpoints` attribute of the `kubectl describe` output and use it below.
