In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from plotly.offline import *
from plotly.graph_objs import *

# Initialise plotly's offline notebook mode so that iplot() renders figures inline.
init_notebook_mode(connected=True)

In [29]:
# Raw table read from the CSV file; the cells below select and rename its columns.
UWWTPS_CSV = 'T_UWWTPS.csv'
df = pd.read_csv(UWWTPS_CSV)

In [30]:
# Keep only the columns used by the plots below, under shorter names.
df_toclean = df[['aggID', 'uwwLatitude', 'uwwLongitude',
                 'uwwLoadEnteringUWWTP', 'uwwCapacity']].rename(columns={
    'uwwLatitude': 'Latitude',
    'uwwLongitude': 'Longitude',
    'uwwLoadEnteringUWWTP': 'LoadEntering',
    'uwwCapacity': 'Capacity',
})
df_indexed = df_toclean.set_index('aggID')

# Drop incomplete rows *before* aggID becomes the index: dropna() only inspects
# column values, so a row with a missing aggID would otherwise survive as a NaN
# index label (and later show up as a hover label on the map).
df_no_missing = df_toclean.dropna().set_index('aggID')

# Sanity check: neither the columns nor the aggID index may contain missing values.
if df_no_missing.isnull().any().any() or pd.isnull(df_no_missing.index).any():
    raise Exception('At least one missing element still exist!')

In [31]:
# Scatter plot of LoadEntering (x) against Capacity (y), one marker per row.
load_values = df_no_missing['LoadEntering'].tolist()
capacity_values = df_no_missing['Capacity'].tolist()

trace = Scatter(
    x=load_values,
    y=capacity_values,
    mode='markers',
    marker={'color': '#FFBAD2', 'line': {'width': 1}},
)
layout = Layout(
    title='Load Entering vs. Capacity',
    xaxis={'title': 'Load Entering'},
    yaxis={'title': 'Capacity'},
    showlegend=False,
)
fig = Figure(data=[trace], layout=layout)
iplot(fig)

### Things I am going to do next:
0. Exclude points with zero capacity or zero load entering from the scatter plot.
1. Fit a linear regression (least squares).
2. cut off 20%-40% of data points based on distribution.
3. Color the points by country and check whether there is a relationship between different countries.
4. find population and temperature to get volume entering WWTPs and use volume data to predict capacity.

In [32]:
# Histogram of the LoadEntering column.
load_histogram = Histogram(x=df_no_missing['LoadEntering'].tolist())
iplot([load_histogram])

In [36]:
# Map of every plant left after cleaning; hovering a marker shows its aggID.
site_lat = df_no_missing.Latitude
site_lon = df_no_missing.Longitude
loc_name = df_no_missing.index

# A plain list and plain dicts are used instead of the deprecated Data(...) and
# Marker(...) wrappers, matching how the scatter plot above is built.
data = [
    # Layer carrying the hover labels. hoverinfo must be one of plotly's flags
    # ('lon', 'lat', 'text', 'name', 'all', 'none', 'skip'); 'text' displays the
    # aggID values passed through `text`.
    Scattermapbox(lat=site_lat,
                  lon=site_lon,
                  mode='markers',
                  marker=dict(size=20,
                              color='rgb(255, 0, 0)'),
                  text=loc_name,
                  hoverinfo='text'),
    # Second marker layer at the same coordinates, without hover labels.
    # NOTE(review): both layers use size 20, so this one presumably covers the
    # first layer completely -- confirm whether a smaller size was intended.
    Scattermapbox(lat=site_lat,
                  lon=site_lon,
                  mode='markers',
                  marker=dict(size=20,
                              color='rgb(242, 177, 172)'),
                  hoverinfo='none'),
]

# NOTE(review): Mapbox-hosted styles such as 'light' normally need an access
# token (layout.mapbox.accesstoken); none is set in this cell -- confirm it is
# configured elsewhere.
# NOTE(review): the centre (lat 50, lon 50) looks east of most of Europe -- TODO confirm.
layout = Layout(title='WWTPs in Europe',
                autosize=True,
                hovermode='closest',
                showlegend=False,
                mapbox=dict(bearing=0,
                            center=dict(lat=50, lon=50),
                            pitch=0,
                            zoom=3,
                            style='light'))
fig = dict(data=data, layout=layout)
iplot(fig)