In [1]:
!pip install --quiet geocoder
print('pip installed geocoder')

pip installed geocoder


In [2]:
import pandas as pd

from warnings import filterwarnings
# Suppress every FutureWarning for the rest of the session.
filterwarnings('ignore', category=FutureWarning)

# Kaggle input file holding one row per city with its air-quality and
# water-pollution scores.
CITIES = '/kaggle/input/water-and-air-quality/Cities1.csv'

df = pd.read_csv(CITIES)
df.head(n=5)

Unnamed: 0,City,Region,Country,AirQuality,WaterPollution
0,New York City,New York,United States of America,46.816038,49.50495
1,"Washington, D.C.",District of Columbia,United States of America,66.129032,49.107143
2,San Francisco,California,United States of America,60.514019,43.0
3,Berlin,,Germany,62.36413,28.612717
4,Los Angeles,California,United States of America,36.621622,61.299435


In [3]:
from arrow import now
from geocoder import arcgis
from os.path import exists

OUTPUT_FOLDER = '/kaggle/working/'
OUTPUT_FILE = 'locations.csv'

# Geocoding every city takes a little over half an hour, so the coordinates
# are cached in a CSV file and only looked up again when that file is missing.
cache_file = OUTPUT_FOLDER + OUTPUT_FILE

if exists(path=cache_file):
    result_df = pd.read_csv(filepath_or_buffer=cache_file)
    print('read {} rows from {}'.format(len(result_df), cache_file))
else:
    time_start = now()
    missing = float('nan')
    result = {}
    for city in df['City'].unique().tolist():
        # NOTE(review): only the bare city name is sent to the geocoder, so
        # same-named cities in different countries cannot be told apart.
        latlng = arcgis(location=city).latlng
        # A lookup that yields no coordinates is stored as an explicit pair of
        # missing values. Every entry then has exactly two elements, so one
        # failed city cannot break the frame construction below and discard
        # the whole run before it is saved.
        result[city] = list(latlng) if latlng else [missing, missing]
        # Progress report: elapsed time, current city and count every 200 cities.
        if len(result) % 200 == 0:
            print(now() - time_start, city, len(result))
    # Naming the value columns up front keeps the frame three columns wide
    # after reset_index() even when there were no cities at all.
    result_df = pd.DataFrame.from_dict(
        data=result,
        orient='index',
        columns=['latitude', 'longitude'],
    ).reset_index()
    result_df.columns = ['City', 'latitude', 'longitude']
    result_df.to_csv(path_or_buf=cache_file, index=False)
    print('done in {}'.format(now() - time_start))
result_df.head()

0:01:54.408455 Bochum 200
0:03:43.921746 Talca 400
0:05:42.997844 Ronda 600
0:07:35.359577 Saint Paul 800
0:09:28.235260 Cincinnati 1000
0:11:21.974384 Shaoguan 1200
0:13:17.349233 Zielona Gora 1400
0:15:12.706258 Veles 1600
0:17:05.702831 Preston 1800
0:19:04.391114 Visaginas 2000
0:20:56.319161 Bani Walid 2200
0:22:49.760254 Sumperk 2400
0:24:46.823710 Costa Mesa 2600
0:26:40.411423 Longford 2800
0:28:36.567573 Itajai 3000
0:30:29.507935 Fort Walton Beach 3200
0:32:24.514350 Te Anau 3400
0:34:16.562002 Kyaukpyu 3600
done in 0:36:14.410984


Unnamed: 0,City,latitude,longitude
0,New York City,40.71453,-74.00712
1,"Washington, D.C.",38.89037,-77.03196
2,San Francisco,37.77712,-122.41966
3,Berlin,52.51604,13.37691
4,Los Angeles,34.05357,-118.24545


In [4]:
# Attach the geocoded coordinates to every city row.
# Coordinate columns left over from an earlier run of this cell are dropped
# first, so re-running the cell does not produce suffixed duplicates
# (latitude_x / latitude_y) that the plotting cells cannot find.
df = (
    df.drop(columns=['latitude', 'longitude'], errors='ignore')
    .merge(right=result_df, on='City', how='inner')
)
df.head()

Unnamed: 0,City,Region,Country,AirQuality,WaterPollution,latitude,longitude
0,New York City,New York,United States of America,46.816038,49.50495,40.71453,-74.00712
1,"Washington, D.C.",District of Columbia,United States of America,66.129032,49.107143,38.89037,-77.03196
2,San Francisco,California,United States of America,60.514019,43.0,37.77712,-122.41966
3,Berlin,,Germany,62.36413,28.612717,52.51604,13.37691
4,Los Angeles,California,United States of America,36.621622,61.299435,34.05357,-118.24545


In [5]:
from plotly.express import scatter_mapbox
# Draw one map per measure; the two maps differ only in the column that
# drives the marker colour.
for colour_column in ['AirQuality', 'WaterPollution']:
    city_map = scatter_mapbox(
        data_frame=df,
        lat='latitude',
        lon='longitude',
        color=colour_column,
        hover_name='City',
        hover_data=['Country'],
        mapbox_style='open-street-map',
        zoom=1,
        height=900,
    )
    city_map.show()

We had to wait a half-hour to geocode all our locations, but now we have a zoomable map of all the data.

In [6]:
from plotly.express import histogram
# Show the distribution of each measure as its own histogram.
for measure in ('AirQuality', 'WaterPollution'):
    distribution = histogram(data_frame=df, x=measure)
    distribution.show()

It should be pretty clear at this point that a lot of our data could be charitably described as approximate and uncharitably described as fake; a lot of this data appears to have been converted from a five-point scale to a 100-point scale.

In [7]:
from plotly.express import scatter

# Plot the two measures against each other, one marker per row of df.
pollution_vs_air = scatter(
    data_frame=df,
    x='WaterPollution',
    y='AirQuality',
    hover_name='City',
)
pollution_vs_air

It's clear from this plot we have a lot of data that has been converted from some other scale. Let's see what we get when we clean that up a little.

In [8]:
# Scores that sit exactly on a multiple of 1/8 or 1/3 of the 0-100 range are
# treated as values rescaled from a coarser scale; a row is dropped when either
# measure has such a value.
# NOTE(review): isin() compares floats for exact equality, so a stored value
# that differs from one of these products in its last bits is not excluded.
exclude = {100.00 * fraction for fraction in [0, 1/8, 1/4, 3/8, 1/3, 1/2, 5/8, 2/3, 3/4, 7/8, 1]}
on_coarse_scale = df['WaterPollution'].isin(exclude) | df['AirQuality'].isin(exclude)

# Lookup table from country name to region.
continents_csv = '/kaggle/input/country-mapping-iso-continent-region/continents2.csv'
regions = pd.read_csv(filepath_or_buffer=continents_csv, usecols=['name', 'region'])

# NOTE(review): this is an inner join on the exact country name, so cities whose
# Country has no identically spelled 'name' in the lookup are silently dropped.
reduced_df = df[~on_coarse_scale].merge(right=regions, left_on='Country', right_on='name', how='inner')

# Scatter of the remaining cities, coloured by region, with a single OLS
# trendline fitted over all points.
scatter(
    data_frame=reduced_df,
    x='WaterPollution',
    y='AirQuality',
    hover_name='City',
    hover_data=['Country'],
    color='region',
    height=900,
    trendline='ols',
    trendline_scope='overall',
)

Our cleanup effort is imperfect, but what we have left tells us something interesting: high air quality and low water pollution tend to go together. Our linear model has a negative slope (showing that air quality and water pollution are negatively correlated), but the correlation is weak (the model has an $R^2$ of 0.4). We do see a broad tendency for cities in different regions (read: continents) to be more concentrated in one quadrant rather than another. Let's break that down a little.

In [9]:
# Collapse the cleaned data to one row per region holding the mean of each measure.
region_means = (
    reduced_df[['region', 'AirQuality', 'WaterPollution']]
    .groupby(by='region')
    .mean()
    .reset_index()
)

# Plot the regional means on the full 0-100 range of both axes.
scatter(
    data_frame=region_means,
    x='WaterPollution',
    y='AirQuality',
    hover_name='region',
    color='region',
    range_x=[0, 100],
    range_y=[0, 100],
    height=900,
)

Plotting the means on the same axes as the original data is undramatic but tells a pretty simple story. It is important to remember that our original data is not necessarily a representative sample of the regions it was drawn from, and that our cleanup arguably changed these results, so it would be best to interpret them carefully.