# Setup
### Importing the packages

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import csv
from pprint import pprint
from collections import defaultdict

### Checking the Pandas version

In [41]:
pd.__version__

'1.4.1'

# Pure Python - Analytics
### Task
* Read in header & first 5 lines of the file
* Compute a total of the population
* Compute a group total of population by country
* Return the top 5 populous countries
### Read Data - File Reader

In [42]:
lines = 0
with open('worldcities.csv', 'r', encoding='utf8') as file:
    for line in file:
        print(line)
        lines += 1
        if lines > 5:
            break

"city","city_ascii","lat","lng","country","iso2","iso3","admin_name","capital","population","id"

"Tokyo","Tokyo","35.6897","139.6922","Japan","JP","JPN","Tōkyō","primary","37977000","1392685764"

"Jakarta","Jakarta","-6.2146","106.8451","Indonesia","ID","IDN","Jakarta","primary","34540000","1360771077"

"Delhi","Delhi","28.6600","77.2300","India","IN","IND","Delhi","admin","29617000","1356872604"

"Mumbai","Mumbai","18.9667","72.8333","India","IN","IND","Mahārāshtra","admin","23355000","1356226629"

"Manila","Manila","14.5958","120.9772","Philippines","PH","PHL","Manila","primary","23088000","1608618140"



In [43]:
extract = line.replace('"', '').split(',')
print(f'{extract[1]} = {int(extract[9]):,}')

Manila = 23,088,000


### Read Data - CSV Reader
* Expand on file reader by parsing the CSV in each row
* CSV Reader yields back an array of data per line
* Improvement - We can index into the array by value to find population and country

In [44]:
lines = 0
with open('worldcities.csv', 'r', encoding='utf8') as file:
    reader = csv.reader(file, delimiter=',', quotechar='"')
    for line in reader:
        print(line)
        lines += 1
        if lines > 5:
            break

['city', 'city_ascii', 'lat', 'lng', 'country', 'iso2', 'iso3', 'admin_name', 'capital', 'population', 'id']
['Tokyo', 'Tokyo', '35.6897', '139.6922', 'Japan', 'JP', 'JPN', 'Tōkyō', 'primary', '37977000', '1392685764']
['Jakarta', 'Jakarta', '-6.2146', '106.8451', 'Indonesia', 'ID', 'IDN', 'Jakarta', 'primary', '34540000', '1360771077']
['Delhi', 'Delhi', '28.6600', '77.2300', 'India', 'IN', 'IND', 'Delhi', 'admin', '29617000', '1356872604']
['Mumbai', 'Mumbai', '18.9667', '72.8333', 'India', 'IN', 'IND', 'Mahārāshtra', 'admin', '23355000', '1356226629']
['Manila', 'Manila', '14.5958', '120.9772', 'Philippines', 'PH', 'PHL', 'Manila', 'primary', '23088000', '1608618140']


In [45]:
extract = line
extract

['Manila',
 'Manila',
 '14.5958',
 '120.9772',
 'Philippines',
 'PH',
 'PHL',
 'Manila',
 'primary',
 '23088000',
 '1608618140']

In [46]:
print(f'{extract[1]} = {int(extract[9]):,}')

Manila = 23,088,000


### Read Data - Dict Reader
* The Dict Reader offers further improvement
* Note our header row has been consumed and used to give us a name-value dictionary for each row

In [47]:
lines = 0
with open('worldcities.csv', 'r', encoding='utf8') as file:
    reader = csv.DictReader(file, delimiter=',', quotechar='"')
    for line in reader:
        print(line)
        lines += 1
        if lines > 4:
            break

{'city': 'Tokyo', 'city_ascii': 'Tokyo', 'lat': '35.6897', 'lng': '139.6922', 'country': 'Japan', 'iso2': 'JP', 'iso3': 'JPN', 'admin_name': 'Tōkyō', 'capital': 'primary', 'population': '37977000', 'id': '1392685764'}
{'city': 'Jakarta', 'city_ascii': 'Jakarta', 'lat': '-6.2146', 'lng': '106.8451', 'country': 'Indonesia', 'iso2': 'ID', 'iso3': 'IDN', 'admin_name': 'Jakarta', 'capital': 'primary', 'population': '34540000', 'id': '1360771077'}
{'city': 'Delhi', 'city_ascii': 'Delhi', 'lat': '28.6600', 'lng': '77.2300', 'country': 'India', 'iso2': 'IN', 'iso3': 'IND', 'admin_name': 'Delhi', 'capital': 'admin', 'population': '29617000', 'id': '1356872604'}
{'city': 'Mumbai', 'city_ascii': 'Mumbai', 'lat': '18.9667', 'lng': '72.8333', 'country': 'India', 'iso2': 'IN', 'iso3': 'IND', 'admin_name': 'Mahārāshtra', 'capital': 'admin', 'population': '23355000', 'id': '1356226629'}
{'city': 'Manila', 'city_ascii': 'Manila', 'lat': '14.5958', 'lng': '120.9772', 'country': 'Philippines', 'iso2': 'P

In [48]:
extract = line
extract

{'city': 'Manila',
 'city_ascii': 'Manila',
 'lat': '14.5958',
 'lng': '120.9772',
 'country': 'Philippines',
 'iso2': 'PH',
 'iso3': 'PHL',
 'admin_name': 'Manila',
 'capital': 'primary',
 'population': '23088000',
 'id': '1608618140'}

In [49]:
print(f"{extract['city_ascii']} = {int(extract['population']):,}")

Manila = 23,088,000


### Sum Population (With Floats)
* We're expecting an integer, but receive a float
* We need to parse float then int
* Then we can get an answer

In [50]:
# Handle casting to float then to int
population = 0
with open('worldcities.csv', 'r', encoding='utf8') as file:
    reader = csv.DictReader(file, delimiter=',', quotechar='"')
    for line in reader:
        population += 0 if line['population'] == '' else int(float(line['population']))
print(f'Population = {population:,}')

Population = 4,155,400,545


### Population by County

In [51]:
country_population = defaultdict(int)

with open('worldcities.csv', 'r', encoding='utf8') as file:
    reader = csv.DictReader(file, delimiter=',', quotechar='"')
    for line in reader:
        country_population[line['country']] += 0 if line['population'] == '' else int(float(line['population']))

for key, value in country_population.items():
    print(f'{key} = {value:,}')

Japan = 148,273,773
Indonesia = 85,283,563
India = 270,170,371
Philippines = 62,547,674
China = 1,388,868,247
Brazil = 135,416,434
Korea, South = 58,770,399
Mexico = 103,464,676
Egypt = 38,873,354
United States = 400,521,452
Russia = 105,990,872
Thailand = 23,598,357
Argentina = 36,192,503
Bangladesh = 23,288,987
Nigeria = 54,771,880
Turkey = 70,958,511
Pakistan = 56,952,881
Iran = 48,493,926
Congo (Kinshasa) = 27,429,409
Vietnam = 37,934,404
France = 38,429,395
United Kingdom = 72,555,282
Peru = 21,273,383
Colombia = 28,491,631
Angola = 11,609,969
Malaysia = 19,509,922
Hong Kong = 7,347,000
Sudan = 15,752,339
Chile = 13,775,014
Saudi Arabia = 22,327,128
Tanzania = 11,276,325
Iraq = 18,624,715
Singapore = 5,745,000
Kenya = 11,135,325
Burma = 12,306,878
Canada = 37,747,786
Australia = 23,563,278
Côte D’Ivoire = 8,012,496
Spain = 22,741,191
South Africa = 15,430,094
Morocco = 16,986,585
Jordan = 6,057,680
Afghanistan = 7,474,525
Germany = 58,243,288
Algeria = 10,828,214
Bolivia = 9,579,0

### Most Populous Countries

In [52]:
# Chain array operations (pick top & reverse sort)
grp = sorted(country_population.items(), key=lambda kv: kv[1])[-5:][::-1]
for country, populatio in grp:
    print(f'{country} {population:,}')

China 4,155,400,545
United States 4,155,400,545
India 4,155,400,545
Japan 4,155,400,545
Brazil 4,155,400,545


### Notes
* Our pure python approach
    * Wrestling with and reading files
    * Deciding on a reader class
    * Missing data & different types
    * Writing loops to read data & compute aggregates
* More productive ways to perform analysis
    * If you've done a lot of python programming not SO bad, but slow
    * If you're getting started it's a heavy lift to be productive