# Ch. 2 Introductory Examples

In [2]:
# peek into file: read only the first line to see the raw record format
path = '/home/Neil/python_for_data_analysis/pydata-book/datasets/bitly_usagov/example.txt'
# a context manager closes the file handle as soon as the line is read
with open(path) as f:
    first_line = f.readline()
first_line

'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'

In [3]:
# parse json: every line of the file is decoded as its own JSON document
import json
# a context manager closes the file once all lines have been parsed
with open(path) as f:
    records = [json.loads(line) for line in f]
records[0]

{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
 'al': 'en-US,en;q=0.8',
 'c': 'US',
 'cy': 'Danvers',
 'g': 'A6qOVH',
 'gr': 'MA',
 'h': 'wfLQtf',
 'hc': 1331822918,
 'hh': '1.usa.gov',
 'l': 'orofrog',
 'll': [42.576698, -70.954903],
 'nk': 1,
 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
 't': 1331923247,
 'tz': 'America/New_York',
 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}

In [4]:
# timezone ('tz') field of the first parsed record
records[0]['tz']

'America/New_York'

In [10]:
# check timezone frequency
# gather the 'tz' value of each record; records without a 'tz' key are skipped
tzs = [rec['tz'] for rec in records if 'tz' in rec]

# option 1: the hard Python way
def get_counts(sequence):
    """Count how many times each item occurs in ``sequence``.

    Returns a plain dict mapping each distinct item to its number of
    occurrences.
    """
    tally = {}
    for item in sequence:
        # dict.get supplies 0 for an item that has not been seen yet
        tally[item] = tally.get(item, 0) + 1
    return tally

# option 1-2: Python way using standard Python library
from collections import defaultdict

def get_counts2(sequence):
    """Count how many times each item occurs in ``sequence``.

    Same tally as a hand-written dict counter, but built on
    ``defaultdict(int)`` so a missing key starts at 0 automatically.
    Returns the ``defaultdict`` itself.
    """
    tally = defaultdict(int)
    for item in sequence:
        # reading a missing key yields int() == 0, so no membership test is needed
        tally[item] = tally[item] + 1
    return tally

# print() is used for the first result because a cell only displays
# its last expression automatically
print(get_counts(tzs))
get_counts2(tzs)

# option 2: Pandas way
# TODO: not implemented in this cell yet


{'America/New_York': 1251, 'America/Denver': 191, 'America/Sao_Paulo': 33, 'Europe/Warsaw': 16, '': 521, 'America/Los_Angeles': 382, 'Asia/Hong_Kong': 10, 'Europe/Rome': 27, 'Africa/Ceuta': 2, 'Europe/Madrid': 35, 'Asia/Kuala_Lumpur': 3, 'Asia/Nicosia': 1, 'Europe/London': 74, 'Pacific/Honolulu': 36, 'America/Chicago': 400, 'Europe/Malta': 2, 'Europe/Lisbon': 8, 'Europe/Paris': 14, 'Europe/Copenhagen': 5, 'America/Mazatlan': 1, 'Europe/Dublin': 3, 'Europe/Brussels': 4, 'America/Vancouver': 12, 'Europe/Amsterdam': 22, 'Europe/Prague': 10, 'Europe/Stockholm': 14, 'America/Anchorage': 5, 'Asia/Bangkok': 6, 'Europe/Berlin': 28, 'America/Rainy_River': 25, 'Europe/Budapest': 5, 'Asia/Tokyo': 37, 'Europe/Vienna': 6, 'America/Phoenix': 20, 'Asia/Jerusalem': 3, 'Asia/Karachi': 3, 'America/Bogota': 3, 'America/Indianapolis': 20, 'America/Montreal': 9, 'Asia/Calcutta': 9, 'Europe/Skopje': 1, 'Asia/Beirut': 4, 'Australia/NSW': 6, 'Chile/Continental': 6, 'America/Halifax': 4, 'America/Edmonton': 6,

defaultdict(int,
            {'': 521,
             'Africa/Cairo': 3,
             'Africa/Casablanca': 1,
             'Africa/Ceuta': 2,
             'Africa/Johannesburg': 1,
             'Africa/Lusaka': 1,
             'America/Anchorage': 5,
             'America/Argentina/Buenos_Aires': 1,
             'America/Argentina/Cordoba': 1,
             'America/Argentina/Mendoza': 1,
             'America/Bogota': 3,
             'America/Caracas': 1,
             'America/Chicago': 400,
             'America/Chihuahua': 2,
             'America/Costa_Rica': 1,
             'America/Denver': 191,
             'America/Edmonton': 6,
             'America/Guayaquil': 2,
             'America/Halifax': 4,
             'America/Indianapolis': 20,
             'America/La_Paz': 1,
             'America/Lima': 1,
             'America/Los_Angeles': 382,
             'America/Managua': 3,
             'America/Mazatlan': 1,
             'America/Mexico_City': 15,
             'America/Monte

In [41]:
# most common timezones (ranking) - option #1

# tally the timezone occurrences with the plain-dict counter
dic = get_counts(tzs)

def top_counts(count_dict, n=10):
    """Return the ``n`` largest entries of a count mapping.

    Parameters
    ----------
    count_dict : dict
        Maps each key to its count.
    n : int, default 10
        How many of the highest-count entries to return.

    Returns
    -------
    list of (count, key) tuples
        Sorted ascending, so the entry with the highest count is last.
        Entries with equal counts are ordered by key. The list is empty
        when ``n`` is zero or negative.
    """
    # A slice of [-0:] is the same as [0:] and would return every entry,
    # so a non-positive n is handled explicitly.
    if n <= 0:
        return []
    value_key_pairs = sorted((count, key) for key, count in count_dict.items())
    return value_key_pairs[-n:]

# ten highest-count timezones (default n=10), in ascending order of count
top_counts(dic)

[(33, 'America/Sao_Paulo'),
 (35, 'Europe/Madrid'),
 (36, 'Pacific/Honolulu'),
 (37, 'Asia/Tokyo'),
 (74, 'Europe/London'),
 (191, 'America/Denver'),
 (382, 'America/Los_Angeles'),
 (400, 'America/Chicago'),
 (521, ''),
 (1251, 'America/New_York')]

In [48]:
# sort behavior of list of tuple
# list.sort() reorders the list in place; the key selects the count (index 1)
value_key_pairs = list(dic.items())
value_key_pairs.sort(key=lambda pair: pair[1])
value_key_pairs

[('Asia/Nicosia', 1),
 ('America/Mazatlan', 1),
 ('Europe/Skopje', 1),
 ('Asia/Novosibirsk', 1),
 ('Europe/Sofia', 1),
 ('Europe/Ljubljana', 1),
 ('America/Monterrey', 1),
 ('America/Argentina/Buenos_Aires', 1),
 ('Asia/Yekaterinburg', 1),
 ('Asia/Manila', 1),
 ('America/Caracas', 1),
 ('Asia/Riyadh', 1),
 ('America/Montevideo', 1),
 ('America/Argentina/Mendoza', 1),
 ('Europe/Uzhgorod', 1),
 ('Australia/Queensland', 1),
 ('America/Costa_Rica', 1),
 ('America/Lima', 1),
 ('Asia/Pontianak', 1),
 ('Africa/Lusaka', 1),
 ('Africa/Johannesburg', 1),
 ('America/St_Kitts', 1),
 ('America/Santo_Domingo', 1),
 ('America/Argentina/Cordoba', 1),
 ('Asia/Kuching', 1),
 ('Europe/Volgograd', 1),
 ('America/La_Paz', 1),
 ('Africa/Casablanca', 1),
 ('America/Tegucigalpa', 1),
 ('Africa/Ceuta', 2),
 ('Europe/Malta', 2),
 ('America/Recife', 2),
 ('Europe/Riga', 2),
 ('Europe/Belgrade', 2),
 ('America/Chihuahua', 2),
 ('Europe/Vilnius', 2),
 ('America/Guayaquil', 2),
 ('Asia/Amman', 2),
 ('Asia/Kuala_Lum

In [52]:
# most common timezones (ranking) - option #2: using Python native Counter object
from collections import Counter
# Counter accepts an existing {key: count} mapping, so the tallies in dic are reused
ct = Counter(dic)
# most_common(10) lists the 10 highest counts, largest first
ct.most_common(10)

[('America/New_York', 1251),
 ('', 521),
 ('America/Chicago', 400),
 ('America/Los_Angeles', 382),
 ('America/Denver', 191),
 ('Europe/London', 74),
 ('Asia/Tokyo', 37),
 ('Pacific/Honolulu', 36),
 ('Europe/Madrid', 35),
 ('America/Sao_Paulo', 33)]

In [3]:
import numpy as np

AttributeError: module 'numpy' has no attribute 'core'