# Python Data Handling: A Deeper Dive

Data sources

http://www.dabeaz.com/datadeepdive/

In [1]:
print("Hello World")

## Data Structure Shootout 

Which data structure is better in terms of memory?

### Challenge 

The file "ctabus.csv" is a CSV file containing ridership data from the Chicago Transit Authority bus system.

What's the most efficient way to read it into a Python list so that you can work with it?


In [2]:
f= open('ctabus.csv')


In [3]:
next(f)

In [4]:
next(f)

In [5]:
import csv
rows = csv.reader(f)
next(f)


In [6]:
import csv
import tracemalloc
def read_data(filename,make_record_funtion):
    """Read a CSV file into a list of records and report traced memory use.

    Args:
        filename: Path of the CSV file. Its first row is treated as a header
            and discarded.
        make_record_funtion: Callable that receives one parsed row (a list of
            strings produced by csv.reader) and returns the record to store.

    Returns:
        A list with one record per data row, in file order. An empty file
        yields an empty list.

    Side effects:
        Prints the current and peak traced memory once the file has been read.
        Tracing is always switched off again, even when the file cannot be
        opened or make_record_funtion raises.
    """
    tracemalloc.start() #Monitor Python memory allocations
    try:
        # newline='' lets the csv module handle line endings itself, so quoted
        # fields that contain newlines are parsed correctly.
        with open(filename, newline='') as f:
            rows = csv.reader(f)
            next(rows, None)  # skip the header row; tolerate an empty file
            records = [make_record_funtion(row) for row in rows]
        print ("Current {}. Max {}".format(*tracemalloc.get_traced_memory()))
    finally:
        # Without this guard a failing record function would leave tracing on
        # and slow down every later cell.
        tracemalloc.stop()
    return records

In [7]:
# with list

def make_record_list(row):
    """Identity conversion: the parsed row itself is used as the record."""
    record = row
    return record
    
rows = read_data('ctabus.csv',make_record_list )     
print(len(rows))
rows[0]

Current 194758009. Max 194770414
736461


['3', '01/01/2001', 'U', '7354']

In [8]:
#Tuplet
def make_record_tuple(row):
    """Pack the first four fields of a row into a tuple, with rides as an int."""
    route, date, daytype = row[0], row[1], row[2]
    rides = int(row[3])
    return route, date, daytype, rides

rows = read_data('ctabus.csv',make_record_tuple)
rows[0]

Current 163690217. Max 163702894


('3', '01/01/2001', 'U', 7354)

In [9]:
#Dictionary
def make_record_dictionary(row):
    """Turn a row into a dict keyed by route, date, daytype and rides (int)."""
    route, date, daytype = row[0], row[1], row[2]
    rides = int(row[3])
    return {'route': route, 'date': date, 'daytype': daytype, 'rides': rides}

rows = read_data('ctabus.csv',make_record_dictionary)
rows[0]

Current 281519161. Max 281531838


{'date': '01/01/2001', 'daytype': 'U', 'rides': 7354, 'route': '3'}

In [10]:
class Row:
    """One ridership record held in an ordinary class instance (no __slots__)."""

    def __init__(self,route,date,daytype,rides):
        # Attributes are assigned in the same order as the parameters.
        self.route, self.date = route, date
        self.daytype, self.rides = daytype, rides

    def __repr__(self):
        # Compact, comma-separated form without field names.
        return f'Row({self.route},{self.date},{self.daytype},{self.rides})'
    
def make_records_class(row):
    """Build a Row from the first four fields of a row, converting rides to int."""
    route, date, daytype = row[0], row[1], row[2]
    return Row(route, date, daytype, int(row[3]))
rows = read_data('ctabus.csv',make_records_class)
rows[0]

Current 228498017. Max 228510478


Row(3,01/01/2001,U,7354)

In [11]:
class RowSlots:
    """Ridership record whose attributes are restricted by __slots__.

    Only the four names listed in __slots__ can be set on an instance.
    Note that __repr__ labels the object 'Row', not 'RowSlots'.
    """
    __slots__=('route','date','daytype','rides')

    def __init__(self,route,date,daytype,rides):
        self.route, self.date = route, date
        self.daytype, self.rides = daytype, rides

    def __repr__(self):
        # Compact, comma-separated form without field names.
        return f'Row({self.route},{self.date},{self.daytype},{self.rides})'
    
def make_records_class_with_slots(row):
    """Build a RowSlots from the first four fields of a row, with rides as int."""
    route, date, daytype = row[0], row[1], row[2]
    return RowSlots(route, date, daytype, int(row[3]))

rows = read_data('ctabus.csv',make_records_class_with_slots)
rows[0]

Current 157799081. Max 157811542


Row(3,01/01/2001,U,7354)

In [12]:
from collections import namedtuple
# Tuple subclass whose four positions can also be read by field name.
Row = namedtuple('Row', ('route', 'date', 'daytype', 'rides'))
def make_records_namedtuple(row):
    """Build a Row namedtuple from the first four fields, with rides as int."""
    route, date, daytype = row[0], row[1], row[2]
    return Row(route, date, daytype, int(row[3]))

rows = read_data('ctabus.csv',make_records_namedtuple)
rows[0]

Current 169582473. Max 169594934


Row(route='3', date='01/01/2001', daytype='U', rides=7354)

### Conclusion

* list: behaves worse than tuples
* tuples: pretty nice, but you lose the access by name that a dictionary gives you
* dictionary: pretty heavy compared with tuples
* class: behaves slightly better than a dictionary
* class with slots: behaves better than a tuple --> winner, but check what slots actually are
* named tuples: behave like a normal tuple

### the thing with slots

In [13]:
class Point:
    """Point whose instances may only carry the attributes 'x' and 'y'."""
    __slots__=('x','y')

    def __init__(self,x,y):
        self.x, self.y = x, y

In [64]:
p=Point(2,3)
p.x=6
# p.z=35  AttributeError: 'Point' object has no attribute 'z'


With slots you can't add new attributes the way you can with a normal Python class. Now compare with a normal class:

In [15]:
class Point:
    """Ordinary point class: attributes are kept in the instance __dict__."""

    def __init__(self,x,y):
        self.x, self.y = x, y
p=Point(2,3)
p.x=6
p.z=35
p.__dict__

{'x': 6, 'y': 3, 'z': 35}

## The collections module 

Use these when you have to work with relationships between objects. Which one to pick depends on your problem.

* Keeping things in order -> list
* Keeping distinct items -> use sets. unique things
* Building an Index -> Use a dictionary
* Composite Keys -> use tuples as keys in a dictionary
* One to many -> key of the lookup and then a list as value -> collections defaultdict
* For Counting use a dictionary for doing aggregation 


* Question 1: How many bus routes in Chicago?

In [16]:
# Solution: Use a set (distinct items)

# Collect every record's route in a set; repeated routes collapse to one entry.
routes = set()
for row in rows:
    routes.add(row.route)
# The size of the set is the number of distinct routes.
len(routes)

185

* Question 2: How many people rode route 22 on April 9, 2007 (04/09/2007)?
Don't do the search with a `for` loop; build an index instead.

In [17]:
for row in rows:
    if row.route=='22' and row.date == '04/09/2007':
        print(row.rides)
        break
    

24154


In [19]:
# Index rides by the composite key (route, date) so a lookup needs no scan.
# If the same (route, date) pair occurs more than once, the last row wins.
route_date={}
for row in rows:
    route_date[row.route,row.date]=row.rides
    
route_date['22','04/09/2007']

24154

* Question 3: What are the ten most popular routes?

In [20]:
from collections import Counter

In [22]:
# Total rides per route. A Counter treats a missing key as 0, so += works
# the first time a route is seen.
totals = Counter()
for row in rows:
    totals[row.route]+=row.rides

In [23]:
totals.most_common(10)

[('79', 165309712),
 ('9', 147507182),
 ('49', 121673788),
 ('66', 120662106),
 ('4', 120176371),
 ('77', 114243228),
 ('22', 111886681),
 ('3', 111559864),
 ('151', 110893971),
 ('53', 109925583)]

* Question 4: What are the 10 most popular routes in 2016?

In [26]:
from collections import defaultdict
# One-to-many index: route -> list of (date, rides) pairs, in iteration order.
# defaultdict(list) supplies the empty list the first time a route is seen.
d= defaultdict(list)
for row in rows:
    d[row.route].append((row.date,row.rides))

In [27]:
d['22']

[('01/01/2001', 7877),
 ('01/02/2001', 19558),
 ('01/03/2001', 19286),
 ('01/04/2001', 20265),
 ('01/05/2001', 20680),
 ('01/06/2001', 14834),
 ('01/07/2001', 11382),
 ('01/08/2001', 20388),
 ('01/09/2001', 21672),
 ('01/10/2001', 22404),
 ('01/11/2001', 21232),
 ('01/12/2001', 21447),
 ('01/13/2001', 14144),
 ('01/14/2001', 11315),
 ('01/15/2001', 18344),
 ('01/16/2001', 22330),
 ('01/17/2001', 21275),
 ('01/18/2001', 21950),
 ('01/19/2001', 21070),
 ('01/20/2001', 14778),
 ('01/21/2001', 11255),
 ('01/22/2001', 21522),
 ('01/23/2001', 21761),
 ('01/24/2001', 22108),
 ('01/25/2001', 21100),
 ('01/26/2001', 20149),
 ('01/27/2001', 14782),
 ('01/28/2001', 10481),
 ('01/29/2001', 21998),
 ('01/30/2001', 22459),
 ('01/31/2001', 23540),
 ('02/01/2001', 23003),
 ('02/02/2001', 20949),
 ('02/03/2001', 15751),
 ('02/04/2001', 11001),
 ('02/05/2001', 22796),
 ('02/06/2001', 21305),
 ('02/07/2001', 22435),
 ('02/08/2001', 21793),
 ('02/09/2001', 20869),
 ('02/10/2001', 15709),
 ('02/11/2001', 1

In [28]:
# Two-level tally: year -> Counter mapping route -> total rides in that year.
totals_by_year=defaultdict(Counter)
for row in rows:
    # The last four characters of the date string are taken as the year
    # (the dates shown earlier look like '01/01/2001').
    year= row.date[-4:]
    totals_by_year[year][row.route]+=row.rides

In [30]:
totals_by_year['2016'].most_common(10)

[('79', 8268367),
 ('66', 7088030),
 ('77', 6671135),
 ('4', 6424587),
 ('8', 6375504),
 ('9', 6329587),
 ('53', 5895532),
 ('82', 5775101),
 ('3', 5698439),
 ('22', 5620134)]

* Question 5: Which 10 routes had the greatest increase from 2001 to 2016?


In [33]:
# Counter subtraction keeps only positive results, so routes whose yearly
# ridership did not grow between 2001 and 2016 are absent from `diff`.
# A route missing from the 2001 counter counts as 0 there, so its entire
# 2016 total is reported as the increase.
diff = totals_by_year['2016']-totals_by_year['2001']
diff.most_common(10)

[('J14', 3296029),
 ('15', 2301408),
 ('X9', 2075111),
 ('146', 1516226),
 ('147', 1315199),
 ('115', 1114940),
 ('12', 1001598),
 ('26', 920307),
 ('134', 722110),
 ('18', 684923)]

## Python Object Model

* Everything in Python is an "object". Even functions.
* All objects have an id, class and a reference count.
* id is the memory address
* The class is the "type"
* The reference count is used for garbage collection

In [34]:
x = 42 

In [36]:
id(x) # This the memory location where it is


int

In [37]:
x.__class__

int

In [38]:
import sys 
sys.getrefcount(x)

286

* Assignment operations never make a copy of the value being assigned
* All assignments store only the memory address (the object id) and increase the refcount.
* Mutability Caution

In [47]:
a= [1,2,3]
b= a
b

[1, 2, 3]

In [48]:
b.append(999)
a ## a has changed altering b

[1, 2, 3, 999]

In [49]:
b

[1, 2, 3, 999]

In [50]:
id(a)

4666960648

In [51]:
id(b)

4666960648

In [52]:
a=[4,5,6] # There is  a new assigment to another object
b

[1, 2, 3, 999]

In [53]:
a

[4, 5, 6]

In [54]:
id(a)

4666961480

In [55]:
id(b)

4666960648

How to make a copy? Using a constructor list(), dict(), etc.

In [59]:
a=[1,2,3]
b=list(a)

In [60]:
print(id(a),id(b))

4666978632 4666981064


In [62]:
b.append(99)
a

[1, 2, 3]

In [63]:
b

[1, 2, 3, 99, 99]

### Exploiting Immutability 
Immutable values can be safely shared.

Challenge: use a cache to decrease memory use.

In [68]:
_cache={}
def cached(s):
    if s in _cache :
        return _cache[s]
    else:
        _cache[s]=s
        return s
def make_record_with_1_cache(row):
    """Build a Row whose route field goes through cached(); rides becomes int."""
    route = cached(row[0])
    date, daytype = row[1], row[2]
    return Row(route, date, daytype, int(row[3]))
def make_record_with_2_cache(row):
    """Build a Row whose route and date fields both go through cached()."""
    route = cached(row[0])
    date = cached(row[1])
    daytype = row[2]
    return Row(route, date, daytype, int(row[3]))

rows = read_data('ctabus.csv',make_records_namedtuple)
rows = read_data('ctabus.csv',make_record_with_1_cache)
rows = read_data('ctabus.csv',make_record_with_2_cache)


Current 169582049. Max 169594510
Current 134106552. Max 134119013
Current 91285488. Max 91297949


As you can see the memory is decreasing

In [69]:
rows[0]

Row(route='3', date='01/01/2001', daytype='U', rides=7354)

### Builtin Representation

Python objects are not as compact as they might look: the interpreter is written in C, and in some cases there is extra overhead depending on the value. Take the following example and check the number of bytes each object uses.


In [71]:
a= 'n'
b= 'ñ'
print(sys.getsizeof(a),sys.getsizeof(b))

50 74


The reason is that Python tries to optimize for appending and inserting things. Let's check an example.

In [72]:
d = {
    'name':'guido',
    'x':42
}

In [77]:
a=[1,2,3]
print('name'.__hash__(),'guido'.__hash__())
# a.__hash__() a list is not hashable for that reason cannot be a key in a dictionary

-1477684061720128961 3541162750340227233


A dictionary allocates memory up front, and there is a kind of threshold: when you add enough keys to pass it, the dictionary allocates more memory. For instances of classes, however, the dictionary that holds the attributes is handled a bit differently, and this is optimized.

With slots you get rid of the per-instance dictionary, which saves memory; it is useful when RAM is the problem.

## Functional Programming. Thinking in Functions

In [78]:
def square(x):
    """Return x multiplied by itself."""
    squared = x*x
    return squared
def recip(x):
    """Return the reciprocal 1/x."""
    inverse = 1/x
    return inverse
def sum_invsquares(start,stop):
    """Add up 1/(n*n) for every integer n from start through stop, inclusive."""
    accumulated = 0
    for n in range(start, stop+1):
        # Plain left-to-right accumulation, one term at a time.
        accumulated = accumulated + recip(square(n))
    return accumulated


In [79]:
sum_invsquares(1,100)

1.6349839001848923

In [85]:
def sum_terms(start,stop,func):
    """Add up func(n) for every integer n from start through stop, inclusive."""
    running = 0
    for n in range(start, stop+1):
        # The term is delegated to func, so the loop itself never needs copying.
        running = running + func(n)
    return running

def invsquare(x):
    """Return 1 divided by square(x)."""
    denominator = square(x)
    return 1/denominator

def compose(f,g):
    """Return a one-argument function that applies g first and then f."""
    def h(x):
        inner = g(x)
        return f(inner)
    return h

sum_terms(1,100, invsquare)


1.6349839001848923

In [86]:
sum_terms(1,100,compose(recip,square))

1.6349839001848923

### Challenge

* Rewrite the bus data code using list comprehensions and a functional programming style
* Find out what day the route 22 bus had the highest ridership

In [89]:
# Question 1: How many bus routes in Chicago?

routes = {row.route for row in rows}

In [88]:
len (routes)

185

In [92]:
# What day did route 22 have most riders?
rt22=[row for row in rows if row.route == '22']
len(rt22)
rides =[(row.rides,row.date) for row in rt22]
max(rides)

(26896, '06/11/2008')

In [93]:
route_date={(row.route, row.date):row.rides for row in rows}
route_date['22','04/09/2016']

14252

## Thinking in Columns

Instead of a list of tuples, let's use a tuple of lists. The per-record overhead is 72 bytes with the first approach and 24 bytes with the second.


### Challenge 
Read the bus data into separate lists representing columns. Does it make a difference? Can you still work with the data?

In [108]:
import csv
def read_data_columns(filename):
    """Read a CSV file into four parallel column lists and report memory use.

    Args:
        filename: Path of the CSV file. Its first row is treated as a header
            and discarded; each data row must have at least four fields, the
            fourth of which is converted with int().

    Returns:
        A tuple (routes, dates, daytypes, rides) of equally long lists, where
        index i across the four lists describes data row i. An empty file
        yields four empty lists.

    Side effects:
        Prints the current and peak traced memory once the file has been read.
        Tracing is always switched off again, even when reading or the int()
        conversion fails.
    """
    routes, dates, daytypes, rides = [], [], [], []
    tracemalloc.start()
    try:
        # newline='' lets the csv module handle line endings itself, so quoted
        # fields that contain newlines are parsed correctly.
        with open(filename, newline='') as f:
            rows = csv.reader(f)
            next(rows, None)  # skip the header row; tolerate an empty file
            for row in rows:
                routes.append(row[0])
                dates.append(row[1])
                daytypes.append(row[2])
                rides.append(int(row[3]))
        print ("Current {}. Max {}".format(*tracemalloc.get_traced_memory()))
    finally:
        # Without this guard a malformed row would leave tracing switched on.
        tracemalloc.stop()
    return (routes,dates, daytypes, rides)

In [109]:
data = read_data_columns('ctabus.csv')

Current 123099889. Max 123112566


In [110]:
data[0][0:10]

['3', '4', '6', '8', '9', '10', '11', '12', '18', '20']

In [111]:
data[1][0:10]

['01/01/2001',
 '01/01/2001',
 '01/01/2001',
 '01/01/2001',
 '01/01/2001',
 '01/01/2001',
 '01/01/2001',
 '01/01/2001',
 '01/01/2001',
 '01/01/2001']

In [112]:
data[2][0:10]

['U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U']

In [113]:
data[3][0:10]

[7354, 9288, 6048, 6309, 11207, 385, 610, 3678, 375, 7096]

In [114]:
import numpy
a = numpy.array([1,2,3,4])# They store like arrays in C so there is no oveload in memory. 
#The array must be in a specific type

In [115]:
# a[1]= 'hello'


In [116]:
a + 100


array([101, 102, 103, 104])

In [121]:
b = numpy.array([10,11,12,13])

In [122]:
a +b

array([11, 13, 15, 17])

In [123]:
a <3

array([ True,  True, False, False], dtype=bool)

In [124]:
a[a<3]

array([1, 2])

In [125]:
numpy.where(a<3,10,-10)

array([ 10,  10, -10, -10])

## Challenge

Read the bus data using Pandas. Compare with the earlier approaches.

In [126]:
import pandas  as pd


In [127]:
data=pd.read_csv('ctabus.csv')

In [130]:
rt22=data[data['route']=='22']

In [133]:
rt22['rides'].max()

26896

There is a conceptual barrier: instead of working with records, think in columns.


## Thinking in Streams.

With GB files of data, it is better to think in a workflow. 

In [134]:
a= [1,2,3,4]
b=['w','x','y','z']

In [135]:
pairs=zip(a,b)
for p in pairs:
    print(p)

(1, 'w')
(2, 'x')
(3, 'y')
(4, 'z')


In [142]:
for p in pairs:
    print(p)

In [141]:
indexed = enumerate(b)
for x in indexed:
    print(x)# you work at the moment of the cycle instead of integrate everything at once

(0, 'w')
(1, 'x')
(2, 'y')
(3, 'z')


In [139]:
f= open('ctabus.csv')
f

<_io.TextIOWrapper name='ctabus.csv' mode='r' encoding='UTF-8'>

In [140]:
indexed= enumerate(f, start=1)
next(indexed)

(1, 'route,date,daytype,rides\n')

In [143]:
def countdown(n):
    """Generator yielding n, n-1, ... for as long as the value stays above 0.

    The announcement is printed when iteration starts, not when countdown()
    is called, because a generator body only runs once it is iterated.
    """
    print("Counting down from",n)
    remaining = n
    while remaining > 0:
        yield remaining
        remaining = remaining - 1
c = countdown(5)
c

<generator object countdown at 0x12ac6d990>

In [144]:
for x in c:
    print(x)

Counting down from 5
5
4
3
2
1


In [145]:
for x in c:
    print(x)  

In [162]:
import csv
f = open('ctabus.csv')
rows = csv.reader(f)
def match_route(rows, route):
    """Lazily yield only those rows whose first field equals route."""
    for record in rows:
        if not record[0] == route:
            continue  # different route: skip without producing anything
        yield record
            
rt22=match_route(rows,'22')


In [163]:
rt22
next(rt22)

['22', '01/01/2001', 'U', '7877']

In [167]:
def square(nums):
    """Generator: lazily yield each item of nums multiplied by itself.

    NOTE: this rebinds the name `square`, which an earlier cell of this
    notebook defines as a scalar function.
    """
    for value in nums:
        squared = value*value
        yield squared
def recip(nums):
    """Generator: lazily yield 1/x for each item x of nums.

    NOTE: this rebinds the name `recip`, which an earlier cell of this
    notebook defines as a scalar function.
    """
    for value in nums:
        inverse = 1/value
        yield inverse
terms=range(1,101)
result= sum(recip(square(terms)))

In [169]:
result

1.6349839001848923

In [170]:
# Challenge try to use stream
class Row:
    """Ridership record as an ordinary class instance with four attributes.

    NOTE: this rebinds the name `Row`, which earlier cells of this notebook
    also define (as a class and as a namedtuple).
    """

    def __init__(self,route,date,daytype,rides):
        self.route, self.date, self.daytype, self.rides = route, date, daytype, rides

    def __repr__(self):
        # Compact, comma-separated form without field names.
        return f'Row({self.route},{self.date},{self.daytype},{self.rides})'
    
def make_records_class(row):
    """Build a Row from the first four fields of a row, converting rides to int."""
    route = row[0]
    date = row[1]
    daytype = row[2]
    rides = int(row[3])
    return Row(route, date, daytype, rides)
rows = read_data('ctabus.csv',make_records_class)
rows[0]


Current 228488435. Max 228500896


Row(3,01/01/2001,U,7354)

In [171]:
# Find the date of maximum riders on route 22

In [172]:
rt22=[row for row in rows if row.route=='22']
len(rt22)
rides_and_rate =[(row.rides,row.date) for row in rt22]
max(rides_and_rate)

(26896, '06/11/2008')

In [178]:
# Streaming version: every stage below is a generator, so only one record is
# alive at a time and the whole file is never held in memory.
tracemalloc.start()
try:
    # The with-block closes the file once the pipeline has been consumed;
    # newline='' is the mode the csv module expects.
    with open('ctabus.csv', newline='') as f:
        rows = csv.reader(f)
        next(rows)  # skip the header row
        rows=(Row(row[0],row[1],row[2],int(row[3])) for row in rows)
        rt22=(row for row in rows if row.route=='22')
        rides_and_dates=((row.rides,row.date) for row in rt22)
        # max() is the only consumer; it pulls rows through the chain one by one.
        print(max(rides_and_dates))
        print(tracemalloc.get_traced_memory())
finally:
    # Always switch tracing off, even if the file is missing or a row is bad.
    tracemalloc.stop()

(26896, '06/11/2008')
(9265, 47505)


* 