## New York Times Clicks

In [54]:
%%timeit
range(100)

The slowest run took 9.46 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 756 ns per loop


In [62]:
!pwd

/home/io/ga/ds/DS_HK_9/notebooks/demos


In [10]:
# Location of the data files, relative to the notebook's working directory.
# The trailing slash is required: other cells build paths by plain string
# concatenation (DATA_DIR + 'nytimes.csv') rather than with os.path.join.
DATA_DIR = '../../data/'

In [8]:
!head {DATA_DIR}nytimes.csv

"Age","Gender","Impressions","Clicks","Signed_In"
36,0,3,0,1
73,1,3,0,1
30,0,3,0,1
49,1,3,0,1
47,1,11,0,1
47,0,11,1,1
0,0,7,1,0
46,0,5,0,1
16,0,3,0,1


In [11]:
#!/usr/bin/python

# Start a counter
count = 0

# Open up reference to the data file
with open(DATA_DIR + 'nytimes.csv') as fn:
    # Store the textfile in memory
    lines = fn.readlines()
    
# Pop off the header file
lines.pop(0)

# For each line, find the sum of index 2 in the list.
for line in lines:
  count = count + int(line.strip().split(',')[2])

print count

2295559


1. Right now the python script finds a sum of the impressions column. Update it to also return:
	1. The average age in the file
	1. Click through rate (avg clicks per impression)
	1. The oldest person in the file
1. All of this should be written to standard output using a few `print` statements.

**EXTRA CREDIT**: 

How can you change the Python script a bit to:

* Check out the distribution of ages in the dataset. What do you notice?
* Ignore the users who are ... too young to be browsing the NYT ;)
* Output the results into a new text file? Read a guide on [file manipulation](http://www.pythonforbeginners.com/systems-programming/reading-and-writing-files-in-python/) to get you started.

## Example Solution I

In [36]:
#!/usr/bin/python
from __future__ import division

# Running totals (impressions, age, clicks) and the running maximum age
# all start from zero before the data is scanned.
impressions = age = clicks = max_age = 0

def readCSV(filename, header=True):
    """Load an all-integer CSV file from DATA_DIR as a list of rows.

    Parameters
    ----------
    filename : str
        File name, appended directly to the notebook-level DATA_DIR prefix.
    header : bool, optional
        When True (the default) the first line of the file is treated as a
        header and dropped.

    Returns
    -------
    list of list of int
        One inner list per non-blank data line, in file order. An empty
        file yields an empty list.

    Raises
    ------
    ValueError
        If a field on a non-blank data line cannot be parsed by int().
    """
    # Open up reference to the data file
    with open(DATA_DIR + filename) as fn:
        # Store the textfile in memory
        lines = fn.readlines()
    if header:
        # Drop the header line; slicing (unlike pop(0)) is safe on an empty file
        lines = lines[1:]
    # Blank lines (for example a trailing empty line at the end of the file)
    # carry no data and would otherwise crash on int('').
    return [[int(item) for item in line.strip().split(',')]
            for line in lines if line.strip()]
        

# def getColumn(line,idx):
# 	return int(line[idx])

lines = readCSV('nytimes.csv')
    
# For each line, find the sum of index 0,2 & 3 in the list.
for line in lines:
    age = age + line[0]
    impressions = impressions + line[2]
    clicks = clicks + line[3]
    
    max_age = max(max_age, line[0])

print 'No. Impressions:', impressions
print 'Mean Age:', age / len(lines)
print 'Click-Through-Rate:', clicks / impressions
print 'Oldest Age:', max_age

No. Impressions: 2295559
Mean Age: 29.4825506445
Click-Through-Rate: 0.0184917921953
Oldest Age: 108


## Extra Credit Example Solution I

In [68]:
from collections import Counter

lines = readCSV('nytimes.csv')

# Tally how many rows carry each age (column 0). Counter counts the items of
# an iterable directly, so there is no need to build a throwaway one-entry
# dict and call update() once per row.
ages = Counter(row[0] for row in lines)

# Show the ten most frequent ages together with their row counts
ages.most_common(10)

[(0, 137106),
 (37, 7983),
 (36, 7933),
 (42, 7919),
 (41, 7906),
 (38, 7891),
 (39, 7804),
 (43, 7784),
 (40, 7702),
 (48, 7277)]

In [39]:
# Keep only the rows whose age (column 0) is greater than 10
old_enough = [row for row in lines if row[0] > 10]

## Example Solution II

137106

In [88]:
# initialization of variables
max_age = 0
        
lines = readCSV('nytimes.csv')
c = Counter()

# For each line, find the sum of index 0,2 & 3 in the list.
for line in lines:
    c.update({
        'age': line[0],
        'impressions': line[2],
        'clicks': line[3]
        })    
    
    max_age = max(max_age, line[0])

print 'No. Impressions:', c['impressions']
print 'Mean Age:', c['age'] / (len(lines) - ages[0])
print 'Click-Through-Rate:', c['clicks'] / c['impressions']
print 'Oldest Age:', max_age

No. Impressions: 2295559
Mean Age: 42.0620536201
Click-Through-Rate: 0.0184917921953
Oldest Age: 108


In [94]:
ages

Counter({0: 137106,
         7: 5,
         8: 15,
         9: 48,
         10: 113,
         11: 283,
         12: 586,
         13: 1059,
         14: 1718,
         15: 2605,
         16: 3443,
         17: 3953,
         18: 5424,
         19: 6509,
         20: 6337,
         21: 6384,
         22: 6394,
         23: 6488,
         24: 3158,
         25: 3260,
         26: 6405,
         27: 6551,
         28: 6359,
         29: 6379,
         30: 6617,
         31: 6439,
         32: 6443,
         33: 6431,
         34: 3290,
         35: 3932,
         36: 7933,
         37: 7983,
         38: 7891,
         39: 7804,
         40: 7702,
         41: 7906,
         42: 7919,
         43: 7784,
         44: 4006,
         45: 3645,
         46: 7202,
         47: 7070,
         48: 7277,
         49: 7054,
         50: 6982,
         51: 7147,
         52: 7165,
         53: 7231,
         54: 3515,
         55: 2502,
         56: 5034,
         57: 5014,
         58: 4916,
     

In [71]:
# Display the column totals accumulated in Example Solution II. That Counter
# is bound to the name `c`; no variable called `counter` is defined in this
# notebook, so referring to it fails on a fresh Restart & Run All.
c

Counter({'age': 13516010, 'clicks': 42449, 'impressions': 2295559})

In [74]:
# Sum the ages (column 0) into two buckets chosen by the Signed_In flag
# (column 4): truthy rows go to 'signed_in', the rest to 'not_signed_in'.
pattern = Counter()

for row in lines:
    bucket = 'signed_in' if row[4] else 'not_signed_in'
    pattern.update({bucket: row[0]})

In [77]:
pattern

Counter({0: 137106})

### Which is faster?

In [83]:
%%timeit
# Benchmark variant A: the Signed_In test (column 4) goes through a helper
# function, so every row pays for one extra Python function call.
pattern = Counter()

def is_signed_on(line):
    """Return True when column 4 of the row equals 1."""
    return line[4] == 1

# Count rows per age (column 0), signed-in rows only
for line in lines:
    if is_signed_on(line):
        pattern.update({line[0]: 1})

1 loop, best of 3: 1.04 s per loop


In [84]:
%%timeit
# Benchmark variant B: the same per-age tally of signed-in rows, but with the
# column-4 comparison written inline instead of wrapped in a helper function.
pattern = Counter()

for line in lines:
    if line[4] == 1:
        pattern.update({line[0]: 1})

1 loop, best of 3: 890 ms per loop


['36,0,3,0,1',
 '73,1,3,0,1',
 '30,0,3,0,1',
 '49,1,3,0,1',
 '47,1,11,0,1',
 '47,0,11,1,1',
 '0,0,7,1,0',
 '46,0,5,0,1',
 '16,0,3,0,1',
 '52,0,4,0,1',
 '0,0,8,1,0',
 '21,0,3,0,1',
 '0,0,4,0,0',
 '57,0,6,0,1',
 '31,0,5,0,1',
 '0,0,6,0,0',
 '40,1,3,0,1',
 '31,1,5,0,1',
 '38,0,4,0,1',
 '0,0,5,0,0']

In [124]:
# Preview: the first 20 rows with every field converted back to text
[[str(field) for field in row] for row in lines[:20]]

[['36', '0', '3', '0', '1'],
 ['73', '1', '3', '0', '1'],
 ['30', '0', '3', '0', '1'],
 ['49', '1', '3', '0', '1'],
 ['47', '1', '11', '0', '1'],
 ['47', '0', '11', '1', '1'],
 ['0', '0', '7', '1', '0'],
 ['46', '0', '5', '0', '1'],
 ['16', '0', '3', '0', '1'],
 ['52', '0', '4', '0', '1'],
 ['0', '0', '8', '1', '0'],
 ['21', '0', '3', '0', '1'],
 ['0', '0', '4', '0', '0'],
 ['57', '0', '6', '0', '1'],
 ['31', '0', '5', '0', '1'],
 ['0', '0', '6', '0', '0'],
 ['40', '1', '3', '0', '1'],
 ['31', '1', '5', '0', '1'],
 ['38', '0', '4', '0', '1'],
 ['0', '0', '5', '0', '0']]

In [125]:
with open(DATA_DIR + 'nytimes_output.csv', 'w') as fn:
    # Store the rows on disk, one comma-separated line per row. Each row is
    # stringified as it is written, so no second full copy of the dataset is
    # built in memory first. No header line is written.
    for row in lines:
        fn.write(",".join(str(item) for item in row) + '\n')