In [142]:
%matplotlib notebook

In [169]:
import random

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib import transforms

import numpy as np

In [152]:
def read_csv(path, header=None, mapping=None, delim=','):
    """Read a delimited text file into a list of per-row dictionaries.

    Every non-blank line is stripped of trailing whitespace, split on
    ``delim`` and paired positionally with the column names.

    Parameters
    ----------
    path : str
        Location of the file to read.
    header : sequence of str, optional
        Column names in file order. When omitted, the first non-blank line
        of the file is consumed as the header row.
    mapping : dict, optional
        Maps each column name to a callable that converts the raw string
        field (e.g. ``int``). When omitted, fields are kept as strings.
    delim : str
        Field separator handed to ``str.split``.

    Returns
    -------
    list of dict
        One ``{column name: converted value}`` dictionary per data row. A row
        with fewer fields than the header yields a dictionary without the
        trailing keys.

    Raises
    ------
    KeyError
        If ``mapping`` is given but lacks a converter for a column.
    IndexError
        If a row has more fields than there are column names.

    Notes
    -----
    This is a plain ``str.split`` parser; quoted fields and delimiters
    embedded in values are not supported.
    """
    def convert(name, raw):
        # Without a mapping there is nothing to convert: keep the raw string.
        return raw if mapping is None else mapping[name](raw)

    def func():
        names = header
        with open(path) as f:
            for line in f:
                line = line.rstrip()
                if not line:
                    # Blank lines (typically a trailing newline at end of
                    # file) carry no record; skipping them avoids feeding
                    # an empty string to the converters.
                    continue
                parts = line.split(delim)
                if names is None:
                    # No explicit header was supplied: the first non-blank
                    # row names the columns.
                    names = parts
                    continue
                vals = [convert(names[i], x) for i, x in enumerate(parts)]
                yield dict(zip(names, vals))
    return list(func())

def read_reserved_usage(path):
    """Load a word-usage CSV whose columns are rank, name, freq, author_usage.

    ``rank``, ``freq`` and ``author_usage`` are converted with ``int``;
    ``name`` is passed through ``str``. The file is read with ``read_csv``
    using its default comma delimiter, and the resulting list of row
    dictionaries is returned.
    """
    columns = [
        ('rank', int),
        ('name', str),
        ('freq', int),
        ('author_usage', int),
    ]
    header = [column for column, _ in columns]
    mapping = dict(columns)
    return read_csv(path, header=header, mapping=mapping)

def pick(lst, k):
    """Return, in order, the value stored under key ``k`` in each item of ``lst``.

    Each item is indexed with ``item[k]``, so a missing key raises whatever
    the item's ``__getitem__`` raises (``KeyError`` for dictionaries).
    """
    values = []
    for item in lst:
        values.append(item[k])
    return values

In [167]:
# Plot the two count columns of the reserved-word usage file against rank.
path = './notebook-data/usage_reserved.csv'
records = read_reserved_usage(path)

# One list per column; all three share the record order.
xs = pick(records, 'rank')
ys0 = pick(records, 'freq')
ys1 = pick(records, 'author_usage')

# Explicit figure/axes handles instead of the implicit pyplot "current axes".
fig, ax = plt.subplots()
ax.set_yscale('log')  # counts are drawn on a logarithmic y axis
ax.plot(xs, ys0, label='Overall Frequency')
ax.plot(xs, ys1, label='Author Usage')
ax.set_xlabel('Rank')
ax.set_ylabel('Count')
ax.set_title('Reserved Word Usage')
ax.legend(loc='best')
plt.show()

<IPython.core.display.Javascript object>

In [175]:
# Plot the two count columns of the un-reserved word usage file against rank,
# using every record.
path = './notebook-data/usage_unreserved.csv'
records = read_reserved_usage(path)

# Number of records to keep for plotting. None plots every record; set an
# integer (e.g. 200) to plot a random subset instead.
downsample = None

if downsample is not None:
    random.seed(11)  # fixed seed: the same subset is drawn on every run
    # random.sample raises ValueError when asked for more items than exist,
    # so never request more than the number of records available.
    index = sorted(random.sample(range(len(records)),
                                 min(downsample, len(records))))
    # Sorting the sampled positions keeps the records in file order.
    records = [records[i] for i in index]

xs = pick(records, 'rank')
ys0 = pick(records, 'freq')
ys1 = pick(records, 'author_usage')

# Explicit figure/axes handles instead of the implicit pyplot "current axes".
fig, ax = plt.subplots()
ax.set_yscale('log')  # counts are drawn on a logarithmic y axis
ax.plot(xs, ys0, label='Overall Frequency')
ax.plot(xs, ys1, label='Author Usage')
ax.set_xlabel('Rank')
ax.set_ylabel('Count')
ax.set_title('Un-reserved Word Usage')
ax.legend(loc='best')
plt.show()

<IPython.core.display.Javascript object>

In [174]:
# Plot the two count columns of the un-reserved word usage file against rank,
# using a reproducible random subset of the records.
path = './notebook-data/usage_unreserved.csv'
records = read_reserved_usage(path)

# Number of records to keep for plotting. Set to None to plot every record.
downsample = 200

if downsample is not None:
    random.seed(11)  # fixed seed: the same subset is drawn on every run
    # random.sample raises ValueError when asked for more items than exist,
    # so never request more than the number of records available.
    index = sorted(random.sample(range(len(records)),
                                 min(downsample, len(records))))
    # Sorting the sampled positions keeps the records in file order.
    records = [records[i] for i in index]

xs = pick(records, 'rank')
ys0 = pick(records, 'freq')
ys1 = pick(records, 'author_usage')

# Explicit figure/axes handles instead of the implicit pyplot "current axes".
fig, ax = plt.subplots()
ax.set_yscale('log')  # counts are drawn on a logarithmic y axis
ax.plot(xs, ys0, label='Overall Frequency')
ax.plot(xs, ys1, label='Author Usage')
ax.set_xlabel('Rank')
ax.set_ylabel('Count')
ax.set_title('Un-reserved Word Usage')
ax.legend(loc='best')
plt.show()

<IPython.core.display.Javascript object>