In [1]:
import os
import pandas
import zipfile
import requests
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib.dates as dates
%matplotlib inline

AttributeError: 'module' object has no attribute '__version__'

In [77]:
# Render every float with exactly two decimal places in displayed tables.
pandas.options.display.float_format = lambda value: '%.2f' % value

## Download late independent expenditure filings

These expenditures are reported on [Form 496](http://calaccess.californiacivicdata.org/documentation/calaccess-forms/f496/), which is filed for all independent expenditures over $1,000 made in the 90 days prior to an election. The data are recorded in the [S496 file](http://calaccess.californiacivicdata.org/documentation/calaccess-files/s496-cd/).

In [7]:
# Address of the s496_cd.csv export that is downloaded below.
url = "http://calaccess.download/latest/s496_cd.csv"

In [8]:
# Fetch the export. The timeout keeps a stalled server from hanging the kernel,
# and raise_for_status() stops the notebook before an HTTP error page can be
# written to disk as if it were the CSV.
r = requests.get(url, timeout=60)
r.raise_for_status()

In [10]:
# Absolute destination for the download: s496_cd.csv in the current working directory.
path = os.path.join(os.getcwd(), 's496_cd.csv')

In [11]:
# r.content is raw bytes, so the file must be opened in binary mode: text mode
# raises TypeError on Python 3 and rewrites line endings on Windows.
with open(path, 'wb') as f:
    f.write(r.content)

## Read in the CSV

In [14]:
# Load the downloaded file into a DataFrame, letting pandas infer the column types.
df = pandas.read_csv(path)

### Convert the date field to a datetime object

In [16]:
# errors="coerce" turns values that cannot be parsed as dates into NaT
# instead of raising, so unparseable entries become missing dates.
df['EXP_DATE'] = pandas.to_datetime(
    df['EXP_DATE'],
    errors="coerce"
)

## Basic information about the file

In [18]:
# Summarize the columns: non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38810 entries, 0 to 38809
Data columns (total 12 columns):
FILING_ID     38810 non-null int64
AMEND_ID      38810 non-null int64
LINE_ITEM     38810 non-null int64
REC_TYPE      38810 non-null object
FORM_TYPE     38810 non-null object
TRAN_ID       38810 non-null object
AMOUNT        38779 non-null float64
EXP_DATE      38778 non-null datetime64[ns]
EXPN_DSCR     38619 non-null object
MEMO_CODE     177 non-null object
MEMO_REFNO    11523 non-null object
DATE_THRU     1445 non-null object
dtypes: datetime64[ns](1), float64(1), int64(3), object(7)
memory usage: 3.6+ MB


In [20]:
# Preview the first rows of the table.
df.head()

Unnamed: 0,FILING_ID,AMEND_ID,LINE_ITEM,REC_TYPE,FORM_TYPE,TRAN_ID,AMOUNT,EXP_DATE,EXPN_DSCR,MEMO_CODE,MEMO_REFNO,DATE_THRU
0,1048489,0,2,S496,F496,EDT72,7200.0,2004-10-18,Productions Costs,,,
1,1048489,0,3,S496,F496,EDT67,39302.68,2004-10-18,Media Buys,,,
2,1048490,0,1,S496,F496,EDT70,750.0,2004-10-18,Polling,,,
3,1048490,0,2,S496,F496,NON4011,9500.0,2004-10-18,Polling,,,
4,1048490,0,3,S496,F496,EDT60,4000.0,2004-10-18,Production Costs,,,


In [30]:
# count() tallies non-null values only, so this equals the number of rows
# only while FILING_ID has no missing entries.
n = df['FILING_ID'].count()
n

38810

## Frequency counts on the fields

In [50]:
# Non-null FILING_ID count for each AMEND_ID value, shown as a one-column table.
(
    df.groupby('AMEND_ID')['FILING_ID']
    .count()
    .to_frame('count')
)

Unnamed: 0_level_0,count
AMEND_ID,Unnamed: 1_level_1
0,30576
1,6873
2,1105
3,223
4,26
5,7


In [41]:
# Non-null FILING_ID count for each REC_TYPE value, shown as a one-column table.
(
    df.groupby('REC_TYPE')['FILING_ID']
    .count()
    .to_frame('count')
)

Unnamed: 0_level_0,count
REC_TYPE,Unnamed: 1_level_1
S496,38810


In [42]:
# Non-null FILING_ID count for each FORM_TYPE value, shown as a one-column table.
(
    df.groupby('FORM_TYPE')['FILING_ID']
    .count()
    .to_frame('count')
)

Unnamed: 0_level_0,count
FORM_TYPE,Unnamed: 1_level_1
F496,38810


In [44]:
# Non-null FILING_ID count for each LINE_ITEM value, shown as a one-column table.
(
    df.groupby('LINE_ITEM')['FILING_ID']
    .count()
    .to_frame('count')
)

Unnamed: 0_level_0,count
LINE_ITEM,Unnamed: 1_level_1
1,19202
2,6984
3,3833
4,2309
5,1454
6,926
7,681
8,487
9,380
10,302


In [56]:
# Grand total of AMOUNT; sum() skips missing values by default.
df['AMOUNT'].sum()

560751268.05000007

In [80]:
def trim_to_year(row):
    """Return the calendar year of the row's EXP_DATE, or NaN when it is missing.

    Args:
        row: Mapping-like record (for example a DataFrame row) whose
            'EXP_DATE' entry is a datetime-like value, NaT or None.

    Returns:
        The year as an int, or float NaN when the date is missing.
    """
    exp_date = row['EXP_DATE']
    # Test for a missing date explicitly: reading .year from NaT does not
    # raise, so an except clause is never what handles the missing case.
    if pandas.isnull(exp_date):
        return float('nan')
    return exp_date.year

# axis=1 hands each row to trim_to_year, one call per row.
df["year"] = df.apply(trim_to_year, axis=1) 

In [83]:
# Total AMOUNT per year, with the year moved back into an ordinary column.
(
    df.groupby('year')['AMOUNT']
    .sum()
    .reset_index()
)

Unnamed: 0,year,AMOUNT
0,2000.0,3104703.11
1,2001.0,657298.36
2,2002.0,11979832.52
3,2003.0,17857445.99
4,2004.0,24934576.89
5,2005.0,35156384.09
6,2006.0,89628830.24
7,2007.0,2285112.3
8,2008.0,42843750.92
9,2009.0,3296267.04


In [70]:
def trim_to_month(row):
    """Return the first day of the month that contains the row's EXP_DATE.

    Args:
        row: Mapping-like record (for example a DataFrame row) whose
            'EXP_DATE' entry is a datetime-like value, NaT or None.

    Returns:
        A ``datetime`` for day 1 of that month at midnight, or ``pandas.NaT``
        when the date is missing.
    """
    exp_date = row['EXP_DATE']
    # Check for a missing date up front instead of waiting for datetime()
    # to reject NaT's NaN year/month with a TypeError.
    if pandas.isnull(exp_date):
        return pandas.NaT
    return datetime(year=exp_date.year, month=exp_date.month, day=1)

# axis=1 hands each row to trim_to_month, one call per row.
df["month"] = df.apply(trim_to_month, axis=1) 

In [82]:
# Total AMOUNT per month, with the month moved back into an ordinary column.
(
    df.groupby('month')['AMOUNT']
    .sum()
    .reset_index()
)

Unnamed: 0,month,AMOUNT
0,2000-02-01,614535.83
1,2000-03-01,507307.35
2,2000-10-01,1445754.56
3,2000-11-01,485605.37
4,2000-12-01,51500.00
5,2001-02-01,83771.00
6,2001-03-01,33974.48
7,2001-04-01,176305.04
8,2001-05-01,206498.20
9,2001-09-01,13261.89
