## Header Code

In [2]:
%matplotlib inline
import numpy as np
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels import discrete

import re
import pandas as pd
import math 
import csv
import time
import dateutil
from datetime import datetime
import seaborn as sns
from IPython.core.display import HTML
HTML("<style>.container {width:80% !important; }</style>")



In [3]:
# pandas options plus some more
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
pd.options.display.float_format = '{:,.3f}'.format
sns.set_style("whitegrid")
sns.set_context("poster")

In [4]:
# Matplotlib Formatting
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from matplotlib import gridspec
from matplotlib import ticker


millnames = ['',' Thousand',' Million',' Billion',' Trillion']
def millify(n, pos):
    n = float(n)
    millidx = max(0,min(len(millnames)-1,
                        int(math.floor(0 if n == 0 else math.log10(abs(n))/3))))
    thingtoreturn = n / 10**(3 * millidx)
    if thingtoreturn % 1 == 0:
        return '{:.0f}{}'.format(thingtoreturn, millnames[millidx])
    elif thingtoreturn % 0.1 == 0:
        return '{:.1f}{}'.format(thingtoreturn, millnames[millidx])
    else:
        return '{:.2f}{}'.format(thingtoreturn, millnames[millidx])

** load all necessary datasets **

In [5]:
budgetsdf = pd.read_csv('./exports/budgetsdf.csv')
incomesdf = pd.read_csv('./exports/incomesdf.csv')
wagesdf = pd.read_csv('./exports/ave_wage_per_year.csv')
demdf = pd.read_csv('./exports/demdf.csv')
apaneldf = pd.read_csv('./exports/apaneldf.csv')

# Compile munyeardf

## Incomes by year

In [6]:
# get annual totals with categories as columns
l6_income_year = incomesdf.groupby(["mun_id", "year","l6_name_english"]).income_value.sum().unstack().reset_index()

# get same thing except with shares by year
l6_income_shares_by_yeardf = l6_income_year.drop(["mun_id", "year"], axis=1).fillna(0).apply(lambda x: x /x.sum(), axis=0)

# combine both
for col in l6_income_shares_by_yeardf.columns:
    l6_income_year["share_" +  col.lstrip(" ")] = l6_income_shares_by_yeardf[col]

# fillna-s with 0
l6_income_year.fillna(0, inplace=1)

# apennd 'income_' to all column titles for later identification
newcols = []
for col in l6_income_year.columns:
    if not re.search("(mun_id)|(year)", col):
        newcols.append("income_" + col.lstrip(" "))
    elif re.search("previous year", col):
        newcols.append("income_" + col.lstrip(" "))
    else:
        newcols.append(col)  
l6_income_year.columns = newcols

## Budgets by year


In [7]:
# get annual totals of actuals with categories as columns
budgetsdf.drop("Unnamed: 0", axis=1, inplace=1, errors='ignore')
ec_group_year = budgetsdf.groupby(["mun_id", "year","ec_group_name_english"]).actual.sum().unstack().reset_index()

# drop 2015 values
ec_group_year = ec_group_year.drop(ec_group_year[ec_group_year.year == 2015].index, axis=0).fillna(0)

# get 
share_ec_group_year = ec_group_year.drop(["mun_id", "year"], axis=1).apply(lambda x: x /x.sum(), axis=0)

for col in share_ec_group_year.columns:
    ec_group_year["share_" +  col.lstrip(" ")] = share_ec_group_year[col]

newcols = []
for col in ec_group_year.columns:
    if not re.search("(mun_id)|(year)", col):
        newcols.append("actual_" + col.lstrip(" "))
    elif re.search("previous year", col):
        newcols.append("actual_" + col.lstrip(" "))
    else:
        newcols.append(col)  
ec_group_year.columns = newcols

# Compile mundf 

In [8]:
maindf = totals_by_yeardf.groupby('mun_id').mean().reset_index()

NameError: name 'totals_by_yeardf' is not defined

In [None]:
maindf = maindf.merge(municipal_codes_df, how="left", right_on="mb", left_on="mun_id")
maindf.drop("mb", axis=1, inplace=1)

In [None]:
maindf[[u'mun_id', u'income_value', u'plan', u'actual', u'mun']] = maindf[[u'mun_id', u'mun', u'income_value', u'plan', u'actual']]
maindf.columns = [u'mun_id', u'mun', u'mean_income_value', u'mean_plan', u'mean_actual']

In [None]:
maindf.info()

### Add Classification Shares

In [9]:
# function that cleans column names
def clean_column_names(x):
        if isinstance(x,tuple):
            pre, name = x
            return re.sub(r"(mean_)|(_share)","", pre) + "_" + name
        return x
    
clean_column_names = np.vectorize(clean_column_names)

** add economic group shares **

In [10]:
temp = budgetsdf.groupby(["mun_id","year","ec_group_name_english"]).sum().reset_index()
temp[["mean_plan_share", "mean_actual_share"]] = temp.groupby(["mun_id", "year"]).transform(lambda x: x / x.sum())
temp = temp.groupby(["mun_id", "ec_group_name_english"])[["mean_plan_share", "mean_actual_share"]].mean()
maindf = maindf.merge(temp.unstack().reset_index(), how="outer", on="mun_id")
share_economic_group_columns = temp.unstack().reset_index().columns

ValueError: Columns must be same length as key

** add functional division shares **

In [None]:
temp = budgetsdf.groupby(["mun_id","year","func_div_name_english"]).sum().reset_index()
temp[["mean_plan_share", "mean_actual_share"]] = temp.groupby(["mun_id", "year"]).transform(lambda x: x / x.sum())
temp = temp.groupby(["mun_id", "func_div_name_english"])[["mean_plan_share", "mean_actual_share"]].mean()
maindf = maindf.merge(temp.unstack().reset_index(), how="outer", on="mun_id")
share_func_div_columns = temp.unstack().reset_index().columns

**add l4 income classification shares**

In [None]:
temp = incomesdf.groupby(["mun_id","year","l4_name_english"]).sum().reset_index()
temp["mean_income_share"] = temp.groupby(["mun_id", "year"]).transform(lambda x: x / x.sum()).income_value
temp = temp.groupby(["mun_id", "l4_name_english"])["mean_income_share"].mean()
maindf = maindf.merge(temp.unstack(), how="outer", left_on="mun_id", right_index=1)
share_l4_income_columns = temp.unstack().columns

In [11]:
share_l4_income_columns

NameError: name 'share_l4_income_columns' is not defined

In [12]:
ec_group_plan_shares = []
ec_group_actual_shares = []
for col in clean_column_names(share_economic_group_columns):
    if re.search("plan", col):
        ec_group_plan_shares.append(col)
    if re.search("actual", col):
        ec_group_actual_shares.append(col)

ec_group_actual_shares

NameError: name 'share_economic_group_columns' is not defined

In [13]:
maindf.columns = clean_column_names(maindf.columns)

NameError: name 'maindf' is not defined

In [14]:
things_to_pairplot = np.array(maindf[ec_group_mean_plan_shares + ['Donations and transfers']].fillna(0).sum().sort_values()[-5:].index)

NameError: name 'maindf' is not defined

In [15]:
sns.set()
sns.set_context()
sns.pairplot(maindf[things_to_pairplot].fillna(0), kind='reg', size=4)

NameError: name 'maindf' is not defined

In [None]:
budgetsdf.head()

# Compile changedf

## Wage changes

In [16]:
tuples = zip(['wage_2011', 'wage_2012', 'wage_2013', 'wage_2014'], ['wage_2010', 'wage_2011', 'wage_2012', 'wage_2013'])

In [17]:
def rel_wage_change(df, cols = ""):
    x11 = (df[diff[0]] == 0 | np.isnan(df[diff[0]])) 
    x13 = (df[diff[1]] == 0 | np.isnan(df[diff[1]]))
    if x11 & x13:
        return 0    
    if  ~x11 & x13:
        return -1
    if x11 &  ~x13:
        return 2
    else:
        return (df[diff[0]] - df[diff[1]])  /   df[diff[1]]

In [18]:
wagedf.set_index("mun_id", inplace=1)
wage_changes_abs = pd.DataFrame()
wage_changes_rel = pd.DataFrame()
for diff in tuples:
    wage_changes_abs[diff[0] + " - " + diff[1]] = wagedf[diff[0]] - wagedf[diff[1]]

for diff in tuples:
    wage_changes_rel[diff[0] + " - " + diff[1]] = wagedf.apply(rel_wage_change, cols=diff, axis=1)

NameError: name 'wagedf' is not defined

In [19]:
wage_changes = pd.concat([wage_changes_abs.stack(), wage_changes_rel.stack()],axis=1)
wage_changes.reset_index(inplace=1)
wage_changes.columns = ["mun_id", "years_change", "wage_change_abs", "wage_change_rel"]
wage_changes.to_csv("./exports/changes_in_wages.csv")

NameError: name 'wage_changes_abs' is not defined

## Budget changes

## munyear and changesdf with categories in one row

In [20]:
test1 = budgetsdf.groupby(["mun_id","ec_group_name_english","year"])[["plan","actual"]].sum()

test=test1

changes = test.reset_index().groupby(["mun_id","ec_group_name_english"]).diff()
changes_rel = test.reset_index().groupby(["mun_id","ec_group_name_english"]).pct_change()


test = test.reset_index()

test["plan_share"] = test1.reset_index().groupby(["mun_id","year"]).transform(lambda x: x / x.sum()).plan
test["actual_share"] = test1.reset_index().groupby(["mun_id","year"]).transform(lambda x: x / x.sum()).actual

test["change_plan_abs"] = changes.plan
test["change_actual_abs"] = changes.actual


test["change_plan_rel"] = changes_rel.plan
test["change_actual_rel"] = changes_rel.actual


econ_groups_changes_shares = test
#econ_groups_changes_shares = econ_groups_changes_shares.merge(demdf,how="left", on = "mun_id")
econ_groups_changes_shares.to_csv("./exports/econ_groups_changes_shares.csv")

In [21]:
test1 = budgetsdf.groupby(["mun_id","func_div_name_english","year"])[["plan","actual"]].sum()

test=test1

changes = test.reset_index().groupby(["mun_id","func_div_name_english"]).diff()
changes_rel = test.reset_index().groupby(["mun_id","func_div_name_english"]).pct_change()


test = test.reset_index()

test["plan_share"] = test1.reset_index().groupby(["mun_id","year"]).transform(lambda x: x / x.sum()).plan
test["actual_share"] = test1.reset_index().groupby(["mun_id","year"]).transform(lambda x: x / x.sum()).actual

test["change_plan_abs"] = changes.plan
test["change_actual_abs"] = changes.actual


test["change_plan_rel"] = changes_rel.plan
test["change_actual_rel"] = changes_rel.actual

func_divs_changes_shares = test
#func_divs_changes_shares = func_divs_changes_shares.merge(demdf,how="left", on = "mun_id")
func_divs_changes_shares.to_csv("./exports/func_groups_changes_shares.csv")

In [22]:
test1 = incomesdf.groupby(["mun_id","l6_name_english","year"]).income_value.sum()

test=test1

changes = test.reset_index().groupby(["mun_id","l6_name_english"]).diff()
changes_rel = test.reset_index().groupby(["mun_id","l6_name_english"]).pct_change()


test = test.reset_index()

test["income_share"] = test1.reset_index().groupby(["mun_id","year"]).transform(lambda x: x / x.sum())

test["change_income_abs"] = changes.income_value

test["change_income_rel"] = changes_rel.income_value


income_changes_shares = test
#income_changes_shares = income_changes_shares.merge(demdf,how="left", on = "mun_id")
income_changes_shares.to_csv("./exports/income_changes_shares.csv")

## munyeardf and changesdf with categories as columns

In [23]:
budget_shares_ec_group_as_columns = econ_groups_changes_shares.groupby(["mun_id","year", "ec_group_name_english"]).sum().unstack(level=2)
budget_shares_ec_group_as_columns.columns = ['_'.join(col).strip() for col in budget_shares_ec_group_as_columns.columns.values]
budget_shares_ec_group_as_columns.to_csv("./exports/budget_shares_ec_group_as_columns.csv")

In [24]:
budget_shares_func_div_as_columns = func_divs_changes_shares.groupby(["mun_id","year", "func_div_name_english"]).sum().unstack(level=2)
budget_shares_func_div_as_columns.columns = ['_'.join(col).strip() for col in budget_shares_func_div_as_columns.columns.values]
budget_shares_func_div_as_columns.to_csv("./exports/budget_shares_func_div_as_columns.csv")

In [30]:
income_share_l6_as_columns = income_changes_shares.groupby(["mun_id","year", "l6_name_english"]).sum().unstack(level=2)
income_share_l6_as_columns.columns = ['_'.join(col).strip() for col in income_share_l6_as_columns.columns.values]

income_share_l6_as_columns.to_csv("./exports/income_share_l6_as_columns.csv")


In [36]:
func_divs_changes_shares.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4913 entries, 0 to 4912
Data columns (total 11 columns):
mun_id                   4913 non-null int64
func_div_name_english    4913 non-null object
year                     4913 non-null int64
plan                     4913 non-null float64
actual                   3902 non-null float64
plan_share               4913 non-null float64
actual_share             3902 non-null float64
change_plan_abs          3860 non-null float64
change_actual_abs        2875 non-null float64
change_plan_rel          3857 non-null float64
change_actual_rel        2850 non-null float64
dtypes: float64(8), int64(2), object(1)
memory usage: 422.3+ KB


In [None]:
incomesdf.l6