In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels import discrete

import re
import pandas as pd
import math 
import csv
import time
import dateutil
from datetime import datetime
import seaborn as sns



In [2]:
# Notebook-wide display configuration for pandas and seaborn.
# Allow up to 1000 characters of width and 100 columns, and render frames
# through their HTML representation in the notebook.
pd.options.display.width = 1000
pd.options.display.max_columns = 100
pd.options.display.notebook_repr_html = True
# Floats are displayed with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)
# seaborn style and scaling context applied to subsequent figures.
sns.set_style("whitegrid")
sns.set_context("poster")

In [3]:
# Matplotlib Formatting
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from matplotlib import gridspec
from matplotlib import ticker
# NOTE: the triple-quoted block below is a bare string literal, so none of
# these rcParams assignments are executed. It keeps optional overrides for the
# axes grid and for major/minor tick sizes and widths switched off; remove the
# surrounding quotes to apply them.
"""
mpl.rcParams["axes.grid"]=False
mpl.rcParams["xtick.major.size"] = 4
mpl.rcParams["xtick.minor.size"] = 2
mpl.rcParams["xtick.major.width"] = 0.5 
mpl.rcParams["xtick.minor.width"] = 0.5 

mpl.rcParams["ytick.major.size"] = 4
mpl.rcParams["ytick.minor.size"] = 2
mpl.rcParams["ytick.major.width"] = 0.5 
mpl.rcParams["ytick.minor.width"] = 0.5 
"""

# Suffix for each power of 1000, indexed by thousands group (0 = no suffix).
millnames = ['',' Thousand',' Million',' Billion',' Trillion']
def millify(n, pos):
    """Format a number with a thousands-group suffix, e.g. 1500 -> '1.5 Thousand'.

    The value is divided by the largest power of 1000 that has an entry in
    ``millnames`` and printed with as few decimals (0, 1 or 2) as represent
    it: '2 Million', '1.5 Thousand', '1.23 Thousand'.

    Parameters
    ----------
    n : number or numeric string
        Value to format; converted with ``float(n)``.
    pos : any
        Ignored. Presumably present so the function can be used as a
        matplotlib ``ticker.FuncFormatter`` callback, which passes
        ``(value, position)`` -- confirm against the call sites.

    Returns
    -------
    str
        The scaled number followed by its suffix ('' below one thousand).
        Values beyond the last suffix stay in that unit ('1000 Trillion').
    """
    n = float(n)
    # Thousands-group index, clamped to the suffixes that exist. Zero is
    # special-cased because log10(0) is undefined.
    magnitude = 0 if n == 0 else int(math.floor(math.log10(abs(n)) / 3))
    millidx = max(0, min(len(millnames) - 1, magnitude))
    scaled = n / 10 ** (3 * millidx)
    suffix = millnames[millidx]

    if scaled % 1 == 0:
        return '{:.0f}{}'.format(scaled, suffix)

    # A float remainder test such as `scaled % 0.1 == 0` is almost never true
    # (1.5 % 0.1 == 0.0999...), so compare the value in tenths against the
    # nearest integer with a small relative tolerance instead.
    tenths = scaled * 10
    if abs(tenths - round(tenths)) <= 1e-9 * max(1.0, abs(tenths)):
        return '{:.1f}{}'.format(scaled, suffix)

    return '{:.2f}{}'.format(scaled, suffix)

In [4]:
# Build an HTML snippet whose CSS rule sets the `.container` width to 50%.
# HTML(...) is the cell's last expression, so the notebook renders it as the
# cell output.
from IPython.core.display import HTML
HTML("<style>.container {width:50% !important; }</style>")

In [33]:
# Load the zip-code workbook; the second positional argument (0) selects the
# first sheet.
zipcodesdf = pd.read_excel("./General Info/zip_codes.xls", 0)

In [34]:
# Rename the six columns of the loaded sheet.
zipcodesdf.columns = ["mun_id","mun","place","place_id","zip_name","zip_id"]

# Drop the unnecessary name columns, then remove every row that is missing any
# of the remaining fields (mun_id, mun, place_id, zip_id). A single dropna()
# covers both the rows without a zip code and the stray zip codes that have no
# municipality attached, so no separate filtering passes are needed.
#
# The result is reassigned instead of using `inplace=1`: pandas validates
# `inplace` as a bool and rejects the integer 1 on current versions.
zipcodesdf = zipcodesdf.drop(["place","zip_name"], axis=1).dropna()

In [35]:
# Cast every identifier column (any column whose name matches "id") to int.
id_columns = [name for name in zipcodesdf.columns if re.search("id", name)]
for name in id_columns:
    zipcodesdf[name] = zipcodesdf[name].astype(int)

# Print the resulting schema, then preview the first rows as the cell output.
zipcodesdf.info()
zipcodesdf.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1137 entries, 2 to 6351
Data columns (total 4 columns):
mun_id      1137 non-null int32
mun         1137 non-null object
place_id    1137 non-null int32
zip_id      1137 non-null int32
dtypes: int32(3), object(1)
memory usage: 31.1+ KB


Unnamed: 0,mun_id,mun,place_id,zip_id
2,70092,Барајево,703494,11460
4,70092,Барајево,703516,11461
5,70092,Барајево,703524,11462
7,70092,Барајево,703559,11427
11,70092,Барајево,703591,11426


In [36]:
# save file in exports
# The cleaned zip-code table is written as an Excel workbook (index included,
# the to_excel default).
# NOTE: this export runs before the Kostolac mun_id alignment in a later cell,
# so the saved file still holds the original mun_id for zip code 12208.
zipcodesdf.to_excel("./exports/zip_codes.xlsx")

In [37]:
# get zip code monthly municipal panel
# The "Unnamed: 0" column is discarded right after loading -- presumably the
# index column written by an earlier to_csv call; confirm against the exporter.
mpaneldf = pd.read_csv("./exports/mpaneldf.csv").drop("Unnamed: 0", axis=1)

In [38]:
# Align Kostolac: give the rows for zip code 12208 the mun_id that the
# municipal panel uses for the municipality named "Kostolac".
kostolac_mun_id = mpaneldf[mpaneldf.mun == "Kostolac"].mun_id.values[0]
kostolac_rows = zipcodesdf[zipcodesdf["zip_id"] == 12208].index
zipcodesdf.loc[kostolac_rows, "mun_id"] = kostolac_mun_id

### Possible outer merge results

In [54]:
# merge datasets
# Outer join on mun_id keeps unmatched rows from both frames. Both frames
# carry a `mun` column, so pandas suffixes them: mun_x comes from zipcodesdf
# and mun_y from mpaneldf. This result is only used for the diagnostic below.
mpaneldf_zip = zipcodesdf.merge(mpaneldf, how="outer", on="mun_id")

In [56]:
# Show one example row per distinct (mun_y, mun_x) pair for which the outer
# merge found no partner on one side (either municipality name is missing).
unmatched_mask = mpaneldf_zip.mun_x.isnull() | mpaneldf_zip.mun_y.isnull()
unmatched_rows = mpaneldf_zip[unmatched_mask]
unmatched_rows.drop_duplicates(["mun_y","mun_x"])[["mun_id","mun_y","mun_x"]]

Unnamed: 0,mun_id,mun_y,mun_x
0,70092.0,,Барајево
5,70106.0,,Вождовац
6,70114.0,,Врачар
7,70122.0,,Гроцка
273,70165.0,,Лазаревац
280,70173.0,,Младеновац
290,70181.0,,Нови Београд
291,70190.0,,Обреновац
863,70220.0,,Савски Венац
864,70238.0,,Сопот


### Inner Merge

In [48]:
# merge datasets
# Inner join on mun_id: only mun_id values present in both frames are kept.
# zipcodesdf's own `mun` column is dropped first, so the result has a single
# `mun` column, taken from mpaneldf. This overwrites the outer-merge result
# that was bound to the same name.

mpaneldf_zip = zipcodesdf.drop("mun",axis=1).merge(mpaneldf, how="inner", on="mun_id")

In [49]:

# Export the merged zip-level panel; the index is written as well (to_csv
# default), so readers of this file will see it as an extra unnamed column.
mpaneldf_zip.to_csv("./exports/mpaneldf_zip.csv")