### Review data from each year

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker
import seaborn as sns
import datetime

### 2017 Data

In [97]:
files = [
    'data/2017/january-june-2017.csv',
    'data/2017/july-september-2017.csv',
    'data/2017/november-2017.csv',
    'data/2017/october-2017.csv',
    'data/2017/december-2017.csv',
]

phx_ven_pay_17 = pd.concat([pd.read_csv(f) for f in files], sort=True)

In [110]:
phx_ven_pay_17.head()

Unnamed: 0,Invoice Net Amt,Check/Payment Date,Description,Department,Vendor Name
0,400.0,1/9/2017,Percent Arts-Prf Svc,Office of Arts and Culture,"JOHNSON, GARTH W"
1,5153.6,1/27/2017,Spec Contractual Svc,Human Services,1 N 10 INC
2,2400.0,1/27/2017,Spec Contractual Svc,Human Services,1 N 10 INC
3,5153.6,1/27/2017,Spec Contractual Svc,Human Services,1 N 10 INC
4,2610.61,1/27/2017,Spec Contractual Svc,Human Services,1 N 10 INC


In [119]:
phx_ven_pay_17.isnull().sum()

 Invoice Net Amt      0
Check/Payment Date    0
Description           0
Department            0
Vendor Name           0
dtype: int64

### To clean up:
#### Drop:
+ ~~'Invoice Net Amt' shows up twice, for each column that has a value, the other column has NaN, how can I stitch these together into one column?~~
+ ~~'Vendor ID Number' isn't included in prior and later years, drop this~~
+ ~~'Fund Center' not included in prior and later years, drop this~~
+ ~~'Document Nbr' not included in prior and later years, drop this~~
+ ~~remove three records that don't have Vendor Name~~

#### Modify:
+ change 'Check/Payment Date' to datetime from object
+ change 'Dept. Descrptn' to 'Department' to be consistent with prior years
+ change 'Commitmt Item Name' to 'Description' to be consistent with prior years

In [112]:
phx_ven_pay_17.dtypes

 Invoice Net Amt      object
Check/Payment Date    object
Description           object
Department            object
Vendor Name           object
dtype: object

In [121]:
# remove handful of rows that have no vendor name
phx_ven_pay_17.dropna(inplace=True)

In [100]:
# combine two 'Invoice Net Amt' columns into one based on NaN values
phx_ven_pay_17.iloc[:,0].fillna(phx_ven_pay_17.iloc[:,6], inplace=True)

In [105]:
# delete second 'Invoice Net Amt' column since invoice amounts are now being held in first 'Invoice Net Amt' column
phx_ven_pay_17.drop(phx_ven_pay_17.columns[6], axis=1, inplace=True)

In [107]:
# rename columns
columns_to_rename = ['Dept. Descrptn',
                    'Commitmt Item Name']

if set(columns_to_rename).issubset(phx_ven_pay_17.columns):
   phx_ven_pay_17.rename(columns={'Dept. Descrptn': 'Department',
                                  'Commitmt Item Name': 'Description'},
                         inplace=True)

In [108]:
# drop unnecessary columns
excess_columns = [
                  'Document Nbr',
                  'Fund Center',
                  'Vendor ID Number']

if set(excess_columns).issubset(phx_ven_pay_17.columns):
   phx_ven_pay_17.drop([ 
                        'Document Nbr', 
                        'Fund Center', 
                        'Vendor ID Number'], 
                       axis=1, inplace=True)

# columns in oldest data = Amount, Date, Department, Description, Vendor Display

In [120]:
phx_ven_pay_17.shape

(487373, 5)

In [36]:
phx_ven_pay_17.groupby('Fund Center')['Fund Center'].count().head()

Fund Center
1000000000    157
1100000000     20
1101000000     44
1102000000     30
1103000000     36
Name: Fund Center, dtype: int64

### 2016 Data

In [17]:
files = [
    'data/2016/citycheckbookjantojune2016.csv',
    'data/2016/citycheckbookjulytodec2016.csv',
]

phx_ven_pay_16 = pd.concat([pd.read_csv(f) for f in files], sort=True)

In [51]:
phx_ven_pay_16.head()

Unnamed: 0,Amount,Date,Department,Description,Vendor Display
0,$855,01-04-2016,Community & Economic Development,Other Commodities,ABM PARKING SERVICES
1,$471,01-04-2016,Community & Economic Development,Other Commodities,ABM PARKING SERVICES
2,$813,01-04-2016,Community & Economic Development,Other Commodities,ABM PARKING SERVICES
3,$578.92,01-04-2016,Fire,Plumbing Services,ABOVE ALL PLUMBING SERVICES INC
4,"$2,712.04",01-04-2016,Fire,Plumbing Services,ABOVE ALL PLUMBING SERVICES INC


In [20]:
phx_ven_pay_16.dtypes

Amount             object
Date               object
Department         object
Description        object
G/L Description    object
Vendor Display     object
dtype: object

In [49]:
# remove 'G/L Description' column
# phx_ven_pay_16.drop('G/L Description', axis=1, inplace=True)

In [50]:
phx_ven_pay_16.shape

(533015, 5)

### To clean up:
+ 'Amount' column has dollar sign in front of it, must remove this in order to work with the values as numbers
+ convert 'Date' to datetime from object and rename to 'Check/Payment Date' in order to be consistent with later years
+ 'Commitmt Item Name' does not show up in this dataset, however it is present in the following year(2017) and beyond
+ 'GL Description' shows up this year, however it is not included in preceding years, and it is not included in following years(at least not under this name), since I don't have plants to use this, I will remove it.
+ change 'Description' to 'Dept. Descrptn' to be consistent with later years

### 2015 Data

In [21]:
files = [
    'data/2015/citycheckbookjantojune2015.csv',
    'data/2015/citycheckbookjulytodec2015.csv',
]

phx_ven_pay_15 = pd.concat([pd.read_csv(f) for f in files], sort=True)

In [22]:
phx_ven_pay_15.head()

Unnamed: 0,Amount,Date,Department,Description,Vendor Display
0,$80,01-02-2015,Municipal Court,Interpreters/Transl,A FOREIGN LANGUAGE SERVICE CORP
1,"$1,888.85",01-02-2015,Aviation,Small Tools/ Equip,A TO Z EQUIPMENT RENTALS
2,$22.46,01-02-2015,Aviation,Motor Vehicle Parts,A TO Z EQUIPMENT RENTALS
3,"$1,973.6",01-02-2015,Aviation,Small Tools/ Equip,A TO Z EQUIPMENT RENTALS
4,$17.33,01-02-2015,Public Works,Inventories,A-Z LOCK PRODUCTS CO INC


### To clean up:

+ change 'Description' to 'Dept. Descrptn' to be consistent with later years
+ convert 'Date' to datetime from object and rename to 'Check/Payment Date' in order to be consistent with later years
+ change '


In [23]:
phx_ven_pay_15.dtypes

Amount            object
Date              object
Department        object
Description       object
Vendor Display    object
dtype: object

In [24]:
phx_ven_pay_15.shape

(333440, 5)

### 2014 Data

In [28]:
files = [
    'data/2014/citycheckbookjantojune2014.csv',
    'data/2014/citycheckbookjulytodec2014.csv',
]

phx_ven_pay_14 = pd.concat([pd.read_csv(f) for f in files], sort=True)

In [29]:
phx_ven_pay_14.head()

Unnamed: 0,Amount,Date,Department,Description,Vendor Display
0,-$34.78,01-02-2014,Street Transportation,Inventories,3M COMPANY
1,-$69.55,01-02-2014,Street Transportation,Inventories,3M COMPANY
2,-$77,01-02-2014,Street Transportation,Inventories,3M COMPANY
3,$0.01,01-02-2014,Public Works,Inventories,AGS SAFETY & SUPPLY
4,-$0.79,01-02-2014,Police,Inventories,AMERICAN EUROCOPTER CORPORATION


### To clean up:

+ change 'Description' to 'Dept. Descrptn' to be consistent with later years
+ convert 'Date' to datetime from object and rename to 'Check/Payment Date' in order to be consistent with later years


In [31]:
phx_ven_pay_14.dtypes

Amount            object
Date              object
Department        object
Description       object
Vendor Display    object
dtype: object

In [30]:
phx_ven_pay_14.shape

(307110, 5)