### Review data from each year

In [253]:
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker
import seaborn as sns
import datetime

### 2018 Data 

In [287]:
# open all 2018 csv files and read them into a dataframe 
files = glob.glob('data/2018/*.csv')

phx_ven_pay_18 = pd.concat([pd.read_csv(f) for f in files], sort=True)

In [288]:
phx_ven_pay_18.shape

(554526, 5)

In [289]:
phx_ven_pay_18.head()

Unnamed: 0,Check/Payment Date,Commitmt Item Name,Dept. Descrptn,Invoice Net Amt,Vendor Name
0,4/25/2018,Spec Contractual Svc,Human Services,1166.67,1 N 10 INC
1,4/25/2018,Spec Contractual Svc,Human Services,4166.66,1 N 10 INC
2,4/2/2018,Housing Appliances,Housing,1322.86,1 STOP ELECTRONICS CENTER INC
3,4/2/2018,Housing Appliances,Parks and Recreation,1316.38,1 STOP ELECTRONICS CENTER INC
4,4/2/2018,Housing Appliances,Housing,661.43,1 STOP ELECTRONICS CENTER INC


### 2017 Data

In [373]:
# open all 2017 csv files and read them into a dataframe 
files = glob.glob('data/2017/*.csv')

phx_ven_pay_17 = pd.concat([pd.read_csv(f) for f in files], sort=True)

In [400]:
phx_ven_pay_17.head()

Unnamed: 0,Invoice Net Amt,Check/Payment Date,Commitmt Item Name,Dept. Descrptn,Vendor Name
0,138.18,12/13/2017,Inventories-Offline,Public Works,1 STOP ELECTRONICS CENTER INC
1,4001.92,12/29/2017,Com Software<Cap Lmt,Planning and Development,20-20 TECHNOLOGIES
2,-316.92,12/29/2017,Com Software<Cap Lmt,Planning and Development,20-20 TECHNOLOGIES
3,2122.04,12/29/2017,Com Software<Cap Lmt,Planning and Development,20-20 TECHNOLOGIES
4,-168.04,12/29/2017,Com Software<Cap Lmt,Planning and Development,20-20 TECHNOLOGIES


In [405]:
phx_ven_pay_17.iloc[:,0].apply(lambda x: len(str(x))).head()

0    6
1    7
2    7
3    7
4    7
Name: Invoice Net Amt, dtype: int64

In [392]:
phx_ven_pay_17.columns

Index(['Invoice Net Amt', 'Check/Payment Date', 'Commitmt Item Name',
       'Dept. Descrptn', 'Vendor Name'],
      dtype='object')

In [371]:
phx_ven_pay_17.shape

(487380, 7)

In [375]:
phx_ven_pay_17.isnull().sum()

 Invoice Net Amt      121885
Check/Payment Date         3
Commitmt Item Name         3
Dept. Descrptn             7
Document Nbr          236119
Fund Center           236119
Invoice Net Amt       365496
Vendor ID Number      236119
Vendor Name                3
dtype: int64

### To clean up:
#### Drop:
+ ~~'Invoice Net Amt' shows up twice, for each column that has a value, the other column has NaN, how can I stitch these together into one column?~~
+ ~~'Vendor ID Number' isn't included in prior and later years, drop this~~
+ ~~'Fund Center' not included in prior and later years, drop this~~
+ ~~'Document Nbr' not included in prior and later years, drop this~~
+ ~~remove three records that don't have Vendor Name~~

#### Modify:
+ change 'Check/Payment Date' to datetime from object

In [395]:
phx_ven_pay_17.dtypes

Invoice Net Amt       object
Check/Payment Date    object
Commitmt Item Name    object
Dept. Descrptn        object
Vendor Name           object
dtype: object

In [377]:
# combine two 'Invoice Net Amt' columns into one based on NaN values
phx_ven_pay_17.iloc[:,0].fillna(phx_ven_pay_17.iloc[:,6], inplace=True)

In [378]:
# delete second 'Invoice Net Amt' column since invoice amounts are now being held in first 'Invoice Net Amt' column
phx_ven_pay_17.drop(phx_ven_pay_17.columns[6], axis=1, inplace=True)

In [381]:
# drop unnecessary columns
excess_columns = [
                  'Document Nbr',
                  'Fund Center',
                  'Vendor ID Number']

if set(excess_columns).issubset(phx_ven_pay_17.columns):
   phx_ven_pay_17.drop([ 
                        'Document Nbr', 
                        'Fund Center', 
                        'Vendor ID Number'], 
                       axis=1, inplace=True)

# columns in oldest data = Amount, Date, Department, Description, Vendor Display

In [383]:
# remove handful of rows that have no vendor name
phx_ven_pay_17.dropna(inplace=True)

In [385]:
# remove leading and trailing spaces before and after 'Invoice Net Amt' column label
phx_ven_pay_17.columns = phx_ven_pay_17.columns.str.strip()

In [386]:
phx_ven_pay_17.shape

(487373, 5)

### 2016 Data

In [304]:
files = [
    'data/2016/citycheckbookjantojune2016.csv',
    'data/2016/citycheckbookjulytodec2016.csv',
]

phx_ven_pay_16 = pd.concat([pd.read_csv(f) for f in files], sort=True)

In [396]:
phx_ven_pay_16.head()

Unnamed: 0,Invoice Net Amt,Check/Payment Date,Dept. Descrptn,Commitmt Item Name,Vendor Name
0,$855,01-04-2016,Community & Economic Development,Other Commodities,ABM PARKING SERVICES
1,$471,01-04-2016,Community & Economic Development,Other Commodities,ABM PARKING SERVICES
2,$813,01-04-2016,Community & Economic Development,Other Commodities,ABM PARKING SERVICES
3,$578.92,01-04-2016,Fire,Plumbing Services,ABOVE ALL PLUMBING SERVICES INC
4,"$2,712.04",01-04-2016,Fire,Plumbing Services,ABOVE ALL PLUMBING SERVICES INC


In [306]:
phx_ven_pay_16.dtypes

Amount             object
Date               object
Department         object
Description        object
G/L Description    object
Vendor Display     object
dtype: object

In [307]:
# rename columns
columns_to_rename = ['Department',
                    'Description',
                    'Amount',
                    'Vendor Display',
                    'Date']

if set(columns_to_rename).issubset(phx_ven_pay_16.columns):
   phx_ven_pay_16.rename(columns={'Department': 'Dept. Descrptn',
                                  'Description': 'Commitmt Item Name',
                                  'Amount': 'Invoice Net Amt',
                                  'Vendor Display': 'Vendor Name',
                                  'Date': 'Check/Payment Date'
                                  }, inplace=True)

In [308]:
# remove 'G/L Description' column
if 'G/L Description' in phx_ven_pay_16.columns:
    phx_ven_pay_16.drop('G/L Description', axis=1, inplace=True)

In [309]:
phx_ven_pay_16.shape

(533015, 5)

### To clean up:
+ 'Amount' column has dollar sign in front of it, must remove this in order to work with the values as numbers
+ convert 'Date' to datetime from object and rename to 'Check/Payment Date' in order to be consistent with later years
+ ~~change 'Description' to 'Dept. Descrptn' to be consistent with later years~~

### 2015 Data

In [310]:
files = [
    'data/2015/citycheckbookjantojune2015.csv',
    'data/2015/citycheckbookjulytodec2015.csv',
]

phx_ven_pay_15 = pd.concat([pd.read_csv(f) for f in files], sort=True)

In [397]:
phx_ven_pay_15.head()

Unnamed: 0,Invoice Net Amt,Check/Payment Date,Dept. Descrptn,Commitmt Item Name,Vendor Name
0,$80,01-02-2015,Municipal Court,Interpreters/Transl,A FOREIGN LANGUAGE SERVICE CORP
1,"$1,888.85",01-02-2015,Aviation,Small Tools/ Equip,A TO Z EQUIPMENT RENTALS
2,$22.46,01-02-2015,Aviation,Motor Vehicle Parts,A TO Z EQUIPMENT RENTALS
3,"$1,973.6",01-02-2015,Aviation,Small Tools/ Equip,A TO Z EQUIPMENT RENTALS
4,$17.33,01-02-2015,Public Works,Inventories,A-Z LOCK PRODUCTS CO INC


### To clean up:

+ change 'Description' to 'Dept. Descrptn' to be consistent with later years
+ change 'Department' to 'Dept. Descrptn' to be consistent with later years

+ convert 'Date' to datetime from object and rename to 'Check/Payment Date' in order to be consistent with later years

In [312]:
# rename columns
columns_to_rename = ['Department',
                    'Description',
                    'Amount',
                    'Vendor Display',
                    'Date']

if set(columns_to_rename).issubset(phx_ven_pay_15.columns):
   phx_ven_pay_15.rename(columns={'Department': 'Dept. Descrptn',
                                  'Description': 'Commitmt Item Name',
                                  'Amount': 'Invoice Net Amt',
                                  'Vendor Display': 'Vendor Name',
                                  'Date': 'Check/Payment Date'
                                  }, inplace=True)

In [313]:
phx_ven_pay_15.dtypes

Invoice Net Amt       object
Check/Payment Date    object
Dept. Descrptn        object
Commitmt Item Name    object
Vendor Name           object
dtype: object

In [314]:
phx_ven_pay_15.shape

(333440, 5)

### 2014 Data

In [315]:
files = [
    'data/2014/citycheckbookjantojune2014.csv',
    'data/2014/citycheckbookjulytodec2014.csv',
]

phx_ven_pay_14 = pd.concat([pd.read_csv(f) for f in files], sort=True)

In [398]:
phx_ven_pay_14.head()

Unnamed: 0,Invoice Net Amt,Check/Payment Date,Dept. Descrptn,Commitmt Item Name,Vendor Name
0,-$34.78,01-02-2014,Street Transportation,Inventories,3M COMPANY
1,-$69.55,01-02-2014,Street Transportation,Inventories,3M COMPANY
2,-$77,01-02-2014,Street Transportation,Inventories,3M COMPANY
3,$0.01,01-02-2014,Public Works,Inventories,AGS SAFETY & SUPPLY
4,-$0.79,01-02-2014,Police,Inventories,AMERICAN EUROCOPTER CORPORATION


### To clean up:

+ change 'Description' to 'Dept. Descrptn' to be consistent with later years
+ convert 'Date' to datetime from object and rename to 'Check/Payment Date' in order to be consistent with later years


In [317]:
# rename columns
columns_to_rename = ['Department',
                    'Description',
                    'Amount',
                    'Vendor Display',
                    'Date']

if set(columns_to_rename).issubset(phx_ven_pay_14.columns):
   phx_ven_pay_14.rename(columns={'Department': 'Dept. Descrptn',
                                  'Description': 'Commitmt Item Name',
                                  'Amount': 'Invoice Net Amt',
                                  'Vendor Display': 'Vendor Name',
                                  'Date': 'Check/Payment Date'
                                  }, inplace=True)

In [318]:
phx_ven_pay_14.dtypes

Invoice Net Amt       object
Check/Payment Date    object
Dept. Descrptn        object
Commitmt Item Name    object
Vendor Name           object
dtype: object

In [319]:
phx_ven_pay_14.shape

(307110, 5)

## Join Years 2014 - 2018 Together

In [440]:
phx_ven_pay_14to18 = pd.concat([phx_ven_pay_14,
                               phx_ven_pay_15,
                               phx_ven_pay_16,
                               phx_ven_pay_17,
                               phx_ven_pay_18],
                              sort=False)

In [441]:
phx_ven_pay_14to18.shape

(2215464, 5)

In [442]:
phx_ven_pay_14to18[phx_ven_pay_14to18['Invoice Net Amt'].apply(lambda x: str(x) == 'nan')].shape


(3, 5)

In [445]:
phx_ven_pay_14to18[phx_ven_pay_14to18['Invoice Net Amt'].apply(lambda x: str(x) == 'nan')].head()

Unnamed: 0,Invoice Net Amt,Check/Payment Date,Dept. Descrptn,Commitmt Item Name,Vendor Name
40668,,,,,
36771,,,,,
36771,,,,,


In [444]:
phx_ven_pay_14to18[phx_ven_pay_14to18['Invoice Net Amt'].apply(lambda x: str(x) == 'nan')].dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [451]:
phx_ven_pay_14to18[phx_ven_pay_14to18['Invoice Net Amt'].apply(lambda x: str(x) == 'nan')].head(20)

Unnamed: 0,Invoice Net Amt,Check/Payment Date,Dept. Descrptn,Commitmt Item Name,Vendor Name
0,,12/13/2017,Public Works,Inventories-Offline,1 STOP ELECTRONICS CENTER INC
1,,12/29/2017,Planning and Development,Com Software<Cap Lmt,20-20 TECHNOLOGIES
2,,12/29/2017,Planning and Development,Com Software<Cap Lmt,20-20 TECHNOLOGIES
3,,12/29/2017,Planning and Development,Com Software<Cap Lmt,20-20 TECHNOLOGIES
4,,12/29/2017,Planning and Development,Com Software<Cap Lmt,20-20 TECHNOLOGIES
5,,12/29/2017,Planning and Development,Oth Frght/Mvng/Strge,20-20 TECHNOLOGIES
6,,12/29/2017,Planning and Development,Oth Frght/Mvng/Strge,20-20 TECHNOLOGIES
7,,12/6/2017,Police,Comp Software Maint,3M COMPANY
8,,12/6/2017,Police,Comp Software Maint,3M COMPANY
9,,12/13/2017,Aviation,Interpreters/Transl,911 INTERPRETERS INC


### To clean up now that I have all the years together:

#### Invoice Net Amt:  
 + Remove $ signs
 + Remove commas(,)
 
#### Check/Payment Date:
 + Convert to Datetime format



In [447]:
phx_ven_pay_14to18.dtypes

Invoice Net Amt       float64
Check/Payment Date     object
Dept. Descrptn         object
Commitmt Item Name     object
Vendor Name            object
dtype: object

In [449]:
phx_ven_pay_14to18.isnull().sum()

Invoice Net Amt       289721
Check/Payment Date         4
Dept. Descrptn         20791
Commitmt Item Name    243646
Vendor Name                4
dtype: int64

In [416]:
# drop rows with no Vendor Name
# how do I drop from all columns based on an NaN in one column?
phx_ven_pay_14to18.dropna(inplace=True)

In [425]:
phx_ven_pay_14to18.shape

(2215464, 5)

In [446]:
# get the Invoice Net Amount from an object into a float64 format
# first, had to convert to type string
# phx_ven_pay_14to18.iloc[:,0] = phx_ven_pay_14to18.iloc[:,0].astype(str)

# then, find/replace commas
phx_ven_pay_14to18.iloc[:,0] = phx_ven_pay_14to18.iloc[:,0].str.replace(',','')
phx_ven_pay_14to18.iloc[:,0] = phx_ven_pay_14to18.iloc[:,0].str.replace('$','')

phx_ven_pay_14to18.iloc[:,0] = phx_ven_pay_14to18.iloc[:,0].str.replace('(','-')
phx_ven_pay_14to18.iloc[:,0] = phx_ven_pay_14to18.iloc[:,0].str.replace(')','')



# three vendor payments had NaN values, had to drop these before using to_numeric function below
# phx_ven_pay_18.dropna(inplace=True)

# finally, used to_numeric function
# I need to figure out how to set this to accept exceptions
phx_ven_pay_14to18.iloc[:,0] = pd.to_numeric(phx_ven_pay_14to18.iloc[:,0])

In [448]:
phx_ven_pay_14to18.head()

Unnamed: 0,Invoice Net Amt,Check/Payment Date,Dept. Descrptn,Commitmt Item Name,Vendor Name
0,-34.78,01-02-2014,Street Transportation,Inventories,3M COMPANY
1,-69.55,01-02-2014,Street Transportation,Inventories,3M COMPANY
2,-77.0,01-02-2014,Street Transportation,Inventories,3M COMPANY
3,0.01,01-02-2014,Public Works,Inventories,AGS SAFETY & SUPPLY
4,-0.79,01-02-2014,Police,Inventories,AMERICAN EUROCOPTER CORPORATION
