# Use pandas to process data using several basic operations

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Append and Concatenate (rows) DataFrames

In [None]:
# read input file
input_file1 = 'data/pandas_data_example1.csv'
input_file2 = 'data/pandas_data_example2.csv'
orders1 = pd.read_csv(input_file1, encoding='utf-8-sig')
orders2 = pd.read_csv(input_file2, encoding='utf-8-sig')

# concatenate one dataframe to another dataframe
orders = pd.concat([orders1, orders2], axis=0, ignore_index=True)
print '\nData after contenating:\n', orders

# append one row to the dataframe
row = orders.iloc[0]
orders = orders.append(row, ignore_index=True)
print '\nData after appending:\n', orders.tail(10)

# delete the last row from the dataframe
orders.drop(orders.index[len(orders.index)-1], inplace=True)
print '\nData after deleting:\n', orders.tail(10)

### Update column names and Obtain the set of unique values of a particular column

In [None]:
# print DataFrame information
print '\nIndex names:\n', orders.index
print '\nColumns names:\n', orders.columns
print '\nDataFrame values:\n', orders.values

# update column name
newColumns = {'Departure': 'Departure_Time', 'Arrival': 'Arrival_Time'}
orders.rename(columns=newColumns, inplace=True)
print '\nColumns names:\n', orders.columns

# print the set of unique values of a particular column
originSet = orders['Origin'].unique()
destinationSet = orders['Destination'].unique()
print '\nOrigin Set:\n', originSet
print '\nDestination Set:\n', destinationSet
print '\nExample rows of original data:\n', orders.head(4)
print orders.tail(4)

### Iterate through the DataFrame

In [None]:
for index, row in orders.iterrows():
    if row['Destination'] == 'GHI':
        print index, row['Origin'], row['Destination'], row['Quantity']

### Obtain the origins with the largest quantities

In [None]:
ordersByOrigin = orders.groupby(by=['Origin'])['Quantity'].sum().reset_index()
ordersByOrigin.sort_values(by=['Quantity'], ascending=False, inplace=True)
origins = ordersByOrigin.head(3)['Origin']
print origins.values

### Apply function to a particular column

In [None]:
## apply function to a particular column
newOriginSet = ['AAAA', 'BBBB', 'CCCC', 'DDDD']
originMap = {x : y for (x, y) in zip(originSet, newOriginSet)} # dict(zip(originSet, newOriginSet))
orders['Origin'] = orders['Origin'].apply(lambda x: originMap[x] if x in originMap else x)
print '\nData after updating column names:\n', orders.iloc[0:4]

### Filter data under some criteria

In [None]:
## filter data
destinationSet = ['DEF', 'GHI', 'JKL', 'MNO', 'PQR', 'STU']
destinationFilter = orders['Destination'].isin(destinationSet)
quantityFilter = orders['Quantity']>=5
orders = orders.loc[destinationFilter & quantityFilter]
print '\nData after filtering:\n', orders.iloc[0:4]

### Operate on datetime-like columns

In [None]:
## operate on Datatimelike columns
departureTime = pd.to_datetime(orders['Departure_Time'], errors='coerce')
arrivalTime = pd.to_datetime(orders['Arrival_Time'], errors='coerce')
travelTime = (arrivalTime - departureTime).astype('timedelta64[s]')/(3600.0*24.0)
print '\nDeparture time series:\n', departureTime[0:4]

### Add and Drop columns to the DataFrame

In [None]:
## drop columns from the DataFrame
quantity = orders['Quantity']
orders.drop('Quantity', axis=1, inplace=True)
print '\nData after droping columns:\n', orders.head(5)

## add columns to the DataFrame
orders.loc[:, 'Quantity'] = quantity
orders.loc[:, 'Travel_Time'] = travelTime
orders.loc[:, 'Departure_Date'] = departureTime.dt.date # just keep the date part
orders.loc[:, 'Departure_Week'] = departureTime.dt.week # the week of year, index starting from 1
orders.loc[:, 'Departure_DayOfWeek'] = departureTime.dt.dayofweek # the day of week, e.g., 0-Mon, 4-Thu
orders.loc[:, 'Arrival_Date'] = arrivalTime.dt.date
orders.loc[:, 'Arrival_Week'] = arrivalTime.dt.week
orders.loc[:, 'Arrival_DayOfWeek'] = arrivalTime.dt.dayofweek
dayOfWeek = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']
orders['Departure_DayOfWeek'] = orders['Departure_DayOfWeek'].apply(lambda x : dayOfWeek[x])
orders['Arrival_DayOfWeek'] = orders['Arrival_DayOfWeek'].apply(lambda x : dayOfWeek[x])

print '\nData after adding columns:\n', orders.head(5)

### Pivot tables

In [None]:
## Construct a pivot table, something like unstack function
tempOrders = orders[['Origin','Destination','Quantity','Departure_Week']]
pivotTable = pd.pivot_table(tempOrders, values='Quantity', index=['Origin','Destination'], columns=['Departure_Week'])
pivotTable.fillna(value=0.0, inplace=True)
print pivotTable

### Sort w.r.t. columns, (1) default order; (2) customized order

In [None]:
## sort w.r.t. columns in default order: numeric/time/alphabetical
orders.sort_values(by=['Origin', 'Destination', 'Departure_Time'], inplace=True)
print '\nData after sorting:\n', orders[['Origin','Destination','Quantity','Departure_Time','Arrival_Time']]

## sort w.r.t. columns in customized order:
dayOfWeek = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']
sortedOrders = orders.groupby(by=['Origin','Destination','Departure_DayOfWeek'])['Quantity'].sum().reset_index()

# order by Departure_DayOfWeek
sortedOrders['Departure_DayOfWeek'] = sortedOrders['Departure_DayOfWeek'].astype('category')
sortedOrders['Departure_DayOfWeek'].cat.set_categories(dayOfWeek, inplace=True)
sortedOrders.sort_values(by=['Origin','Destination','Departure_DayOfWeek'], ascending=[True, True, True], inplace=True)
print '\nData after sorting:\n', sortedOrders

### Groupby functions: (1) single groupby; (2) multiple groupby

In [None]:
# use of single groupby function
ordersByOriDesPair = orders.groupby(by=['Origin', 'Destination'])['Quantity'].sum().reset_index()
print '\nData after group by single function:\n', ordersByOriDesPair

# use of multiple groupby functions
# Pandas includes multiple built in functions such as sum/mean/std/max/min
groupByFuns = {'Quantity': ['sum'], 'Travel_Time': ['mean', 'std']}
ordersByMultiGroupby = orders.groupby(by=['Origin', 'Destination']).agg(groupByFuns).reset_index()
ordersByMultiGroupby.fillna(0.0, inplace=True)

# # drop the outermost level from the hierarchical column index
# ordersByMultiGroupby.columns = ordersByMultiGroupby.columns.droplevel(0)

# form multi-level column labels
newLabels = ['_'.join(col) for col in ordersByMultiGroupby.columns.ravel()]
ordersByMultiGroupby.columns = newLabels
print '\nData after group by multiple functions:\n', ordersByMultiGroupby

### Unstack function

In [None]:
## unstack function
dayOfWeek = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']
ordersByOrigin = orders.groupby(by=['Origin', 'Departure_Week', 'Departure_DayOfWeek'])['Quantity'].sum().reset_index()

# order by Departure_DayOfWeek first
ordersByOrigin['Departure_DayOfWeek'] = ordersByOrigin['Departure_DayOfWeek'].astype('category')
ordersByOrigin['Departure_DayOfWeek'].cat.set_categories(dayOfWeek, inplace=True)

# unstack by Departure_DayOfWeek
unstackByDayofweek = ordersByOrigin.set_index(['Origin', 'Departure_Week', 'Departure_DayOfWeek'])
unstackByDayofweek = unstackByDayofweek.unstack('Departure_DayOfWeek')
unstackByDayofweek.fillna(0.0, inplace=True)
print '\nData unstacked by DayOfWeek:\n', unstackByDayofweek

# unstack by Departure_Week
unstackByWeek = ordersByOrigin.set_index(['Origin', 'Departure_DayOfWeek', 'Departure_Week'])
unstackByWeek = unstackByWeek.unstack('Departure_Week')
unstackByWeek.fillna(0.0, inplace=True)
print '\nData unstacked by Week:\n', unstackByWeek

### Merge function

In [None]:
## performing a database-style join operation by columns or indexes
orders1 = orders.loc[orders['Origin'].isin(['AAAA','BBBB'])]
orders2 = orders.loc[orders['Origin'].isin(['BBBB','CCCC'])]
newOrders = orders1.merge(orders2, how='inner', on=['Origin', 'Destination'])
print newOrders

### Weighted average function

In [None]:
## weighted average by built-in function
ordersByOriDesPair = orders.groupby(by=['Origin','Destination'])
avgByOriDesPair = ordersByOriDesPair.apply(lambda x : np.average(x['Travel_Time'], weights=x['Quantity'])).reset_index(name='Avg_Travel_Time')
print '\nWeighted average by built-in function:\n', avgByOriDesPair

## weighted average by customized function
def weighted_average(dataframe, value_column, weight_column):
    """Return the weighted mean of one column, using another column as weights.

    Args:
        dataframe: Frame (or groupby chunk) holding both columns.
        value_column: Name of the column whose values are averaged.
        weight_column: Name of the column supplying the weights.

    Returns:
        sum(value * weight) / sum(weight), or the plain mean of the value
        column when the weights sum to zero.
    """
    value = dataframe[value_column]
    weight = dataframe[weight_column]
    total_weight = weight.sum()
    # Division by zero on pandas/NumPy numbers yields nan or inf instead of
    # raising ZeroDivisionError, so the zero-weight case is tested explicitly
    # rather than caught as an exception.
    if total_weight == 0:
        return value.mean()
    return (value * weight).sum() / total_weight

ordersByOriDesPair = orders.groupby(by=['Origin','Destination'])
avgByOriDesPair = ordersByOriDesPair.apply(weighted_average, 'Travel_Time', 'Quantity').reset_index(name='Avg_Travel_Time')
print '\nWeighted average by customized function:\n', avgByOriDesPair

### Plot time series data

In [None]:
## plot one time series
ordersForAllTrips = orders.groupby(by=['Departure_Time'])['Quantity'].sum().reset_index()
ordersForAllTrips.set_index('Departure_Time', inplace=True)
ordersForAllTrips.cumsum().plot()
print ordersForAllTrips.head(5)
plt.show()

## plot multiple time series
ordersForIndiTrips = orders.groupby(by=['Origin','Departure_Time'])['Quantity'].sum().reset_index()
ordersForIndiTrips.set_index(['Departure_Time', 'Origin'], inplace=True)
# first unstack the data into the desired form
ordersForIndiTrips = ordersForIndiTrips.unstack('Origin')
ordersForIndiTrips.fillna(value=0.0, inplace=True)
ordersForIndiTrips.columns = ordersForIndiTrips.columns.droplevel(0)
ordersForIndiTrips.cumsum().plot()
print ordersForIndiTrips.head(5)
plt.show()