In [1]:
import pandas as pd

# Import Data & Optimization

In [2]:
# load the Big Mac prices with a default RangeIndex; 'Date' is parsed to datetime
bigmac = pd.read_csv(
    filepath_or_buffer = 'data/bigmac.csv',
    parse_dates = ['Date']
)

# the same file can be read with the multi-index set at load time
# (the flat `bigmac` frame above is the one used for illustration below)
bigmac_indexed_start = pd.read_csv(
    'data/bigmac.csv',
    parse_dates = ['Date'],
    index_col = ['Date', 'Country']
)

bigmac.head()

Unnamed: 0,Date,Country,Price in US Dollars
0,2016-01-01,Argentina,2.39
1,2016-01-01,Australia,3.74
2,2016-01-01,Brazil,3.35
3,2016-01-01,Britain,4.22
4,2016-01-01,Canada,4.14


In [3]:
# display the dtype of each column ('Date' is datetime64 because it was passed to parse_dates)
bigmac.dtypes

Date                   datetime64[ns]
Country                        object
Price in US Dollars           float64
dtype: object

In [4]:
# display a summary of the frame: index range, non-null count and dtype per column, memory usage
bigmac.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 652 entries, 0 to 651
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Date                 652 non-null    datetime64[ns]
 1   Country              652 non-null    object        
 2   Price in US Dollars  652 non-null    float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 15.4+ KB


# MultiIndex DataFrames

In [5]:
# build a two-level row index from existing columns by passing a list of keys
# - the order of the list dictates the order of the levels (outermost first)
# - best practice is to put the column with the fewest distinct values first
# - drop = True is the default: the key columns are removed from the frame's columns
bigmac_indexed = bigmac.set_index(keys = ['Date', 'Country'], drop = True)

bigmac_indexed.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2016-01-01,Argentina,2.39
2016-01-01,Australia,3.74
2016-01-01,Brazil,3.35
2016-01-01,Britain,4.22
2016-01-01,Canada,4.14


In [6]:
# sorting multi-indexed dataframes
# with no arguments every level is sorted ascending; the result is only displayed, not assigned back
bigmac_indexed.sort_index() # will sort groupings from left to right (Date first, then Country)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97
...,...,...
2016-01-01,Ukraine,1.54
2016-01-01,United States,4.93
2016-01-01,Uruguay,3.74
2016-01-01,Venezuela,0.66


In [7]:
# extract index level values using labels
# NOTE(review): `date` is not referenced in any visible cell — confirm whether this import is
# needed; if it is, it belongs in the import cell at the top of the notebook
from datetime import date


# one value per row is returned (duplicates included), so each result has the frame's length
date_level = bigmac_indexed.index.get_level_values('Date')
country_level = bigmac_indexed.index.get_level_values('Country')

print(date_level)
print('\n')
print(country_level)

DatetimeIndex(['2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01',
               ...
               '2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01'],
              dtype='datetime64[ns]', name='Date', length=652, freq=None)


Index(['Argentina', 'Australia', 'Brazil', 'Britain', 'Canada', 'Chile',
       'China', 'Colombia', 'Costa Rica', 'Czech Republic',
       ...
       'Sri Lanka', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Turkey',
       'UAE', 'Ukraine', 'United States', 'Uruguay'],
      dtype='object', name='Country', length=652)


In [8]:
# extract index level values using index position
# position 0 is the outermost level ('Date'), position 1 is 'Country'
date_level = bigmac_indexed.index.get_level_values(0)
country_level = bigmac_indexed.index.get_level_values(1)

# one value per row is returned (duplicates included), so each result has the frame's length
print(date_level)
print('\n')
print(country_level)

DatetimeIndex(['2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01',
               ...
               '2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01'],
              dtype='datetime64[ns]', name='Date', length=652, freq=None)


Index(['Argentina', 'Australia', 'Brazil', 'Britain', 'Canada', 'Chile',
       'China', 'Colombia', 'Costa Rica', 'Czech Republic',
       ...
       'Sri Lanka', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Turkey',
       'UAE', 'Ukraine', 'United States', 'Uruguay'],
      dtype='object', name='Country', length=652)


In [9]:
# checking for inclusion: is this label present among the extracted Country level values?
'Argentina' in country_level

True

In [10]:
# changing index level names in position order (one new name per level, outermost first)
# set_names returns a renamed copy of the index, which is assigned back explicitly
# instead of mutating the existing index object with inplace = True
bigmac_indexed.index = bigmac_indexed.index.set_names(names = ['Day', 'Location'])
bigmac_indexed.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Day,Location,Unnamed: 2_level_1
2016-01-01,Argentina,2.39
2016-01-01,Australia,3.74
2016-01-01,Brazil,3.35
2016-01-01,Britain,4.22
2016-01-01,Canada,4.14


In [11]:
# changing a single index level name explicitly
# `level` accepts either a label or an index position; the position (0) is used here because
# `level = 'Day'` raises a KeyError when the cell is re-run after 'Day' has already been renamed
bigmac_indexed.index = bigmac_indexed.index.set_names(names = 'Date', level = 0)
bigmac_indexed.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Location,Unnamed: 2_level_1
2016-01-01,Argentina,2.39
2016-01-01,Australia,3.74
2016-01-01,Brazil,3.35
2016-01-01,Britain,4.22
2016-01-01,Canada,4.14


In [12]:
# sorting at different levels: one ascending flag per level
# here Date is sorted descending and Location ascending
bigmac_indexed = bigmac_indexed.sort_index(ascending = [False, True]) # flags correspond to index level order
bigmac_indexed.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Location,Unnamed: 2_level_1
2016-01-01,Argentina,2.39
2016-01-01,Australia,3.74
2016-01-01,Austria,3.76
2016-01-01,Belgium,4.25
2016-01-01,Brazil,3.35


In [13]:
# sort on one named level (a level position is accepted as well)
# sort_remaining defaults to True, so Location is sorted too — in the same
# descending direction — which is why the first rows run Vietnam, Venezuela, ...
bigmac_indexed = bigmac_indexed.sort_index(level = 'Date', ascending = False)

bigmac_indexed.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Location,Unnamed: 2_level_1
2016-01-01,Vietnam,2.67
2016-01-01,Venezuela,0.66
2016-01-01,Uruguay,3.74
2016-01-01,United States,4.93
2016-01-01,Ukraine,1.54


In [14]:
# give each level its own direction by pairing `level` with `ascending`
# (labels are used here; index positions work as well)
# Date -> descending, Location -> ascending
bigmac_indexed = bigmac_indexed.sort_index(level = ['Date', 'Location'], ascending = [False, True])

bigmac_indexed.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Location,Unnamed: 2_level_1
2016-01-01,Argentina,2.39
2016-01-01,Australia,3.74
2016-01-01,Austria,3.76
2016-01-01,Belgium,4.25
2016-01-01,Brazil,3.35


# Extracting Rows from MultiIndex DataFrames

In [15]:
# extracting specified grouping: a single outer-level (Date) label returns every row in that group
# the matched Date level is dropped from the result, leaving Location as the index
bigmac_indexed.loc['2010-01-01'].head()

Unnamed: 0_level_0,Price in US Dollars
Location,Unnamed: 1_level_1
Argentina,1.84
Australia,3.98
Brazil,4.76
Britain,3.67
Canada,3.97


In [16]:
# pick one cell explicitly: the row key is a tuple holding one label per index level,
# and the second .loc argument is the column name, so a single scalar is returned
bigmac_indexed.loc[('2010-01-01', 'Argentina'), 'Price in US Dollars']

1.84

In [17]:
# extracting row with iloc
# regardless of multi-index labels, each row has a specific integer position
# a single position returns that row as a Series named by its (Date, Location) key
bigmac_indexed.iloc[0]

Price in US Dollars    2.39
Name: (2016-01-01 00:00:00, Argentina), dtype: float64

In [18]:
# extracting multiple rows with iloc: a list of integer positions returns those rows as a DataFrame
bigmac_indexed.iloc[[10, 20, 100, 120, 200, 220]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Location,Unnamed: 2_level_1
2016-01-01,Costa Rica,4.02
2016-01-01,Hong Kong,2.48
2015-07-01,Sri Lanka,2.61
2015-01-01,China,2.77
2014-07-01,New Zealand,4.94
2014-07-01,Ukraine,1.63


In [19]:
# extracting multiple rows with iloc (slicing)
# the end position is exclusive, so this returns the nine rows at positions 1 through 9
bigmac_indexed.iloc[1:10]

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Location,Unnamed: 2_level_1
2016-01-01,Australia,3.74
2016-01-01,Austria,3.76
2016-01-01,Belgium,4.25
2016-01-01,Brazil,3.35
2016-01-01,Britain,4.22
2016-01-01,Canada,4.14
2016-01-01,Chile,2.94
2016-01-01,China,2.68
2016-01-01,Colombia,2.43


# Transpose
Swaps column headers with row labels.

In [20]:
# transpose to a column multi-index: the (Date, Location) row index becomes the column index
# and the single 'Price in US Dollars' column becomes the only row
# transpose does not include an inplace parameter, so the result has to be saved to a variable
bigmac_col_index = bigmac_indexed.transpose()
bigmac_col_index

Date,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,...,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01
Location,Argentina,Australia,Austria,Belgium,Brazil,Britain,Canada,Chile,China,Colombia,...,Sri Lanka,Sweden,Switzerland,Taiwan,Thailand,Turkey,UAE,Ukraine,United States,Uruguay
Price in US Dollars,2.39,3.74,3.76,4.25,3.35,4.22,4.14,2.94,2.68,2.43,...,1.83,5.51,6.3,2.36,2.11,3.83,2.99,1.83,3.58,3.32


In [21]:
# .loc on the transposed frame: the first argument still addresses rows
# selecting the only row returns a Series indexed by the (Date, Location) column multi-index
bigmac_col_index.loc[('Price in US Dollars',)]

Date        Location     
2016-01-01  Argentina        2.39
            Australia        3.74
            Austria          3.76
            Belgium          4.25
            Brazil           3.35
                             ... 
2010-01-01  Turkey           3.83
            UAE              2.99
            Ukraine          1.83
            United States    3.58
            Uruguay          3.32
Name: Price in US Dollars, Length: 652, dtype: float64

In [22]:
# .loc with both arguments on the transposed frame:
#   first argument  -> rows    (tuple with the row label)
#   second argument -> columns (tuple with one label per column level)
bigmac_col_index.loc[('Price in US Dollars',), ('2010-01-01', 'Argentina')]

Price in US Dollars    1.84
Name: (2010-01-01 00:00:00, Argentina), dtype: float64

# Swap Level

In [23]:
# swap two index levels by label
# the argument order does not matter: the two named levels simply trade places
# (the swapped frame is displayed only; bigmac_indexed itself is not reassigned)
bigmac_indexed.swaplevel('Date', 'Location')

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Location,Date,Unnamed: 2_level_1
Argentina,2016-01-01,2.39
Australia,2016-01-01,3.74
Austria,2016-01-01,3.76
Belgium,2016-01-01,4.25
Brazil,2016-01-01,3.35
...,...,...
Turkey,2010-01-01,3.83
UAE,2010-01-01,2.99
Ukraine,2010-01-01,1.83
United States,2010-01-01,3.58


In [24]:
# swap two index levels by position
# the argument order does not matter: levels 0 and 1 simply trade places
# (the swapped frame is displayed only; bigmac_indexed itself is not reassigned)
bigmac_indexed.swaplevel(0, 1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Location,Date,Unnamed: 2_level_1
Argentina,2016-01-01,2.39
Australia,2016-01-01,3.74
Austria,2016-01-01,3.76
Belgium,2016-01-01,4.25
Brazil,2016-01-01,3.35
...,...,...
Turkey,2010-01-01,3.83
UAE,2010-01-01,2.99
Ukraine,2010-01-01,1.83
United States,2010-01-01,3.58


# Stack and Unstack Methods

In [25]:
# import the world statistics data with a (country, year) multi-index set at read time
world = pd.read_csv('data/worldstats.csv', index_col = ['country', 'year'])
world.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Population,GDP
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Arab World,2015,392022276.0,2530102000000.0
Arab World,2014,384222592.0,2873600000000.0
Arab World,2013,376504253.0,2846994000000.0
Arab World,2012,368802611.0,2773270000000.0
Arab World,2011,361031820.0,2497945000000.0


In [26]:
# stack method (moves col index to row index)
# the column labels (Population, GDP) become a new innermost row level
world_stacked = world.stack() # no column levels are left, so the result is a series
world_stacked

country     year            
Arab World  2015  Population    3.920223e+08
                  GDP           2.530102e+12
            2014  Population    3.842226e+08
                  GDP           2.873600e+12
            2013  Population    3.765043e+08
                                    ...     
Zimbabwe    1962  GDP           1.117602e+09
            1961  Population    3.876638e+06
                  GDP           1.096647e+09
            1960  Population    3.752390e+06
                  GDP           1.052990e+09
Length: 22422, dtype: float64

In [27]:
# stack, then convert the resulting series back into a one-column data frame
# the column is named directly in to_frame
world.stack().to_frame(name = 'Statistic')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Statistic
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Arab World,2015,Population,3.920223e+08
Arab World,2015,GDP,2.530102e+12
Arab World,2014,Population,3.842226e+08
Arab World,2014,GDP,2.873600e+12
Arab World,2013,Population,3.765043e+08
...,...,...,...
Zimbabwe,1962,GDP,1.117602e+09
Zimbabwe,1961,Population,3.876638e+06
Zimbabwe,1961,GDP,1.096647e+09
Zimbabwe,1960,Population,3.752390e+06


In [28]:
# unstack method chained: each call moves the innermost remaining row level to the columns
# after three calls every level has moved, so the result is a series again,
# indexed by (statistic, year, country); combinations missing from the data show up as NaN
world_stacked.unstack().unstack().unstack()

            year  country           
Population  1960  Afghanistan           8.994793e+06
                  Albania                        NaN
                  Algeria               1.112489e+07
                  Andorra                        NaN
                  Angola                         NaN
                                            ...     
GDP         2015  West Bank and Gaza    1.267740e+10
                  World                 7.343364e+13
                  Yemen, Rep.                    NaN
                  Zambia                2.120156e+10
                  Zimbabwe              1.389294e+10
Length: 28224, dtype: float64

In [29]:
# unstack level argument (can use index positions or labels)
# only the named level moves to the columns; year and statistic stay in the row index
world_stacked.unstack(level = 'country')

Unnamed: 0_level_0,country,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Arab World,Argentina,Armenia,Aruba,...,Uzbekistan,Vanuatu,"Venezuela, RB",Vietnam,Virgin Islands (U.S.),West Bank and Gaza,World,"Yemen, Rep.",Zambia,Zimbabwe
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1960,Population,8.994793e+06,,1.112489e+07,,,,,,,,...,,,8.146845e+06,,32000.0,,3.035056e+09,,3.049586e+06,3.752390e+06
1960,GDP,5.377778e+08,,2.723638e+09,,,,,,,,...,,,8.607600e+09,,24200000.0,,1.364643e+12,,6.987397e+08,1.052990e+09
1961,Population,9.164945e+06,,1.140486e+07,,,,,,,,...,,,8.461684e+06,,34100.0,,3.076121e+09,,3.142848e+06,3.876638e+06
1961,GDP,5.488889e+08,,2.434767e+09,,,,,,,,...,,,8.923367e+09,,25700000.0,,1.420440e+12,,6.823597e+08,1.096647e+09
1962,Population,9.343772e+06,,1.169015e+07,,,,,2.128768e+07,,,...,,,8.790590e+06,,36300.0,,3.129064e+09,,3.240664e+06,4.006262e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013,GDP,2.004633e+10,1.278103e+10,2.097035e+11,3.249101e+09,1.249121e+11,1.200588e+09,2.846994e+12,6.239320e+11,1.112147e+10,,...,5.679566e+10,8.017876e+08,3.713366e+11,1.712220e+11,,1.247600e+10,7.643132e+13,3.595450e+10,2.804552e+10,1.349023e+10
2014,Population,3.162751e+07,2.893654e+06,3.893433e+07,,2.422752e+07,9.090000e+04,3.842226e+08,4.298003e+07,3.006154e+06,,...,3.075770e+07,2.588830e+05,,9.072890e+07,,4.294682e+06,7.260780e+09,,1.572134e+07,1.524586e+07
2014,GDP,2.005019e+10,1.327796e+10,2.135185e+11,,1.267751e+11,1.220976e+09,2.873600e+12,5.480549e+11,1.164444e+10,,...,6.313285e+10,8.149546e+08,,1.862047e+11,,1.271560e+10,7.810634e+13,,2.713464e+10,1.419691e+10
2015,Population,3.252656e+07,2.889167e+06,3.966652e+07,,2.502197e+07,9.181800e+04,3.920223e+08,,3.017712e+06,,...,3.129950e+07,,,9.170380e+07,,4.422143e+06,7.346633e+09,,1.621177e+07,1.560275e+07


In [30]:
# unstack level argument (can use index positions or labels) with fill value
# fill_value replaces the NaN produced for missing combinations;
# note that a filled 0 cannot be told apart from a genuine 0 afterwards
world_stacked.unstack(level = 'country', fill_value = 0)

Unnamed: 0_level_0,country,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Arab World,Argentina,Armenia,Aruba,...,Uzbekistan,Vanuatu,"Venezuela, RB",Vietnam,Virgin Islands (U.S.),West Bank and Gaza,World,"Yemen, Rep.",Zambia,Zimbabwe
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1960,Population,8.994793e+06,0.000000e+00,1.112489e+07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,...,0.000000e+00,0.000000e+00,8.146845e+06,0.000000e+00,32000.0,0.000000e+00,3.035056e+09,0.000000e+00,3.049586e+06,3.752390e+06
1960,GDP,5.377778e+08,0.000000e+00,2.723638e+09,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,...,0.000000e+00,0.000000e+00,8.607600e+09,0.000000e+00,24200000.0,0.000000e+00,1.364643e+12,0.000000e+00,6.987397e+08,1.052990e+09
1961,Population,9.164945e+06,0.000000e+00,1.140486e+07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,...,0.000000e+00,0.000000e+00,8.461684e+06,0.000000e+00,34100.0,0.000000e+00,3.076121e+09,0.000000e+00,3.142848e+06,3.876638e+06
1961,GDP,5.488889e+08,0.000000e+00,2.434767e+09,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,...,0.000000e+00,0.000000e+00,8.923367e+09,0.000000e+00,25700000.0,0.000000e+00,1.420440e+12,0.000000e+00,6.823597e+08,1.096647e+09
1962,Population,9.343772e+06,0.000000e+00,1.169015e+07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,2.128768e+07,0.000000e+00,0.0,...,0.000000e+00,0.000000e+00,8.790590e+06,0.000000e+00,36300.0,0.000000e+00,3.129064e+09,0.000000e+00,3.240664e+06,4.006262e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013,GDP,2.004633e+10,1.278103e+10,2.097035e+11,3.249101e+09,1.249121e+11,1.200588e+09,2.846994e+12,6.239320e+11,1.112147e+10,0.0,...,5.679566e+10,8.017876e+08,3.713366e+11,1.712220e+11,0.0,1.247600e+10,7.643132e+13,3.595450e+10,2.804552e+10,1.349023e+10
2014,Population,3.162751e+07,2.893654e+06,3.893433e+07,0.000000e+00,2.422752e+07,9.090000e+04,3.842226e+08,4.298003e+07,3.006154e+06,0.0,...,3.075770e+07,2.588830e+05,0.000000e+00,9.072890e+07,0.0,4.294682e+06,7.260780e+09,0.000000e+00,1.572134e+07,1.524586e+07
2014,GDP,2.005019e+10,1.327796e+10,2.135185e+11,0.000000e+00,1.267751e+11,1.220976e+09,2.873600e+12,5.480549e+11,1.164444e+10,0.0,...,6.313285e+10,8.149546e+08,0.000000e+00,1.862047e+11,0.0,1.271560e+10,7.810634e+13,0.000000e+00,2.713464e+10,1.419691e+10
2015,Population,3.252656e+07,2.889167e+06,3.966652e+07,0.000000e+00,2.502197e+07,9.181800e+04,3.920223e+08,0.000000e+00,3.017712e+06,0.0,...,3.129950e+07,0.000000e+00,0.000000e+00,9.170380e+07,0.0,4.422143e+06,7.346633e+09,0.000000e+00,1.621177e+07,1.560275e+07


In [31]:
# unstack with multiple level argument (can use index positions or labels)
# both listed levels move to the columns, in the order given, leaving the statistic as the row index
world_stacked.unstack(level = ['country', 'year'])

country,Arab World,Arab World,Arab World,Arab World,Arab World,Arab World,Arab World,Arab World,Arab World,Arab World,...,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe
year,2015,2014,2013,2012,2011,2010,2009,2008,2007,2006,...,1969,1968,1967,1966,1965,1964,1963,1962,1961,1960
Population,392022300.0,384222600.0,376504300.0,368802600.0,361031800.0,353112200.0,345054200.0,336886500.0,328766600.0,320906700.0,...,5036321.0,4874113.0,4718612.0,4568320.0,4422132.0,4279561.0,4140804.0,4006262.0,3876638.0,3752390.0
GDP,2530102000000.0,2873600000000.0,2846994000000.0,2773270000000.0,2497945000000.0,2103825000000.0,1798878000000.0,2081343000000.0,1641666000000.0,1404190000000.0,...,1747999000.0,1479600000.0,1397002000.0,1281750000.0,1311436000.0,1217138000.0,1159512000.0,1117602000.0,1096647000.0,1052990000.0


# Pivot and Pivot Table Methods

In [33]:
# load the daily sales data; 'Date' is parsed to datetime and
# 'Salesman' is converted to a categorical column in the same chain
sales = (
    pd.read_csv('data/salesmen.csv', parse_dates = ['Date'])
    .astype({'Salesman': 'category'})
)
sales.head()

Unnamed: 0,Date,Salesman,Revenue
0,2016-01-01,Bob,7172
1,2016-01-02,Bob,6362
2,2016-01-03,Bob,5982
3,2016-01-04,Bob,7917
4,2016-01-05,Bob,7837


In [34]:
# pivot method: reshape long data to wide without aggregating
#   index   -> values that become the row labels
#   columns -> values that become the column headers
#   values  -> column that fills the cells
sales.pivot(index = 'Date', columns = 'Salesman', values = 'Revenue')

Salesman,Bob,Dave,Jeb,Oscar,Ronald
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-01-01,7172,1864,4430,5250,2639
2016-01-02,6362,8278,8026,8661,4951
2016-01-03,5982,4226,5188,7075,2703
2016-01-04,7917,3868,3144,2524,4258
2016-01-05,7837,2287,938,2793,7771
...,...,...,...,...,...
2016-12-27,2045,2843,6666,835,2981
2016-12-28,100,8888,1243,3073,6129
2016-12-29,4115,9490,3498,6424,7662
2016-12-30,2577,3594,8858,7088,2570


In [35]:
# import the food spending data (default RangeIndex, no date parsing)
foods = pd.read_csv('data/foods.csv')
foods.head()

Unnamed: 0,First Name,Gender,City,Frequency,Item,Spend
0,Wanda,Female,Stamford,Weekly,Burger,15.66
1,Eric,Male,Stamford,Daily,Chalupa,10.56
2,Charles,Male,New York,Never,Sushi,42.14
3,Anna,Female,Philadelphia,Once,Ice Cream,11.01
4,Deborah,Female,Philadelphia,Daily,Chalupa,23.49


In [44]:
# pivot table method: group rows by `index` and aggregate `values` with `aggfunc`
# other aggfunc options include sum, std, min, max, count
foods.pivot_table(index = 'Gender', values = 'Spend', aggfunc = 'mean')

Unnamed: 0_level_0,Spend
Gender,Unnamed: 1_level_1
Female,50.709629
Male,49.397623


In [47]:
# pivot table with a multi-index: a list for `index` groups by several columns,
# producing one row level per column (Gender outermost, then Item)
# other aggfunc options include sum, std, min, max, count
foods.pivot_table(
    values = 'Spend',
    index = ['Gender', 'Item'],
    aggfunc = 'mean'
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Spend
Gender,Item,Unnamed: 2_level_1
Female,Burger,49.930488
Female,Burrito,50.092
Female,Chalupa,54.635
Female,Donut,49.926316
Female,Ice Cream,49.788519
Female,Sushi,50.355699
Male,Burger,49.613919
Male,Burrito,48.344819
Male,Chalupa,49.186761
Male,Donut,43.649565


In [49]:
# pivot table with a multi-index and a `columns` argument:
# each distinct City becomes its own column of aggregated Spend
# (`columns` also accepts a list, at the cost of a more complex result)
# other aggfunc options include sum, std, min, max, count
foods.pivot_table(
    values = 'Spend',
    index = ['Gender', 'Item'],
    columns = 'City',
    aggfunc = 'mean'
)

Unnamed: 0_level_0,City,New York,Philadelphia,Stamford
Gender,Item,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,Burger,51.626667,52.87871,45.037778
Female,Burrito,42.563043,52.098571,53.532647
Female,Chalupa,46.135789,52.291562,64.094
Female,Donut,46.670323,54.642,48.734118
Female,Ice Cream,56.356296,46.225625,46.910455
Female,Sushi,47.75129,58.096,45.622188
Male,Burger,58.822273,44.675238,46.424516
Male,Burrito,55.976,43.764333,46.438929
Male,Chalupa,49.1108,48.444783,50.011304
Male,Donut,44.842333,37.859394,49.004483


In [53]:
# pivot_table is also available as a top-level pandas function;
# the frame is passed in as `data` instead of calling the method on it
pd.pivot_table(sales, index = 'Salesman', values = 'Revenue', aggfunc = 'sum')

Unnamed: 0_level_0,Revenue
Salesman,Unnamed: 1_level_1
Bob,1827179
Dave,1859063
Jeb,1918418
Oscar,1777779
Ronald,1827112


# Melt Method

In [54]:
# import the quarterly sales data (wide format: one row per salesman, one column per quarter)
sales_quarter = pd.read_csv('data/quarters.csv')
sales_quarter

Unnamed: 0,Salesman,Q1,Q2,Q3,Q4
0,Boris,602908,233879,354479,32704
1,Bob,43790,514863,297151,544493
2,Tommy,392668,113579,430882,247231
3,Travis,834663,266785,749238,570524
4,Donald,580935,411379,110390,651572
5,Ted,656644,70803,375948,321388
6,Jeb,486141,600753,742716,404995
7,Stacy,479662,742806,770712,2501
8,Morgan,992673,879183,37945,293710


In [59]:
# melt method: reshape wide data to long
#   id_vars    -> column preserved as an identifier on every output row
#   var_name   -> name of the new column holding the former column headers (Q1..Q4)
#   value_name -> name of the new column holding the cell values
# value_vars is not given, so every column other than id_vars is melted
sales_quarter.melt(id_vars = 'Salesman', var_name = 'Quarter', value_name = 'Revenue')

Unnamed: 0,Salesman,Quarter,Revenue
0,Boris,Q1,602908
1,Bob,Q1,43790
2,Tommy,Q1,392668
3,Travis,Q1,834663
4,Donald,Q1,580935
5,Ted,Q1,656644
6,Jeb,Q1,486141
7,Stacy,Q1,479662
8,Morgan,Q1,992673
9,Boris,Q2,233879
