#### General setup.
___

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

<br>

#### Data simulation.
___

In [2]:
# Simulated popularity scores of programming languages for 2014,
# built from a list of values plus a matching list of index labels
pop2014 = pd.Series(
    data=[100, 99.3, 95.5, 93.5, 92.4, 84.8, 84.5, 78.9, 74.3, 72.8],
    index=[
        'Java', 'C', 'C++', 'Python', 'C#',
        'PHP', 'JavaScript', 'Ruby', 'R', 'Matlab',
    ],
)
pop2014

Java          100.0
C              99.3
C++            95.5
Python         93.5
C#             92.4
PHP            84.8
JavaScript     84.5
Ruby           78.9
R              74.3
Matlab         72.8
dtype: float64

In [3]:
# Create the same kind of Series for 2015, this time from a dictionary:
# the keys become the index labels and the values become the data
pop2015 = pd.Series(
    data={
        'Java': 100,
        'C': 99.9,
        'C++': 99.4,
        'Python': 96.5,
        'C#': 91.3,
        'R': 84.8,
        'PHP': 84.5,
        'JavaScript': 83.0,
        'Ruby': 76.2,
        'Matlab': 72.4,
    }
)
pop2015

Java          100.0
C              99.9
C++            99.4
Python         96.5
C#             91.3
R              84.8
PHP            84.5
JavaScript     83.0
Ruby           76.2
Matlab         72.4
dtype: float64

In [4]:
# Combine both years into one frame, one column per year.
# The two Series are aligned on their index labels (the language names),
# so the differing row order of pop2014 and pop2015 does not matter.
two_years = pd.DataFrame(
    data={
        '2014': pop2014,
        '2015': pop2015,
    }
)
two_years

Unnamed: 0,2014,2015
C,99.3,99.9
C#,92.4,91.3
C++,95.5,99.4
Java,100.0,100.0
JavaScript,84.5,83.0
Matlab,72.8,72.4
PHP,84.8,84.5
Python,93.5,96.5
R,74.3,84.8
Ruby,78.9,76.2


In [5]:
# Order the languages by their 2015 score, highest first.
# The sorted frame is rebound to the same name instead of using inplace=True,
# which keeps the step explicit and chainable.
two_years = two_years.sort_values(by='2015', ascending=False)

In [6]:
# Display the frame again to check the row order after sorting by the '2015' column
two_years

Unnamed: 0,2014,2015
Java,100.0,100.0
C,99.3,99.9
C++,95.5,99.4
Python,93.5,96.5
C#,92.4,91.3
R,74.3,84.8
PHP,84.8,84.5
JavaScript,84.5,83.0
Ruby,78.9,76.2
Matlab,72.8,72.4


In [7]:
# Add an 'avg' column holding the mean of the 2014 and 2015 scores for each language
two_years['avg'] = two_years['2014'].add(two_years['2015']).mul(0.5)
two_years

Unnamed: 0,2014,2015,avg
Java,100.0,100.0,100.0
C,99.3,99.9,99.6
C++,95.5,99.4,97.45
Python,93.5,96.5,95.0
C#,92.4,91.3,91.85
R,74.3,84.8,79.55
PHP,84.8,84.5,84.65
JavaScript,84.5,83.0,83.75
Ruby,78.9,76.2,77.55
Matlab,72.8,72.4,72.6


In [8]:
# One row per president: name, year of inauguration, year of birth
presidents = pd.DataFrame(
    data=[
        ('Barack Obama', 2009, 1961),
        ('George W. Bush', 2001, 1946),
        ('Bill Clinton', 1993, 1946),
        ('George H. W. Bush', 1989, 1924),
    ],
    columns=['name', 'inauguration', 'birthyear'],
)

presidents

Unnamed: 0,name,inauguration,birthyear
0,Barack Obama,2009,1961
1,George W. Bush,2001,1946
2,Bill Clinton,1993,1946
3,George H. W. Bush,1989,1924


In [9]:
# Use the president's name as the row index so rows can be looked up by name.
# The result is rebound to the same name instead of mutating with inplace=True.
presidents = presidents.set_index('name')
presidents

Unnamed: 0_level_0,inauguration,birthyear
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Barack Obama,2009,1961
George W. Bush,2001,1946
Bill Clinton,1993,1946
George H. W. Bush,1989,1924


In [10]:
# Look up a single cell by row label and column label in one .loc call.
# This avoids chained indexing (.loc[row][col]), which builds an intermediate
# row object first and is the pattern pandas warns about.
presidents.loc['Bill Clinton', 'inauguration']

1993

In [11]:
# Turn 'name' back from the index into a regular column (it is used as a merge key later).
# The result is rebound to the same name instead of mutating with inplace=True.
presidents = presidents.reset_index()

In [12]:
# Lookup table mapping a president ('son') to his father's name.
# Bill Clinton is deliberately absent from this table.
presidents_fathers = pd.DataFrame(
    data=[
        ('Barack Obama', 'Barack Obama, Sr.'),
        ('George W. Bush', 'George H. W. Bush'),
        ('George H. W. Bush', 'Prescott Bush'),
    ],
    columns=['son', 'father'],
)

In [13]:
# Inner join (the default): only presidents that have a matching 'son' row are kept.
# The redundant 'son' key column is dropped afterwards.
(
    presidents
    .merge(presidents_fathers, left_on='name', right_on='son')
    .drop(columns='son')
)

Unnamed: 0,name,inauguration,birthyear,father
0,Barack Obama,2009,1961,"Barack Obama, Sr."
1,George W. Bush,2001,1946,George H. W. Bush
2,George H. W. Bush,1989,1924,Prescott Bush


In [14]:
# Left join: every president is kept; 'father' is left missing where no match exists.
# The redundant 'son' key column is dropped afterwards.
(
    presidents
    .merge(presidents_fathers, left_on='name', right_on='son', how='left')
    .drop(columns='son')
)

Unnamed: 0,name,inauguration,birthyear,father
0,Barack Obama,2009,1961,"Barack Obama, Sr."
1,George W. Bush,2001,1946,George H. W. Bush
2,Bill Clinton,1993,1946,
3,George H. W. Bush,1989,1924,Prescott Bush


<br>

#### Multilevel Indexing. 
___

In [15]:
# Load seaborn's 'flights' example dataset and preview its first rows
flights = sns.load_dataset('flights')
flights.head()

Unnamed: 0,year,month,passengers
0,1949,Jan,112
1,1949,Feb,118
2,1949,Mar,132
3,1949,Apr,129
4,1949,May,121


In [16]:
# Preview the last rows of the dataset
flights.tail()

Unnamed: 0,year,month,passengers
139,1960,Aug,606
140,1960,Sep,508
141,1960,Oct,461
142,1960,Nov,390
143,1960,Dec,432


In [17]:
# Build a two-level (year, month) row index, leaving 'passengers' as the only column
flight_indexed = flights.set_index(keys=['year', 'month'])
flight_indexed.head(n=12)

Unnamed: 0_level_0,Unnamed: 1_level_0,passengers
year,month,Unnamed: 2_level_1
1949,Jan,112
1949,Feb,118
1949,Mar,132
1949,Apr,129
1949,May,121
1949,Jun,135
1949,Jul,148
1949,Aug,148
1949,Sep,136
1949,Oct,119


In [18]:
# Label-based slice on the outer (year) index level; .loc slices include both
# endpoints, so this is expected to return the rows for 1958 and 1959
flight_indexed.loc[1958:1959]

Unnamed: 0_level_0,Unnamed: 1_level_0,passengers
year,month,Unnamed: 2_level_1
1958,Jan,340
1958,Feb,318
1958,Mar,362
1958,Apr,348
1958,May,363
1958,Jun,435
1958,Jul,491
1958,Aug,505
1958,Sep,404
1958,Oct,359


In [19]:
# Select the single (year, month) row; the row key is given as an explicit tuple
# and the column axis is spelled out so the two index levels cannot be mistaken
# for a row/column pair
flight_indexed.loc[(1960, 'Mar'), :]

passengers    419
Name: (1960, Mar), dtype: int64

In [20]:
# Two-step selection: first take the 1960 cross-section (the result is indexed by
# month only), then slice the months by label from 'Mar' through 'May' inclusive
flight_indexed.loc[1960].loc['Mar':'May']

Unnamed: 0_level_0,passengers
month,Unnamed: 1_level_1
Mar,419
Apr,461
May,472


In [21]:
# Pivot the inner 'month' index level into columns: one row per year, one column per month
flight_unstacked = flight_indexed.unstack(level='month')
flight_unstacked

Unnamed: 0_level_0,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers
month,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1949,112,118,132,129,121,135,148,148,136,119,104,118
1950,115,126,141,135,125,149,170,170,158,133,114,140
1951,145,150,178,163,172,178,199,199,184,162,146,166
1952,171,180,193,181,183,218,230,242,209,191,172,194
1953,196,196,236,235,229,243,264,272,237,211,180,201
1954,204,188,235,227,234,264,302,293,259,229,203,229
1955,242,233,267,269,270,315,364,347,312,274,237,278
1956,284,277,317,313,318,374,413,405,355,306,271,306
1957,315,301,356,348,355,422,465,467,404,347,305,336
1958,340,318,362,348,363,435,491,505,404,359,310,337


In [22]:
# Yearly totals: add up the values across the columns of each row
flight_unstacked.sum(axis='columns')

year
1949    1520
1950    1676
1951    2042
1952    2364
1953    2700
1954    2867
1955    3408
1956    3939
1957    4421
1958    4572
1959    5140
1960    5714
dtype: int64

In [23]:
# Append the yearly total as an extra ('passengers', 'total') column.
# Only the month columns are summed, so re-running this cell does not add a
# previously created 'total' column into the new total.
flight_unstacked['passengers', 'total'] = (
    flight_unstacked
    .loc[:, flight_unstacked.columns.get_level_values('month') != 'total']
    .sum(axis=1)
)
flight_unstacked

Unnamed: 0_level_0,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers,passengers
month,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,total
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
1949,112,118,132,129,121,135,148,148,136,119,104,118,1520
1950,115,126,141,135,125,149,170,170,158,133,114,140,1676
1951,145,150,178,163,172,178,199,199,184,162,146,166,2042
1952,171,180,193,181,183,218,230,242,209,191,172,194,2364
1953,196,196,236,235,229,243,264,272,237,211,180,201,2700
1954,204,188,235,227,234,264,302,293,259,229,203,229,2867
1955,242,233,267,269,270,315,364,347,312,274,237,278,3408
1956,284,277,317,313,318,374,413,405,355,306,271,306,3939
1957,315,301,356,348,355,422,465,467,404,347,305,336,4421
1958,340,318,362,348,363,435,491,505,404,359,310,337,4572


In [24]:
# Move the 'month' column level (now including 'total') back into the row index.
# Each year then spans 13 rows: the twelve months followed by 'total'.
flight_restacked = flight_unstacked.stack(level='month')
flight_restacked.head(n=13)

Unnamed: 0_level_0,Unnamed: 1_level_0,passengers
year,month,Unnamed: 2_level_1
1949,Jan,112
1949,Feb,118
1949,Mar,132
1949,Apr,129
1949,May,121
1949,Jun,135
1949,Jul,148
1949,Aug,148
1949,Sep,136
1949,Oct,119


In [25]:
# Pick only the 'total' rows of the restacked frame:
# slice(None) keeps every year, 'total' selects that label on the month level
flight_restacked.loc[(slice(None), 'total'), 'passengers']

year  month
1949  total    1520
1950  total    1676
1951  total    2042
1952  total    2364
1953  total    2700
1954  total    2867
1955  total    3408
1956  total    3939
1957  total    4421
1958  total    4572
1959  total    5140
1960  total    5714
Name: passengers, dtype: int64

In [26]:
# Keep every year, but only the month labels from 'Jan' through 'Dec',
# which leaves out the 'total' rows
flight_restacked.loc[(slice(None), slice('Jan', 'Dec')), 'passengers']

year  month
1949  Jan      112
      Feb      118
      Mar      132
      Apr      129
      May      121
              ... 
1960  Aug      606
      Sep      508
      Oct      461
      Nov      390
      Dec      432
Name: passengers, Length: 144, dtype: int64

In [27]:
# Months with more than 400 passengers; the 'Jan':'Dec' slice on the month level
# excludes the yearly 'total' rows from the result
(
    flight_restacked
    .loc[flight_restacked['passengers'].gt(400)]
    .loc[pd.IndexSlice[:, 'Jan':'Dec'], 'passengers']
)

year  month
1956  Jul      413
      Aug      405
1957  Jun      422
      Jul      465
      Aug      467
      Sep      404
1958  Jun      435
      Jul      491
      Aug      505
      Sep      404
1959  Mar      406
      May      420
      Jun      472
      Jul      548
      Aug      559
      Sep      463
      Oct      407
      Dec      405
1960  Jan      417
      Mar      419
      Apr      461
      May      472
      Jun      535
      Jul      622
      Aug      606
      Sep      508
      Oct      461
      Dec      432
Name: passengers, dtype: int64

<br>

___
#### End.