In [7]:
import pandas as pd
import numpy as np

# 1. Creating a DataFrame
### 1) Creating a Series

In [8]:
# Five sample prices used throughout this section.
prices = [
    10.7,
    10.86,
    10.74,
    10.71,
    10.79,
]
# No index is passed, so pandas falls back to the default 0..4 integer index.
shares = pd.Series(data=prices)
shares

0    10.70
1    10.86
2    10.74
3    10.71
4    10.79
dtype: float64

### 2) Creating an index

In [9]:
# Label each price with a weekday instead of the default 0..4 positions.
days = [
    'Mon',
    'Tue',
    'Wed',
    'Thur',
    'Fri',
]
shares = pd.Series(data=prices, index=days)
shares

Mon     10.70
Tue     10.86
Wed     10.74
Thur    10.71
Fri     10.79
dtype: float64

### 3) Examining an index

In [10]:
# Inspect the Index object that holds the row labels of `shares`.
shares.index

Index(['Mon', 'Tue', 'Wed', 'Thur', 'Fri'], dtype='object')

In [11]:
# An Index can be read by position; position 2 is the third label.
shares.index[2]

'Wed'

In [12]:
# Take the first two entries by position. `.iloc` states the positional intent
# explicitly; a bare `shares[:2]` on a string-labelled Series leaves it to
# pandas to decide that an integer slice means positions rather than labels.
shares.iloc[:2]

Mon    10.70
Tue    10.86
dtype: float64

### 4) Modifying the index name (`Series.index.name = 'xx'`) and replacing the index labels

In [15]:
# Give the index itself a name; it is displayed as a header above the labels.
shares.index.name = 'weekday'

In [16]:
# Show the Series again: the index name now appears above the labels.
shares

weekday
Mon     10.70
Tue     10.86
Wed     10.74
Thur    10.71
Fri     10.79
dtype: float64

In [17]:
# Index objects are immutable, so assigning to a single label raises TypeError.
# Catch and print the error so the demonstration stays visible without
# aborting a "Restart & Run All".
try:
    shares.index[2] = 'Wednesday'
except TypeError as exc:
    print(f'{type(exc).__name__}: {exc}')

TypeError: Index does not support mutable operations

In [19]:
# Single labels cannot be edited, but the whole index can be replaced at once.
# A plain list carries no name, so the 'weekday' index name is dropped.
shares.index = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
]
shares

Monday       10.70
Tuesday      10.86
Wednesday    10.74
Thursday     10.71
Friday       10.79
dtype: float64

### 5) Assigning the index: 2 methods
- at load time: `pd.read_csv(..., index_col='column_name')`
- after loading: `df.index = df['column_name']`

In [36]:
# Load the Titanic passenger table. No `index_col` is given, so the rows get
# the default integer index.
tan = pd.read_csv('datasets/titanic.csv')
tan.head(n=3)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [37]:
# Promote an existing column to the index after loading.
# The column itself is left in place, so 'name' now appears twice: once as the
# index and once as a regular column.
tan.index = tan['name']
tan.head(n=3)

Unnamed: 0_level_0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
"Allen, Miss. Elisabeth Walton",1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
"Allison, Master. Hudson Trevor",1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
"Allison, Miss. Helen Loraine",1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


### 6) Remove extra columns

In [38]:
# Remove the now-redundant 'name' column; the same labels remain in the index.
del tan['name']
tan.head(n=3)

Unnamed: 0_level_0,pclass,survived,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
"Allen, Miss. Elisabeth Walton",1,1,female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
"Allison, Master. Hudson Trevor",1,1,male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
"Allison, Miss. Helen Loraine",1,0,female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [39]:
# The index now holds the passenger names taken from the former 'name' column.
tan.index

Index(['Allen, Miss. Elisabeth Walton', 'Allison, Master. Hudson Trevor',
       'Allison, Miss. Helen Loraine', 'Allison, Mr. Hudson Joshua Creighton',
       'Allison, Mrs. Hudson J C (Bessie Waldo Daniels)',
       'Anderson, Mr. Harry', 'Andrews, Miss. Kornelia Theodosia',
       'Andrews, Mr. Thomas Jr',
       'Appleton, Mrs. Edward Dale (Charlotte Lamson)',
       'Artagaveytia, Mr. Ramon',
       ...
       'Yasbeck, Mr. Antoni', 'Yasbeck, Mrs. Antoni (Selini Alexander)',
       'Youseff, Mr. Gerious', 'Yousif, Mr. Wazli', 'Yousseff, Mr. Gerious',
       'Zabour, Miss. Hileni', 'Zabour, Miss. Thamine',
       'Zakarian, Mr. Mapriededer', 'Zakarian, Mr. Ortin',
       'Zimmerman, Mr. Leo'],
      dtype='object', name='name', length=1309)

In [40]:
# The index kept the name of the column it was built from.
tan.index.name

'name'

In [41]:
# Check the concrete class of the index object.
type(tan.index)

pandas.core.indexes.base.Index

# Practice 1:

In [73]:
# Choose the index column while reading the file: 'month' becomes the row index.
sales = pd.read_csv('datasets/sales/sales.csv', index_col='month')
sales

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


In [74]:
# Discard the month labels and fall back to plain 0..n-1 positions.
sales.index = pd.RangeIndex(len(sales))

In [75]:
# Display the frame: rows are now labelled 0..5 and the month labels are gone.
sales

Unnamed: 0,eggs,salt,spam
0,47,12.0,17
1,110,50.0,31
2,221,89.0,72
3,77,87.0,20
4,132,,52
5,205,60.0,55


In [76]:
# Reload the file to get the 'month' index back after it was overwritten.
sales = pd.read_csv('datasets/sales/sales.csv', index_col='month')
sales

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


### 1) Create a list of new indexes & assign the new index to sales.index

In [77]:
# Build the upper-cased labels first, then swap them in as the new index.
# The replacement is a plain list, so the old index name ('month') is not kept.
new_index = [label.upper() for label in sales.index]
sales.index = new_index
sales

Unnamed: 0,eggs,salt,spam
JAN,47,12.0,17
FEB,110,50.0,31
MAR,221,89.0,72
APR,77,87.0,20
MAY,132,,52
JUN,205,60.0,55


### 2) Assign the string 'MONTHS' to sales.index.name and 'PRODUCT' to sales.columns.name

In [78]:
# Name the row index; the name is shown as the header of the index column.
sales.index.name = 'MONTHS'
sales

Unnamed: 0_level_0,eggs,salt,spam
MONTHS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
JAN,47,12.0,17
FEB,110,50.0,31
MAR,221,89.0,72
APR,77,87.0,20
MAY,132,,52
JUN,205,60.0,55


In [79]:
# Name the column axis as well; the name is shown in the header row of the table.
sales.columns.name = 'PRODUCT'
sales

PRODUCT,eggs,salt,spam
MONTHS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
JAN,47,12.0,17
FEB,110,50.0,31
MAR,221,89.0,72
APR,77,87.0,20
MAY,132,,52
JUN,205,60.0,55


### 3) Building an index & a DataFrame independently. This is not an ideal route: any mistake in the hand-built index can leave the labels and the data misaligned.

In [82]:
# Replace the labels with a hand-written list. The list has to match the row
# order exactly, and because it is a plain list it carries no index name.
sales.index = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']

In [83]:
# Display the result: the column-axis name 'PRODUCT' is kept, the index name is gone.
sales

PRODUCT,eggs,salt,spam
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


# 2. Hierarchical indexing

In [114]:
# Load the stock quotes with the default integer index and preview the first rows.
stocks = pd.read_csv('datasets/stocks.csv')
stocks.head(n=5)

Unnamed: 0,Date,Close,Volume,Symbol
0,10/3/16,31.5,14070500,CSCO
1,10/3/16,112.52,21701800,AAPL
2,10/3/16,57.42,19189500,MSFT
3,10/4/16,113.0,29736800,AAPL
4,10/4/16,57.24,20085900,MSFT


### 1) Use a tuple (Symbol, Date) to identify each record in the table: `df.set_index(['col1', 'col2'])`

In [115]:
# Use the (Symbol, Date) pair as a two-level row index (a MultiIndex).
# The rows keep their original order; nothing is sorted yet.
stocks = stocks.set_index(['Symbol', 'Date'])
stocks

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Volume
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
CSCO,10/3/16,31.5,14070500
AAPL,10/3/16,112.52,21701800
MSFT,10/3/16,57.42,19189500
AAPL,10/4/16,113.0,29736800
MSFT,10/4/16,57.24,20085900
CSCO,10/4/16,31.35,18460400
MSFT,10/5/16,57.64,16726400
CSCO,10/5/16,31.59,11808600
AAPL,10/5/16,113.05,21453100


In [116]:
# Inspect the MultiIndex: the distinct values of each level are stored once,
# and integer codes map every row to them.
stocks.index

MultiIndex(levels=[['AAPL', 'CSCO', 'MSFT'], ['10/3/16', '10/4/16', '10/5/16']],
           codes=[[1, 0, 2, 0, 2, 1, 2, 1, 0], [0, 0, 0, 1, 1, 1, 2, 2, 2]],
           names=['Symbol', 'Date'])

In [96]:
# The singular `.name` is not set on this MultiIndex (the cell displays
# nothing); the per-level names are available through `.names`.
stocks.index.name

In [97]:
# `.names` (plural) returns the name of every level.
stocks.index.names

FrozenList(['Symbol', 'Date'])

In [98]:
# The level names come back as a FrozenList rather than a plain list.
type(stocks.index.names)

pandas.core.indexes.frozen.FrozenList

### 2) Sorting the index: `df.sort_index()`

In [99]:
# Sort the rows by their index labels: by Symbol first, then by Date within
# each Symbol. The Date labels are strings ('10/3/16'), so they are compared as text.
stocks = stocks.sort_index()
stocks

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Volume
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,10/3/16,112.52,21701800
AAPL,10/4/16,113.0,29736800
AAPL,10/5/16,113.05,21453100
CSCO,10/3/16,31.5,14070500
CSCO,10/4/16,31.35,18460400
CSCO,10/5/16,31.59,11808600
MSFT,10/3/16,57.42,19189500
MSFT,10/4/16,57.24,20085900
MSFT,10/5/16,57.64,16726400


### 3) Indexing individual row

In [102]:
# Select a single row with a (Symbol, Date) tuple; the result is a Series
# holding that row's columns.
stocks.loc[('CSCO', '10/4/16')]

Close           31.35
Volume    18460400.00
Name: (CSCO, 10/4/16), dtype: float64

In [103]:
# Add a column label to pull one scalar value out of that row.
stocks.loc[('CSCO', '10/4/16'), 'Volume']

18460400.0

### 4) Slicing & fancy indexing on the outermost index

In [110]:
# A bare outer-level label selects every row for that symbol; the Symbol level
# is dropped from the result, leaving Date as the index.
stocks.loc['AAPL']

Unnamed: 0_level_0,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
10/3/16,112.52,21701800
10/4/16,113.0,29736800
10/5/16,113.05,21453100


In [111]:
# Label slices on the outer level include both endpoints: the CSCO rows and
# the MSFT rows are all returned.
stocks.loc['CSCO':'MSFT']

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Volume
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
CSCO,10/3/16,31.5,14070500
CSCO,10/4/16,31.35,18460400
CSCO,10/5/16,31.59,11808600
MSFT,10/3/16,57.42,19189500
MSFT,10/4/16,57.24,20085900
MSFT,10/5/16,57.64,16726400


In [118]:
# Fancy indexing on the outer level: a list of symbols combined with one date.
# The trailing `:` selects all columns.
# NOTE(review): the saved output lists MSFT before AAPL, which matches the
# unsorted frame rather than the sorted one shown earlier. The execution counts
# (In [114]-[116], then In [118]) suggest the data was reloaded without
# re-running sort_index() - confirm with Restart & Run All.
stocks.loc[(['AAPL', 'MSFT'], '10/5/16'),:]

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Volume
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
MSFT,10/5/16,57.64,16726400
AAPL,10/5/16,113.05,21453100


In [119]:
# Same row selection restricted to the 'Close' column, which yields a Series.
# NOTE(review): the saved output lists MSFT before AAPL, i.e. it appears to come
# from an unsorted frame - confirm with Restart & Run All.
stocks.loc[(['AAPL', 'MSFT'], '10/5/16'), 'Close']

Symbol  Date   
MSFT    10/5/16     57.64
AAPL    10/5/16    113.05
Name: Close, dtype: float64

### 5) Fancy indexing on the innermost index

In [123]:
# Fancy indexing on the inner level: one symbol with a list of dates.
# In the saved output the rows come back as 10/3/16 then 10/5/16, not in the
# order the dates were listed here.
stocks.loc[('CSCO', ['10/5/16', '10/3/16']),:]

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Volume
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
CSCO,10/3/16,31.5,14070500
CSCO,10/5/16,31.59,11808600


# Practice 2:

In [135]:
# Load the per-state sales table with 'state' as the index. State labels
# repeat across rows, so this index is not unique.
sale = pd.read_csv('datasets/sales.csv', index_col='state')
sale

Unnamed: 0_level_0,month,eggs,salt,spam
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,Jan,47,12.0,17
CA,Feb,110,50.0,31
NY,Mar,221,89.0,72
NY,Apr,77,87.0,20
TX,May,132,,52
TX,Jun,205,60.0,55
TX,Jan,200,50.0,30


In [136]:
# A list of labels returns every row whose index matches any of them
# (all CA rows and all TX rows).
sale.loc[['CA','TX']]

Unnamed: 0_level_0,month,eggs,salt,spam
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,Jan,47,12.0,17
CA,Feb,110,50.0,31
TX,May,132,,52
TX,Jun,205,60.0,55
TX,Jan,200,50.0,30


In [140]:
# Label-based row slice from 'CA' through 'TX', both endpoints included.
# `.loc` makes the row/label intent explicit; plain `sale[...]` selects columns
# for a single key but rows for a slice, which is easy to misread.
sale.loc['CA':'TX']

Unnamed: 0_level_0,month,eggs,salt,spam
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,Jan,47,12.0,17
CA,Feb,110,50.0,31
NY,Mar,221,89.0,72
NY,Apr,77,87.0,20
TX,May,132,,52
TX,Jun,205,60.0,55
TX,Jan,200,50.0,30


In [147]:
# Reload without an index column so that 'state' and 'month' are ordinary
# columns again.
sale = pd.read_csv('datasets/sales.csv')
sale

Unnamed: 0,state,month,eggs,salt,spam
0,CA,Jan,47,12.0,17
1,CA,Feb,110,50.0,31
2,NY,Mar,221,89.0,72
3,NY,Apr,77,87.0,20
4,TX,May,132,,52
5,TX,Jun,205,60.0,55
6,TX,Jan,200,50.0,30


### 1) Set & sort a MultiIndex

In [148]:
# Index by (state, month) and sort, in one chained expression.
sale = sale.set_index(['state', 'month']).sort_index()
sale

Unnamed: 0_level_0,Unnamed: 1_level_0,eggs,salt,spam
state,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,Feb,110,50.0,31
CA,Jan,47,12.0,17
NY,Apr,77,87.0,20
NY,Mar,221,89.0,72
TX,Jan,200,50.0,30
TX,Jun,205,60.0,55
TX,May,132,,52


In [164]:
# Outer-level label only: every NY row, indexed by month.
sale.loc['NY']

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apr,77,87.0,20
Mar,221,89.0,72


In [176]:
# Full (state, month) key: a single row returned as a Series. The displayed
# values are float64 (e.g. eggs shows as 77.0).
NYmonth1 = sale.loc[('NY', 'Apr')]
NYmonth1

eggs    77.0
salt    87.0
spam    20.0
Name: (NY, Apr), dtype: float64

In [179]:
# Several states, one month: a list on the outer level and a scalar on the
# inner level. The trailing `:` selects all columns.
CA_TX_month2 = sale.loc[(['CA', 'TX'], 'Jan'),:]
CA_TX_month2

Unnamed: 0_level_0,Unnamed: 1_level_0,eggs,salt,spam
state,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,Jan,47,12.0,17
TX,Jan,200,50.0,30


In [183]:
# Every state, one month: `pd.IndexSlice[:, 'Jan']` leaves the outer (state)
# level unrestricted and matches 'Jan' on the inner (month) level.
all_month2 = sale.loc[pd.IndexSlice[:, 'Jan'], :]
all_month2

Unnamed: 0_level_0,Unnamed: 1_level_0,eggs,salt,spam
state,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,Jan,47,12.0,17
TX,Jan,200,50.0,30
