# Advanced Indexing
- In pandas, the key building blocks are:
    - Indexes: Sequence of labels
    - Series: 1D array with Index (index + 1 column)
    - DataFrames: 2D array with Series as columns
- Indexes
    - Immutable (Like dictionary keys)
    - Homogeneous in data type (Like NumPy arrays)

# creating indexes

In [1]:
import pandas as pd

# Plain Python list of prices; later cells wrap it in a pandas Series.
prices = [10, 20, 30, 40, 50]
# Show the container type first, then its contents, one per line.
print(type(prices), prices, sep="\n")

<class 'list'>
[10, 20, 30, 40, 50]


In [2]:
# IPython help lookup, left commented out; uncomment to view the pd.Series docstring.
#pd.Series?

## creating series with default indexes

In [3]:
# Values and their matching weekday labels (same length, same order).
prices = [10, 20, 30, 40, 50]
days = ["Mon", "Tue", "Wed", "Thu", "Fri"]

In [4]:
import pandas as pd

# No index is passed, so pandas assigns the default integer labels 0..n-1.
s=pd.Series(data=prices)  # Series(data, index, name); DataFrame(data, index, columns)
s

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [5]:
# Confirm that s is a pandas Series.
type(s)

pandas.core.series.Series

## creating series with customized indexes

In [6]:
# Passing index= replaces the default integer labels with the weekday labels.
s=pd.Series(data=prices, index=days)
s

Mon    10
Tue    20
Wed    30
Thu    40
Fri    50
dtype: int64

In [7]:
# Custom labels do not change the type: s is still a Series.
type(s)

pandas.core.series.Series

### Reading indexes

In [8]:
# The labels are stored in s.index, an Index object.
s.index

Index(['Mon', 'Tue', 'Wed', 'Thu', 'Fri'], dtype='object')

In [9]:
# An Index supports positional access; position 1 is the second label.
s.index[1]

'Tue'

In [10]:
# Slicing an Index returns another Index (positions 1 and 2; the stop is exclusive).
s.index[1:3]

Index(['Tue', 'Wed'], dtype='object')

In [11]:
# A negative stop drops the last three labels and keeps everything before them.
s.index[:-3]

Index(['Mon', 'Tue'], dtype='object')

In [12]:
# A negative start keeps only the last two labels.
s.index[-2:]

Index(['Thu', 'Fri'], dtype='object')

### index names

In [13]:
# The Series as it looks while its index has no name.
s

Mon    10
Tue    20
Wed    30
Thu    40
Fri    50
dtype: int64

In [14]:
# Give the index itself a name; the labels are not affected.
s.index.name="weekday"

In [15]:
# The index name is now printed as a header line above the labels.
s

weekday
Mon    10
Tue    20
Wed    30
Thu    40
Fri    50
dtype: int64

## Indexes are immutable

In [16]:
# Assigning to a single position of an Index is not allowed:
# s.index[2] = "Wednesday"    # TypeError: Index does not support mutable operations

In [17]:
# Assigning to a slice of an Index is not allowed either:
# s.index[:2] = ['Monday','Tuesday']   # TypeError: Index does not support mutable operations

In [18]:
# Individual labels cannot be edited, but the whole index can be replaced at once.
# The replacement list carries no name, so "weekday" no longer appears in the output.
s.index=["Monday","Tuesday","Wednesday","Thursday","Friday"]
s

Monday       10
Tuesday      20
Wednesday    30
Thursday     40
Friday       50
dtype: int64

In [19]:
# Python strings behave the same way: item assignment is not allowed.
a_str = "python"
# a_str[0] = 'J'   # would raise TypeError: 'str' object does not support item assignment

#### We can replace the entire object with a new one

In [20]:
# Rebinding the name to a brand-new string is always possible.
a_str = "Jython"
a_str

'Jython'

# Creating an index from an imported file

In [21]:
import os
# Print the working directory, switch to the data folder, then print it again.
# NOTE(review): hard-coded absolute Windows path; this cell fails on any machine
# without this folder. The read_csv calls in later cells use bare file names and
# therefore rely on this change of directory.
print(os.getcwd())
os.chdir("E:/code/5.DataAnalysisOfficial/data/pandas")
print(os.getcwd())

E:\code\5.DataAnalysisOfficial
E:\code\5.DataAnalysisOfficial\data\pandas


In [22]:
import pandas as pd

# index_col="month" makes the CSV's month column the row index
# instead of the default 0..n-1 integer labels.
df = pd.read_csv("pandas_sales.csv", index_col="month")
df

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


In [23]:
# The index takes its name ('month') from the column it was built from.
df.index

Index(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun'], dtype='object', name='month')

## Hierarchical indexes
- A composite key, in the context of relational databases, is a combination of two or more columns in a table that can be used to uniquely identify each row in the table. Uniqueness is only guaranteed when the columns are combined; when taken individually the columns do not guarantee uniqueness.
- Example: a many-to-many relation (the primary keys of both tables are placed together in another table)

- Hierarchical indexing = composite key

In [24]:
import pandas as pd

# Passing a list to index_col builds a two-level (state, month) row index.
df = pd.read_csv("pandas_sales_hierarchical_indexing.csv",index_col=["state","month"])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,eggs,salt,spam
state,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,1,47,12.0,17
NY,1,221,89.0,72
NY,2,77,87.0,20
TX,1,132,,52
TX,2,205,60.0,55
TX,1,22,2.0,24
CA,2,110,50.0,31


## 3.2 Read Index of DF

In [25]:
# With two index columns the index is a MultiIndex of (state, month) tuples.
df.index

MultiIndex([('CA', 1),
            ('NY', 1),
            ('NY', 2),
            ('TX', 1),
            ('TX', 2),
            ('TX', 1),
            ('CA', 2)],
           names=['state', 'month'])

In [26]:
# .name is None for this MultiIndex; its level names are held in .names instead.
print(df.index.name)

None


In [27]:
# Setting .name on a MultiIndex labels the index object as a whole;
# the per-level names reported by .names are not changed by this assignment.
df.index.name="state_month_composit_key"
print(df.index)

MultiIndex([('CA', 1),
            ('NY', 1),
            ('NY', 2),
            ('TX', 1),
            ('TX', 2),
            ('TX', 1),
            ('CA', 2)],
           name='state_month_composit_key')


In [28]:
# The per-level names are still 'state' and 'month'.
df.index.names

FrozenList(['state', 'month'])

In [29]:
# The DataFrame display is unchanged: the row headers are still state / month.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,eggs,salt,spam
state,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,1,47,12.0,17
NY,1,221,89.0,72
NY,2,77,87.0,20
TX,1,132,,52
TX,2,205,60.0,55
TX,1,22,2.0,24
CA,2,110,50.0,31


### sort indexes

In [30]:
# Order the rows by the index levels (state first, then month).
# sort_index returns a new DataFrame, so the result is assigned back to df.
df = df.sort_index()
df

Unnamed: 0_level_0,Unnamed: 1_level_0,eggs,salt,spam
state,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,1,47,12.0,17
CA,2,110,50.0,31
NY,1,221,89.0,72
NY,2,77,87.0,20
TX,1,132,,52
TX,1,22,2.0,24
TX,2,205,60.0,55


## Reading

In [31]:
# Two-step lookup: df.loc['CA'] keeps only the CA rows (indexed by month),
# then .loc[1] picks month 1. The row comes back as a float64 Series because
# it mixes the integer columns with the float column salt.
df.loc['CA'].loc[1]     #df.loc['CA',1]

eggs    47.0
salt    12.0
spam    17.0
Name: 1, dtype: float64

In [32]:
# The tuple selects the (state, month) row key and 'salt' selects the column.
# The result is a Series that keeps the state/month index, not a bare scalar.
df.loc[('CA',1),'salt']    # series

state  month
CA     1        12.0
Name: salt, dtype: float64

In [33]:
# Selecting on the outer level only returns every 'NY' row, indexed by month.
df.loc['NY']

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,221,89.0,72
2,77,87.0,20


In [34]:
# Label slices include both endpoints: all CA rows and all NY rows.
# Label slicing on a MultiIndex needs a sorted index (see the sort_index cell).
df.loc['CA':'NY']

Unnamed: 0_level_0,Unnamed: 1_level_0,eggs,salt,spam
state,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,1,47,12.0,17
CA,2,110,50.0,31
NY,1,221,89.0,72
NY,2,77,87.0,20


In [35]:
# Row key is (list of states, month): month-1 rows for CA and TX; ':' keeps all columns.
# Both TX month-1 rows are returned because that key occurs twice in the data.
df.loc[(['CA','TX'],1),:]

Unnamed: 0_level_0,Unnamed: 1_level_0,eggs,salt,spam
state,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,1,47,12.0,17
TX,1,132,,52
TX,1,22,2.0,24


### creating DataFrame

In [36]:
import pandas as pd

# One list of values plus the row labels to attach to them.
spam = [17, 31, 72, 20, 52, 55]
month = ["jan", "mar", "feb", "sept", "oct", "nov"]

# A flat list becomes a single column; `columns` supplies that column's name.
df = pd.DataFrame(
    data=spam,
    index=month,
    columns=["spam"],
)
df

Unnamed: 0,spam
jan,17
mar,31
feb,72
sept,20
oct,52
nov,55


In [37]:
import numpy as np

# Three identical value lists, used as the three rows of the frame.
spam1 = [17, 31, 72, 20, 52, 55]
spam2 = [17, 31, 72, 20, 52, 55]
spam3 = [17, 31, 72, 20, 52, 55]
# NOTE(review): "spet" looks like a typo for "sept"; kept unchanged because it
# is a runtime label that also appears in the recorded output.
month = ["jan", "mar", "feb", "apr", "june", "spet"]

# Each inner list is one row, so the months end up as the columns ...
df = pd.DataFrame(
    data=[spam1, spam2, spam3],
    index=["spam1", "spam2", "spam3"],
    columns=month,
)
# ... and transposing puts the months back on the rows for display.
df.T

Unnamed: 0,spam1,spam2,spam3
jan,17,17,17
mar,31,31,31
feb,72,72,72
apr,20,20,20
june,52,52,52
spet,55,55,55
