In [3]:
import pandas as pd
import numpy as np

In [4]:
# Seeded legacy generator: every random draw in this notebook goes through it.
rang = np.random.RandomState(seed=42)

In [6]:
# Four random integers in [0, 10) wrapped in a Series.
pd.Series(rang.randint(low=0, high=10, size=4))

0    6
1    3
2    7
3    4
dtype: int32

In [10]:
# 3x4 frame of random integers in [0, 10) with columns A..D.
df = pd.DataFrame(rang.randint(low=0, high=10, size=(3, 4)),
                  columns=list('ABCD'))

In [11]:
df

Unnamed: 0,A,B,C,D
0,8,0,9,2
1,6,3,8,2
2,4,2,6,4


In [12]:
# Element-wise scaling; row and column labels are preserved.
df.mul(2)

Unnamed: 0,A,B,C,D
0,16,0,18,4
1,12,6,16,4
2,8,4,12,8


In [15]:
# A dict converts directly to a Series: keys become the index labels.
area = pd.Series(
    data=dict(Alaska=1723337, Texas=695662, California=423967),
    name='area',
)

In [16]:
area

Alaska        1723337
California     423967
Texas          695662
Name: area, dtype: int64

In [17]:
# State -> population; shares only California and Texas with `area`.
population = pd.Series(
    dict(zip(['California', 'Texas', 'New York'],
             [38332521, 26448193, 19651127])),
    name='population',
)

In [18]:
population

California    38332521
New York      19651127
Texas         26448193
Name: population, dtype: int64

In [19]:
# Division aligns on the index; labels missing from either side yield NaN.
population.div(area)

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [20]:
# Two Series whose indexes overlap only on labels 1 and 2.
A = pd.Series(data=[2, 4, 6], index=[0, 1, 2])
B = pd.Series(data=[1, 3, 5], index=[1, 2, 3])

In [24]:
# Plain `A + B` would give NaN at labels found in only one Series (0 and 3
# here); add() with fill_value=0 treats the missing operand as 0 instead.
A.add(B,fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [26]:
# Rebind A to a 2x2 frame of random integers in [0, 20), columns 'A' and 'B'.
A = pd.DataFrame(rang.randint(low=0, high=20, size=(2, 2)),
                 columns=['A', 'B'])

In [27]:
A

Unnamed: 0,A,B
0,8,6
1,17,3


In [28]:
# Mean over every cell of A: stack() first flattens the frame into one Series.
fill = A.stack().mean()

In [29]:
fill

8.5

In [31]:
# Long form of A: one entry per (row label, column label) pair.
A.stack()

0  A     8
   B     6
1  A    17
   B     3
dtype: int32

In [32]:
# A becomes a plain, unnamed Series again (it held a DataFrame just before).
A = pd.Series(data=[2, 4, 6], index=[0, 1, 2])

In [33]:
# Series -> single-column DataFrame. to_frame(name=...) labels the column
# directly; the DataFrame(A, columns=[...]) form only gives this result while
# A is unnamed, because a named Series is matched against `columns` by name.
A.to_frame(name='Name')

Unnamed: 0,Name
0,2
1,4
2,6


In [34]:
# A None entry forces NumPy to store generic Python objects; spell that out.
vals1 = np.array([1, None, 3, 4], dtype=object)

In [35]:
vals1

array([1, None, 3, 4], dtype=object)

In [36]:
# Summing an object array that contains None fails with a TypeError. The
# error is the point of this cell, so catch and print it: the message is
# still shown, but "Restart & Run All" no longer stops here.
try:
    vals1.sum()
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [38]:
# Mixed-type Series holding both kinds of missing marker: NaN and None.
data = pd.Series([1, np.nan, 'hello', None], dtype=object)

In [40]:
# Keep only the entries that are not null (drops both NaN and None).
data[data.notna()]

0        1
2    hello
dtype: object

In [41]:
# 3x3 frame with one NaN in column 0 and one in column 1; column 2 is complete.
df = pd.DataFrame(
    [
        [1, np.nan, 2],
        [2, 3, 5],
        [np.nan, 4, 6],
    ]
)

In [45]:
# Drop every column that contains at least one missing value.
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [49]:
# Add a column labelled 3 that is entirely NaN (this mutates df in place).
df[3] = np.nan

In [50]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [53]:
# thresh counts NON-missing values: a column is kept only if it has at least
# 3 non-NaN entries. Here only column 2 (three valid values) survives.
df.dropna(axis='columns',thresh=3)

Unnamed: 0,2
0,2
1,5
2,6


In [54]:
# Float Series labelled a..e with gaps at 'b' and 'd' (None is stored as NaN).
data = pd.Series([1, np.nan, 2, None, 3],
                 index=['a', 'b', 'c', 'd', 'e'])

In [55]:
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [56]:
# Replace every missing entry with a constant.
data.fillna(value=0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [58]:
# Back-fill: each gap takes the next valid value after it.
# fillna(method=...) is deprecated in recent pandas; bfill() is the
# dedicated method and gives the same result.
data.bfill()

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [59]:
# Forward-fill: each gap takes the last valid value before it.
# fillna(method=...) is deprecated in recent pandas; ffill() is the
# dedicated method and gives the same result.
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

### Hierarchical-Indexing

In [64]:
# Naive approach: label every row with a plain (state, year) tuple.
index = [(state, year)
         for state in ('California', 'New York', 'Texas')
         for year in (2000, 2010)]

# Populations listed in the same (state, year) order as `index`.
populations = [
    33871648, 37253956,
    18976457, 19378102,
    20851820, 25145561,
]

# This is not a good way: the tuples are used as ordinary index keys.
pop = pd.Series(data=populations, index=index)

In [65]:
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [66]:
# With tuple keys, picking one state means scanning the whole index by hand.
list(filter(lambda key: key[0] == 'California', pop.index))

[('California', 2000), ('California', 2010)]

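### There are 3 good ways to handle this case: build a `pd.MultiIndex` from tuples, from arrays, or from a product

##### Method 1 : Create MultiIndex from tuples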

In [67]:
# Rebind `index`: the list of (state, year) tuples becomes a two-level MultiIndex.
index = pd.MultiIndex.from_tuples(tuples=index)

In [68]:
index

MultiIndex(levels=[['California', 'New York', 'Texas'], [2000, 2010]],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

In [73]:
# Re-label the Series with the MultiIndex built above.
pop = pop.reindex(index=index)

In [80]:
# Slice across the first level, fix the second: the 2010 value for every state.
pop[:, 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [81]:
# Partial indexing on the first level: the per-year Series for one state.
pop['California']

2000    33871648
2010    37253956
dtype: int64

In [82]:
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [109]:
# Only the value of the cell's last expression is displayed; the two scalar
# lookups below are evaluated and their results discarded.
# Chained lookup: the state first, then the year within that state's values.
pop['California'][2010]
# The same element addressed with both index levels at once.
pop['California',2010]
# Boolean mask: keep the rows whose population is greater than 19378102.
pop[pop > 19378102]

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64

In [85]:
# Put the MultiIndexed Series next to a second column given in the same row order.
pop_df = pd.DataFrame({
    'total': pop,
    'under18': [
        9267089, 9284094,
        4687374, 4318033,
        5906301, 6879014,
    ],
})

In [86]:
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [87]:
# Pivot the innermost index level (the year) out into the columns.
pop_df.unstack(level=-1)

Unnamed: 0_level_0,total,total,under18,under18
Unnamed: 0_level_1,2000,2010,2000,2010
California,33871648,37253956,9267089,9284094
New York,18976457,19378102,4687374,4318033
Texas,20851820,25145561,5906301,6879014


##### Method 2 : Create MultiIndex from arrays

In [90]:
# One array per level: first-level labels, then second-level labels.
index = pd.MultiIndex.from_arrays([list('aabb'), [1, 2, 1, 3]])

In [92]:
# One-column frame whose rows are labelled by the current MultiIndex.
pd.DataFrame(data={'age': [11, 22, 33, 43]}, index=index)

Unnamed: 0,Unnamed: 1,age
a,1,11
a,2,22
b,1,33
b,3,43


In [93]:
# Method 3: MultiIndex from the Cartesian product of the level values,
# with a name attached to each level.
index = pd.MultiIndex.from_product(
    iterables=[[2013, 2014], [1, 2]],
    names=['year', 'visit'],
)

In [94]:
index

MultiIndex(levels=[[2013, 2014], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=['year', 'visit'])

In [95]:
# Same one-column frame, now labelled by the named (year, visit) index.
pd.DataFrame(data={'age': [11, 22, 33, 43]}, index=index)

Unnamed: 0_level_0,Unnamed: 1_level_0,age
year,visit,Unnamed: 2_level_1
2013,1,11
2013,2,22
2014,1,33
2014,2,43


In [96]:
# Hierarchical column labels: every subject crossed with every measurement type.
columns = pd.MultiIndex.from_product(
    iterables=[['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
    names=['subject', 'type'],
)

In [97]:
columns

MultiIndex(levels=[['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
           names=['subject', 'type'])

In [102]:
# Mock readings: standard-normal noise rounded to one decimal, shifted to 100.
# Drawn from the notebook's seeded generator `rang` rather than the global,
# unseeded np.random, so a fresh Restart-and-Run-All gives the same table.
data = np.round(rang.randn(4, 6), 1) + 100

In [104]:
# 4x6 table: rows keyed by (year, visit), columns keyed by (subject, type).
health_data = pd.DataFrame(data=data, index=index, columns=columns)

In [106]:
# Chained column lookup: select the 'Guido' columns first, then 'HR' among them.
# NOTE(review): health_data['Guido', 'HR'] should address the same column in
# one step - confirm before switching (the resulting Series name would differ).
health_data['Guido']['HR']

year  visit
2013  1         98.4
      2        100.3
2014  1        100.2
      2        102.5
Name: HR, dtype: float64

In [107]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,97.8,101.0,98.4,100.4,99.9,98.7
2013,2,100.4,99.0,100.3,100.6,100.4,100.8
2014,1,98.7,99.9,100.2,99.1,101.1,99.2
2014,2,100.3,98.5,102.5,99.8,99.4,99.9
