### Introduction to Pandas Data Structures


In [2]:
from pandas import Series,DataFrame
import pandas as pd
import numpy as np

##### Series
* A Series is a 1D array-like object containing an array of data and an associated array of data labels, called its ***index***.

In [3]:
obj = Series([4,7,-1,2])
obj

0    4
1    7
2   -1
3    2
dtype: int64

In [5]:
obj.values, obj.index

(array([ 4,  7, -1,  2], dtype=int64), RangeIndex(start=0, stop=4, step=1))

In [7]:
obj2 = Series([1,2,3,5], index=['a','b','c','d'])
obj2

a    1
b    2
c    3
d    5
dtype: int64

In [8]:
obj2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [9]:
obj2['a']

1

In [11]:
obj2[['a','d']]

a    1
d    5
dtype: int64

* Converting a Python dict to a Series

In [12]:
sdata =  {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [13]:
states = ['California','Ohio','Texas','Utah']
obj4 = Series(sdata,index=states)
obj4

California        NaN
Ohio          35000.0
Texas         71000.0
Utah           5000.0
dtype: float64

In [14]:
pd.isnull(obj4), pd.notnull(obj4)

(California     True
 Ohio          False
 Texas         False
 Utah          False
 dtype: bool, California    False
 Ohio           True
 Texas          True
 Utah           True
 dtype: bool)

In [15]:
obj4.isnull()

California     True
Ohio          False
Texas         False
Utah          False
dtype: bool

* **Critical Feature** --> arithmetic operations on Series automatically align on index labels; a label missing from either operand yields NaN

In [18]:
obj3+obj4

California         NaN
Ohio           70000.0
Oregon             NaN
Texas         142000.0
Utah           10000.0
dtype: float64

In [19]:
obj4.name = 'population'
obj4.index.name = 'states'
obj4

states
California        NaN
Ohio          35000.0
Texas         71000.0
Utah           5000.0
Name: population, dtype: float64

##### DataFrame

* DataFrame represents a tabular, spreadsheet-like data structure
* DataFrame has both a row and a column index

In [20]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002],
 'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame=DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [21]:
#changing the order of columns
DataFrame(data,columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [22]:
# Creating an empty column and giving explicit row labels:
# 'debt' is not a key of `data`, so that column is filled with NaN
frame2 = DataFrame(data,columns=['year','state','pop','debt'], index=['one','two','three','four','five'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [23]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [24]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [26]:
frame2['year']

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

In [31]:
# Select a single row by its index label with .loc (here the row labelled 'three')
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [34]:
frame2['debt'] = np.arange(5.)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0


In [40]:
# Assigning a column that doesn't exist will create new column
frame2['eastern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,0.0,True
two,2001,Ohio,1.7,1.0,True
three,2002,Ohio,3.6,2.0,True
four,2001,Nevada,2.4,3.0,False
five,2002,Nevada,2.9,4.0,False


In [41]:
del frame2['eastern']
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [46]:
# Nested dict: the outer keys become the column labels and the inner keys
# become the row index; combinations missing from the dict show up as NaN
pop = {'Nevada': {2001: 2.4, 2002: 2.9},'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [43]:
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


### Essential Functionality

##### 1. Reindexing


In [47]:
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [48]:
obj2 = obj.reindex(['a','b','c','d','e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [50]:
obj2 = obj.reindex(['a','b','c','d','e'], fill_value=0)
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [53]:
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3.reindex(range(6), method='ffill')
#ffill/pad --> fill values forward
#bfill/backfill --> fill values backward

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

##### 2. Dropping entries from an axis

In [54]:
obj = Series(np.arange(5.), index=['a','b','c','d','e'])
obj.drop('c')

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [55]:
obj.drop(['d','b'])

a    0.0
c    2.0
e    4.0
dtype: float64

##### 3. Indexing, selection and filtering

In [56]:
data = DataFrame(np.arange(16).reshape((4, 4)),index=['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [57]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [62]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [66]:
data[data['three']>5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [68]:
data.loc[['Colorado','Utah'],['four','one','two']]

Unnamed: 0,four,one,two
Colorado,7,4,5
Utah,11,8,9


##### 4. Arithmetic and Data Alignment

In [69]:
dt1 = DataFrame(np.arange(9.).reshape((3, 3)),index=['Ohio', 'Teaxs','Colorado'],
                 columns=list('bcd'))
dt2 = DataFrame(np.arange(12.).reshape((4, 3)),index=['Utah','Ohio', 'Teaxs','Oregon'],
                 columns=list('bde'))
dt1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Teaxs,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [70]:
dt2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Teaxs,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [72]:
dt1+dt2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Teaxs,9.0,,12.0,
Utah,,,,


In [83]:
dt1.add(dt2, fill_value=0)

Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,
Ohio,3.0,1.0,6.0,5.0
Oregon,9.0,,10.0,11.0
Teaxs,9.0,4.0,12.0,8.0
Utah,0.0,,1.0,2.0


### Summarizing and Computing Descriptive Statistics

In [3]:
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                 [np.nan, np.nan], [0.75, -1.3]],
                index=['a', 'b', 'c', 'd'],
                columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [5]:
df.sum()     # Column Sums

one    9.25
two   -5.80
dtype: float64

In [7]:
df.sum(axis=1)    # Row sums: one total per index label, NaN entries are skipped

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [11]:
df.mean(axis=1, skipna=False) #skipna is True by default

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [12]:
df.idxmax() # idxmin and idxmax return, for each column, the index label where the min / max value is attained

one    b
two    d
dtype: object

In [13]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [14]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


### Unique values, Value Counts and Membership

In [21]:
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
unique = obj.unique()
unique

array(['c', 'a', 'd', 'b'], dtype=object)

In [23]:
obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

In [28]:
# isin --> Compute boolean array indicating whether each Series value is contained in the passed sequence of values
mask = obj.isin(['b','c'])
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

### Handling Missing Data

In [29]:
from numpy import nan as NA

data = Series([1, NA, 3.5, NA, 7])
data.dropna() # drops the NaN entries; same result as the boolean-mask form data[data.notnull()] in the next cell

0    1.0
2    3.5
4    7.0
dtype: float64

In [32]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [33]:
data = DataFrame([[1., 6.5, 3.], [1., NA, NA],[NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna() ## drops every row that contains at least one NaN; returns a new frame, `data` itself is unchanged
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [34]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [35]:
data.dropna(how='all') # drop only the rows in which every column is NaN

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [38]:
# Add a new column labelled 4 whose values are all NaN, so that the
# column-wise dropna(axis=1, how='all') in the next cell has something to remove
data[4] = NA

In [40]:
data.dropna(axis=1,how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [41]:
# Filling information
data = Series([1., NA, 3.5, NA, 7])
data.fillna(0)

0    1.0
1    0.0
2    3.5
3    0.0
4    7.0
dtype: float64

In [42]:
# Forward-fill: every NaN takes the last valid value that precedes it.
# Series.ffill() is used instead of fillna(method='ffill') because the
# `method` argument of fillna is deprecated in pandas 2.1 and later.
data.ffill()

0    1.0
1    1.0
2    3.5
3    3.5
4    7.0
dtype: float64

### Hierarchical Indexing

In [43]:
# Passing a list of two parallel label lists as `index` builds a two-level (hierarchical) index.
# np.random is not seeded here, so the values shown below change on every run.
data = Series(np.random.randn(10),index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],[1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
data

a  1   -0.183671
   2   -0.985175
   3   -0.540695
b  1   -0.018614
   2   -0.415890
   3   -0.214017
c  1    0.486580
   2   -0.154420
d  2   -0.665956
   3    0.565902
dtype: float64

In [44]:
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           codes=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [45]:
data['b']

1   -0.018614
2   -0.415890
3   -0.214017
dtype: float64

In [46]:
data[:,2]

a   -0.985175
b   -0.415890
c   -0.154420
d   -0.665956
dtype: float64

In [47]:
data.unstack()

Unnamed: 0,1,2,3
a,-0.183671,-0.985175,-0.540695
b,-0.018614,-0.41589,-0.214017
c,0.48658,-0.15442,
d,,-0.665956,0.565902
