- 1. Reindexing
- 2. Dropping entries from an axis
- 3. Indexing, Selection, and Filtering
- 4. Arithmetic and Data Alignment
- 5. Function Application and Mapping
- 6. Sorting and Ranking
- 7. Summarizing and Computing Descriptive Statistics
- 8. Unique Values, Value Counts, and Membership

In [6]:
import pandas as pd
import numpy as np
obj = pd.Series([1,2,3,4],index = ['d','b','a','c'])
obj

d    1
b    2
a    3
c    4
dtype: int64

In [4]:
### reindex returns a NEW Series. Do not rebind obj here: the next cell needs the
### original obj (without label 'e') so that fill_value has a new label to fill.
obj.reindex(['a','b','c','d','e'])

a    3.0
b    2.0
c    4.0
d    1.0
e    NaN
dtype: float64

<div style="font-size:100%; text-align:center;"><img src='images/reindex.png' alt="reindex illustration" style="width:700px;height:250px;" /> Figure 1 </div>

In [7]:
obj.reindex(['a','b','c','d','e'],fill_value = 'missing')

a          3
b          2
c          4
d          1
e    missing
dtype: object

In [8]:
obj3 = pd.Series(['blue','purple','yellow'],index=[0,2,4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [11]:
obj3.reindex(np.arange(7),method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
6    yellow
dtype: object

In [None]:
obj3

In [12]:
obj3.reindex(np.arange(12), method='ffill',limit=4)

0       blue
1       blue
2     purple
3     purple
4     yellow
5     yellow
6     yellow
7     yellow
8     yellow
9        NaN
10       NaN
11       NaN
dtype: object

In [None]:
obj5 = pd.Series(['blue','purple','yellow'],index=[0,5,6])

In [None]:
obj5

In [None]:
obj5.reindex(np.arange(9),method='ffill', limit =3)

DROPPING ENTRIES FROM AN AXIS

<div style="font-size:100%; text-align:center;"><img src='images/drop_function.png' alt="drop function illustration" style="width:700px;height:250px;" /> Figure 2 </div>

In [13]:
obj = pd.Series(np.arange(5),index = ['a','b','c','d','e'])
obj

a    0
b    1
c    2
d    3
e    4
dtype: int32

In [14]:
### drop a single index label (returns a new Series; obj itself is unchanged)
obj.drop('a')

b    1
c    2
d    3
e    4
dtype: int32

In [15]:
### drop two or more index labels at once -> pass them as a list
obj.drop(['a','b'])

c    2
d    3
e    4
dtype: int32

In [17]:
### ('a') is NOT a tuple, just the string 'a' in parentheses -> drops the single label 'a'
obj.drop(('a'))

b    1
c    2
d    3
e    4
dtype: int32

In [16]:
### a real tuple is treated as ONE label, so this raises KeyError (intentional; use a list instead)
obj.drop(('a','b'))

KeyError: "[('a', 'b')] not found in axis"

In [18]:
import pandas as pd

In [19]:
### build the DataFrame used below to demonstrate dropping columns
### (inside braces line continuation is implicit, so no backslashes are needed)
data = {
    'State': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Neveda', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.5, 2.4, 2.9, 3.2],
}
df = pd.DataFrame(data)

In [20]:
df

Unnamed: 0,State,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.5
3,Nevada,2001,2.4
4,Neveda,2002,2.9
5,Nevada,2003,3.2


In [23]:
### drop one column; axis=1 selects columns.
### No inplace=True here: the following cells still drop 'State' and 'year' from df,
### which would raise KeyError if 'State' had already been removed permanently.
df.drop('State', axis=1)

Unnamed: 0,year,pop
0,2000,1.5
1,2001,1.7
2,2002,3.5
3,2001,2.4
4,2002,2.9
5,2003,3.2


In [None]:
df.drop(['State','year'],axis=1)

In [None]:
### when using columns=... (or index=...) there is no need to specify axis
df.drop(columns = ['State','year'])

In [None]:
df

In [None]:
df.drop(columns = ['State','year'],inplace = True)

In [None]:
df

SELECTION WITH LOC AND ILOC

<div style="font-size:100%; text-align:center;"><img src='images/loc.png' alt="loc illustration" style="width:700px;height:250px;" /> Figure 3 </div>

<div style="font-size:100%; text-align:center;"><img src='images/iloc.png' alt="iloc illustration" style="width:700px;height:250px;" /> Figure 4 </div>

CREATE A DATAFRAME THAT LOOKS LIKE THE FOLLOWING TABLE


<div style="font-size:100%; text-align:center;"><img src='images/dataframe.png' alt="target DataFrame" style="width:300px;height:250px;" /> Figure 5 </div>

- Select the column ‘one’ using loc
- Select the first row and the columns ‘one’ and ‘two’ by label
- Perform similar selections with integers using iloc
- Select the first 3 rows and the columns ‘one’ and ‘two’ using loc and iloc


In [24]:
### 4x4 frame filled with 0..15 row by row; rows are labelled by state,
### columns by ordinal names (no backslashes needed inside parentheses)
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [25]:
### intentional error: 'one' is a column name and is not among the row labels -> KeyError
data.loc['one']

KeyError: 'one'

In [26]:
data.loc[:,'one']

Ohio         0
Colorado     4
Utah         8
New York    12
Name: one, dtype: int32

Select one row and multiple columns by label

In [27]:
data.loc['Ohio']

one      0
two      1
three    2
four     3
Name: Ohio, dtype: int32

In [28]:
data.loc['Ohio',['one','two']]

one    0
two    1
Name: Ohio, dtype: int32

In [29]:
##select multi rows and multi columns by label
data.loc[['Ohio','Colorado','Utah'],['one','two']]

Unnamed: 0,one,two
Ohio,0,1
Colorado,4,5
Utah,8,9


In [None]:
data.loc[:'Utah',['one','two']]

In [None]:
data.iloc[:3,[0,1]]

ARITHMETIC AND DATA ALIGNMENT

<div style="font-size:100%; text-align:center;"><img src='images/arithmetic.png' alt="arithmetic methods illustration" style="width:500px;height:250px;" /> Figure 6 </div>

In [30]:
### two Series whose indexes only partly overlap ('a', 'c', 'e' are shared)
s1 = pd.Series(
    [7.3, -2.5, 3.4, 1.5],
    index=['a', 'c', 'd', 'e'],
)
s2 = pd.Series(
    [-2.1, 3.6, -1.5, 4, 3.1],
    index=['a', 'c', 'e', 'f', 'g'],
)

In [31]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [32]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [33]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [34]:
s1.add(s2,fill_value=0)

a    5.2
c    1.1
d    3.4
e    0.0
f    4.0
g    3.1
dtype: float64

In [35]:
### two frames with partly overlapping row and column labels
df1 = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    columns=list('bcd'),
    index=['i2', 'i3', 'i4'],
)
df2 = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    columns=list('bcde'),
    index=['i1', 'i2', 'i3', 'i5'],
)

In [36]:
df1 

Unnamed: 0,b,c,d
i2,0,1,2
i3,3,4,5
i4,6,7,8


In [37]:
df2

Unnamed: 0,b,c,d,e
i1,0,1,2,3
i2,4,5,6,7
i3,8,9,10,11
i5,12,13,14,15


In [38]:
df1 + df2

Unnamed: 0,b,c,d,e
i1,,,,
i2,4.0,6.0,8.0,
i3,11.0,13.0,15.0,
i4,,,,
i5,,,,


In [39]:
### why is the result at (i4, e) still NaN? -> df1 has no column 'e' and df2 has no row 'i4',
### so that location is missing in BOTH frames and fill_value has nothing to fill
df1.add(df2,fill_value = 0)

Unnamed: 0,b,c,d,e
i1,0.0,1.0,2.0,3.0
i2,4.0,6.0,8.0,7.0
i3,11.0,13.0,15.0,11.0
i4,6.0,7.0,8.0,
i5,12.0,13.0,14.0,15.0


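With fill_value, a value that is missing in only one of the two DataFrames is replaced by fill_value before the operation. If the data is missing at the corresponding location in both DataFrames (for example (i4, e) above), the result will still be missing.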

In [None]:
df1.rdiv(1)

# OPERATIONS BETWEEN DATAFRAME AND SERIES

- By default, arithmetic between a DataFrame and a Series matches the index of the Series on the DataFrame’s columns, broadcasting down the rows
- If an index value is not found in either the DataFrame’s columns or the Series’s index, the objects will be reindexed to form the union
- If you want to instead broadcast over the columns, matching on the rows, you have to use one of the arithmetic methods (e.g. sub) with axis='index'


Create the following DataFrame and Series:
df = pd.DataFrame(np.arange(12).reshape(4,3),\
                 columns = list('bcd'),\
                 index = ['i1','i2','i3','i4'])
series1 = df.iloc[0]
series2 = df['b']
Try df - series1 and df - series2


In [40]:
import numpy as np
### 4x3 frame holding 0..11 row by row
df = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    columns=list('bcd'),
    index=['i1', 'i2', 'i3', 'i4'],
)
### series1: first row (indexed by the column labels)
series1 = df.iloc[0]
### series2: column 'b' (indexed by the row labels)
series2 = df['b']

In [41]:
series1

b    0
c    1
d    2
Name: i1, dtype: int32

In [42]:
series2

i1    0
i2    3
i3    6
i4    9
Name: b, dtype: int32

In [43]:
df

Unnamed: 0,b,c,d
i1,0,1,2
i2,3,4,5
i3,6,7,8
i4,9,10,11


In [44]:
df - series1

Unnamed: 0,b,c,d
i1,0,0,0
i2,3,3,3
i3,6,6,6
i4,9,9,9


In [45]:
df - series2

Unnamed: 0,b,c,d,i1,i2,i3,i4
i1,,,,,,,
i2,,,,,,,
i3,,,,,,,
i4,,,,,,,


In [46]:
df.sub(series2,axis='index')###matching on the rows

Unnamed: 0,b,c,d
i1,0,1,2
i2,0,1,2
i3,0,1,2
i4,0,1,2


In [47]:
df.sub(series1,axis='columns')

Unnamed: 0,b,c,d
i1,0,0,0
i2,3,3,3
i3,6,6,6
i4,9,9,9


FUNCTION APPLICATION AND MAPPING

- NumPy ufuncs (element-wise array methods) also work with pandas objects.
Ex: np.abs(dataframe)
- Another frequent operation is applying a function on one-dimensional arrays to each column or row:
df.apply(func, axis=0)
- The function passed to apply need not return a scalar value; it can also return a Series with multiple values
- We can use applymap to apply a function to a DataFrame element-wise (since pandas 2.1, applymap is deprecated in favor of DataFrame.map).


In [48]:
df

Unnamed: 0,b,c,d
i1,0,1,2
i2,3,4,5
i3,6,7,8
i4,9,10,11


In [49]:
df.apply(lambda x: x.max()-x.min())

b    9
c    9
d    9
dtype: int64

In [50]:
df.apply(lambda x: x.max()-x.min(),axis='columns')

i1    2
i2    2
i3    2
i4    2
dtype: int64

In [51]:
def stats(x):
    """Return the minimum and maximum of ``x`` as a Series labelled 'min' and 'max'.

    ``x`` must provide ``.min()`` and ``.max()`` (e.g. a row or column handed
    over by ``DataFrame.apply``).
    """
    lowest, highest = x.min(), x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])

In [52]:
df.apply(stats,axis=0)

Unnamed: 0,b,c,d
min,0,1,2
max,9,10,11


In [53]:
### intentional error: applymap calls stats on every single element (a plain int),
### which has no .min()/.max() -> AttributeError
df.applymap(stats)

AttributeError: 'int' object has no attribute 'min'

SORTING AND RANKING

- Sorting a dataset by some criterion is another important built-in operation. 
- To sort lexicographically by row or column index, use the sort_index method, which returns a new, sorted object
- To sort a Series by its value, use its sort_values method


<div style="font-size:100%; text-align:center;"><img src='images/sort.png' alt="sorting illustration" style="width:500px;height:250px;" /> Figure 7 </div>

In [54]:
df1

Unnamed: 0,b,c,d
i2,0,1,2
i3,3,4,5
i4,6,7,8


In [55]:
### put rows and columns out of order so the sorting examples below have something to sort
df1 = df1.reindex(['i3', 'i2', 'i4'])[['d', 'b', 'c']]
df1

Unnamed: 0,d,b,c
i3,5,3,4
i2,2,0,1
i4,8,6,7


In [56]:
df1.sort_index()

Unnamed: 0,d,b,c
i2,2,0,1
i3,5,3,4
i4,8,6,7


In [57]:
df1.sort_index(axis=1)

Unnamed: 0,b,c,d
i3,3,4,5
i2,0,1,2
i4,6,7,8


In [58]:
df1.sort_values(by = 'c',ascending=False)

Unnamed: 0,d,b,c
i4,8,6,7
i3,5,3,4
i2,2,0,1


In [59]:
df1.loc['i3','c'] = np.nan

In [60]:
df1 

Unnamed: 0,d,b,c
i3,5,3,
i2,2,0,1.0
i4,8,6,7.0


In [61]:
df1.sort_values(by='c',na_position = 'first')

Unnamed: 0,d,b,c
i3,5,3,
i2,2,0,1.0
i4,8,6,7.0


In [62]:
df1.sort_values(by='i2',axis=1,ascending=False)

Unnamed: 0,d,c,b
i3,5,,3
i2,2,1.0,0
i4,8,7.0,6


RANKING

In [63]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [64]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [67]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [66]:
obj.rank(method='dense')

0    5.0
1    1.0
2    5.0
3    4.0
4    3.0
5    2.0
6    4.0
dtype: float64

SUMMARIZING AND COMPUTING DESCRIPTIVE STATISTICS

In [None]:
df1

In [None]:
df1.loc['i3','d'] = np.nan
df1.loc['i4','c'] = np.nan

In [None]:
df1

In [None]:
df1.sum(axis=1,skipna=False)

In [None]:
df1.idxmax()

In [None]:
df1['b'].argmax()

In [None]:
df1.prod()

In [None]:
df1['b'].var() ###ddof

In [None]:
df1['b']

In [None]:
np.var([3,0,6])

In [None]:
df1['b'].var(ddof = 0)

# Unique Values, Value Counts, and Membership

In [68]:
obj = pd.Series(['c','a','d','a','a','b','b','c','c'])
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [71]:
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [72]:
obj.nunique()

4

In [73]:
obj.value_counts()

c    3
a    3
b    2
d    1
Name: count, dtype: int64

In [74]:
obj.value_counts(normalize=True)

c    0.333333
a    0.333333
b    0.222222
d    0.111111
Name: proportion, dtype: float64

In [76]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [75]:
obj.isin(['b','c'])

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [77]:
obj[obj.isin(['b','c'])]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [None]:
obj

In [None]:
obj.str.match(pat='(b)|(c)')