<h2>Chapter 5. Getting Started with pandas</h2>

In [2]:
import pandas as pd
import numpy as np

In [3]:
from pandas import DataFrame, Series

<h3>Series</h3>

In [24]:
# A Series is a one-dimensional array-like object containing a sequence of values (of similar types to NumPy types) and an associated array of data labels, called its index. The simplest Series is formed from only an array of data:

In [25]:
obj = Series([1,3,5,6]) #Since we did not specify an index for the data, a default one consisting of the integers 0 through N - 1 (where N is the length of the data) is created. 

In [26]:
obj

0    1
1    3
2    5
3    6
dtype: int64

In [27]:
obj.values

array([1, 3, 5, 6])

In [28]:
obj.index # like range(4)

RangeIndex(start=0, stop=4, step=1)

In [29]:
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c']).astype(np.float64)

In [30]:
obj2

d    4.0
b    7.0
a   -5.0
c    3.0
dtype: float64

In [31]:
obj2['a']

-5.0

In [32]:
obj2[['d','a']]

d    4.0
a   -5.0
dtype: float64

In [33]:
obj2['a','d']

KeyError: 'key of type tuple not found and not a MultiIndex'

In [34]:
obj2[obj2>3]

d    4.0
b    7.0
dtype: float64

In [35]:
# mainly it has all the functions of a 1-d NumPy array, see below:

In [36]:
obj2

d    4.0
b    7.0
a   -5.0
c    3.0
dtype: float64

In [37]:
obj2*4

d    16.0
b    28.0
a   -20.0
c    12.0
dtype: float64

In [38]:
max(obj2)

7.0

In [39]:
np.max(obj2)

7.0

In [40]:
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [41]:
np.sqrt(obj2)

  result = getattr(ufunc, method)(*inputs, **kwargs)


d    2.000000
b    2.645751
a         NaN
c    1.732051
dtype: float64

In [43]:
# obj2=o

In [44]:
np.where(obj2>0,np.sqrt(obj2),0)

  result = getattr(ufunc, method)(*inputs, **kwargs)


array([2.        , 2.64575131, 0.        , 1.73205081])

In [45]:
np.sqrt(obj2[obj2>0])

d    2.000000
b    2.645751
c    1.732051
dtype: float64

In [46]:
np.sqrt(np.where(obj2>0,obj2,0))

array([2.        , 2.64575131, 0.        , 1.73205081])

In [47]:
out = obj2.copy()

In [48]:
out

d    4.0
b    7.0
a   -5.0
c    3.0
dtype: float64

In [50]:
np.sqrt(obj2,out=out)

RecursionError: maximum recursion depth exceeded while calling a Python object

In [51]:
obj2

d    4.0
b    7.0
a   -5.0
c    3.0
dtype: float64

In [52]:
np.sqrt(obj2,out=out,where=[True,True,False,True])

RecursionError: maximum recursion depth exceeded while calling a Python object

In [53]:
np.where(obj2>0,np.sqrt(obj2),obj2)  # so far this is the best method

  result = getattr(ufunc, method)(*inputs, **kwargs)


array([ 2.        ,  2.64575131, -5.        ,  1.73205081])

In [54]:
# %load ignorewarnings.py
import warnings
warnings.filterwarnings('ignore')


In [55]:
np.where(obj2>0,np.sqrt(obj2),obj2)  # so far this is the best method

array([ 2.        ,  2.64575131, -5.        ,  1.73205081])

In [56]:
# Another way to think about a Series is as a fixed-length, ordered dict, as it is a mapping of index values to data values. It can be used in many contexts where you might use a dict:

In [57]:
obj2

d    4.0
b    7.0
a   -5.0
c    3.0
dtype: float64

In [58]:
'd' in obj2

True

In [59]:
'e' in obj2

False

In [60]:
# Create a python series using dict

In [61]:
inp = {'apple':100,
      'oranges':400,
      'pomgrenade':900}

In [62]:
s=Series(inp)

In [63]:
s

apple         100
oranges       400
pomgrenade    900
dtype: int64

In [64]:
'apple' in s

True

In [65]:
# When you are only passing a dict, the index in the resulting Series will have the dict’s keys (in insertion order on pandas >= 0.23 with Python >= 3.6; older versions sorted them). You can override this by passing the dict keys in the order you want them to appear in the resulting Series:

In [66]:
states=['oranges','apple','kela']

In [67]:
s=Series(inp, index=states)

In [68]:
s

oranges    400.0
apple      100.0
kela         NaN
dtype: float64

In [9]:
Series({'Punjab':[100,200]}).index

Index(['Punjab'], dtype='object')

In [69]:
# The isnull and notnull functions in pandas should be used to detect missing data:

In [70]:
s.isnull()

oranges    False
apple      False
kela        True
dtype: bool

In [71]:
s.notnull()

oranges     True
apple       True
kela       False
dtype: bool

In [72]:
s.notna()

oranges     True
apple       True
kela       False
dtype: bool

In [73]:
pd.isnull(s)

oranges    False
apple      False
kela        True
dtype: bool

In [74]:
one = Series({'Punjab':100,
             'Haryana':300,
             'JK':550})

In [75]:
two = Series({'Delhi':200,
             'JK':200})

In [76]:
#A useful Series feature for many applications is that it automatically aligns by index label in arithmetic operations:
one + two 

Delhi        NaN
Haryana      NaN
JK         750.0
Punjab       NaN
dtype: float64

In [77]:
# it's like join operation in databases

In [78]:
one

Punjab     100
Haryana    300
JK         550
dtype: int64

In [79]:
# Both the Series object itself and its index have a name attribute, which integrates with other key areas of pandas functionality:

In [80]:
one.name='Number'

In [81]:
one.index.name='States'

In [82]:
one

States
Punjab     100
Haryana    300
JK         550
Name: Number, dtype: int64

<h2>DataFrames</h2>

In [83]:
data ={'State':['Punjab','Punjab','Haryana','Haryana','JK','JK','JK'],
      'Year':[2000,2001,2000,2001,2000,2001,2002],
      'Population':[12,32,10,30,10,27,31]}

In [84]:
# two ways to create

In [85]:
frame = DataFrame(data)

In [86]:
frame  # see the index here, its similar to what we discussed stdying Series above

Unnamed: 0,State,Year,Population
0,Punjab,2000,12
1,Punjab,2001,32
2,Haryana,2000,10
3,Haryana,2001,30
4,JK,2000,10
5,JK,2001,27
6,JK,2002,31


In [87]:
# another way: pass columns explicitly; they will appear in exactly that order, and if an additional column is given which is not present in the data, it will hold NaN values for all rows

# also, we can pass index

In [88]:
frame2=pd.DataFrame(data, columns=['Year','Population','State','average'],
            index = ['one', 'two','three','four','five','six','seven'])

In [89]:
frame2

Unnamed: 0,Year,Population,State,average
one,2000,12,Punjab,
two,2001,32,Punjab,
three,2000,10,Haryana,
four,2001,30,Haryana,
five,2000,10,JK,
six,2001,27,JK,
seven,2002,31,JK,


In [90]:
frame2['Year']

one      2000
two      2001
three    2000
four     2001
five     2000
six      2001
seven    2002
Name: Year, dtype: int64

In [91]:
frame2.Year

one      2000
two      2001
three    2000
four     2001
five     2000
six      2001
seven    2002
Name: Year, dtype: int64

In [92]:
# although I already know this: rows can be retrieved by label using loc, or by position


In [93]:
frame2.loc['one']

Year            2000
Population        12
State         Punjab
average          NaN
Name: one, dtype: object

In [94]:
# Columns can be modified by assignment. For example, the empty 'average' column could be assigned a scalar value or an array of values:

In [95]:
frame2.columns

Index(['Year', 'Population', 'State', 'average'], dtype='object')

In [96]:
frame2['average']=2

In [97]:
frame2

Unnamed: 0,Year,Population,State,average
one,2000,12,Punjab,2
two,2001,32,Punjab,2
three,2000,10,Haryana,2
four,2001,30,Haryana,2
five,2000,10,JK,2
six,2001,27,JK,2
seven,2002,31,JK,2


In [98]:
frame2.average=[44,4,43,12,45,67,42]

In [99]:
frame2

Unnamed: 0,Year,Population,State,average
one,2000,12,Punjab,44
two,2001,32,Punjab,4
three,2000,10,Haryana,43
four,2001,30,Haryana,12
five,2000,10,JK,45
six,2001,27,JK,67
seven,2002,31,JK,42


In [100]:
frame2['numbering']=np.arange(1,8)

In [101]:
frame2

Unnamed: 0,Year,Population,State,average,numbering
one,2000,12,Punjab,44,1
two,2001,32,Punjab,4,2
three,2000,10,Haryana,43,3
four,2001,30,Haryana,12,4
five,2000,10,JK,45,5
six,2001,27,JK,67,6
seven,2002,31,JK,42,7


In [102]:
# When you are assigning lists or arrays to a column, the value’s length must match the length of the DataFrame. If you assign a Series, its labels will be realigned exactly to the DataFrame’s index, inserting missing values in any holes:

In [103]:
frame2.T

Unnamed: 0,one,two,three,four,five,six,seven
Year,2000,2001,2000,2001,2000,2001,2002
Population,12,32,10,30,10,27,31
State,Punjab,Punjab,Haryana,Haryana,JK,JK,JK
average,44,4,43,12,45,67,42
numbering,1,2,3,4,5,6,7


In [104]:
# The del keyword will delete columns as with a dict.

In [105]:
# NOTE(review): this is a chained assignment, so BOTH 'bools' and 'Year' are set to the scalar 2000
# (the output below shows every Year overwritten). A boolean column would need
# `frame2['Year'] == 2000` -- confirm which was intended.
frame2['bools']=frame2['Year']=2000

In [106]:
frame2

Unnamed: 0,Year,Population,State,average,numbering,bools
one,2000,12,Punjab,44,1,2000
two,2000,32,Punjab,4,2,2000
three,2000,10,Haryana,43,3,2000
four,2000,30,Haryana,12,4,2000
five,2000,10,JK,45,5,2000
six,2000,27,JK,67,6,2000
seven,2000,31,JK,42,7,2000


In [107]:
frame2.rename(columns={'bools':'Punjab'})

Unnamed: 0,Year,Population,State,average,numbering,Punjab
one,2000,12,Punjab,44,1,2000
two,2000,32,Punjab,4,2,2000
three,2000,10,Haryana,43,3,2000
four,2000,30,Haryana,12,4,2000
five,2000,10,JK,45,5,2000
six,2000,27,JK,67,6,2000
seven,2000,31,JK,42,7,2000


In [108]:
# Another method
frame2.rename({'bools':'Punjab'},axis=1)

Unnamed: 0,Year,Population,State,average,numbering,Punjab
one,2000,12,Punjab,44,1,2000
two,2000,32,Punjab,4,2,2000
three,2000,10,Haryana,43,3,2000
four,2000,30,Haryana,12,4,2000
five,2000,10,JK,45,5,2000
six,2000,27,JK,67,6,2000
seven,2000,31,JK,42,7,2000


In [109]:
# Another method
frame2.rename({'bools':'Punjab'},axis='columns', inplace=True)

In [110]:
frame2['Punjab']=frame2['State']=='Punjab'

In [111]:
frame2

Unnamed: 0,Year,Population,State,average,numbering,Punjab
one,2000,12,Punjab,44,1,True
two,2000,32,Punjab,4,2,True
three,2000,10,Haryana,43,3,False
four,2000,30,Haryana,12,4,False
five,2000,10,JK,45,5,False
six,2000,27,JK,67,6,False
seven,2000,31,JK,42,7,False


In [112]:
del frame2['numbering']   # del is used to delete a column of dataframe

In [113]:
frame2

Unnamed: 0,Year,Population,State,average,Punjab
one,2000,12,Punjab,44,True
two,2000,32,Punjab,4,True
three,2000,10,Haryana,43,False
four,2000,30,Haryana,12,False
five,2000,10,JK,45,False
six,2000,27,JK,67,False
seven,2000,31,JK,42,False


In [114]:
frame3 = frame2.copy()

In [115]:
del frame3   # can be used to delete a dataframe as well

In [116]:
frame3

NameError: name 'frame3' is not defined

In [117]:
# CAUTION

# The column returned from indexing a DataFrame is a view on the underlying data, not a copy. Thus, any in-place modifications to the Series will be reflected in the DataFrame. The column can be explicitly copied with the Series’s copy method.

In [118]:
# another way to create a dataframe is to use a nested dict.
# when we pass a nested dict, it will automatically pick up the "outer" keys as its columns and the inner keys as its index
# see below:

In [10]:
inp = {'Punjab':{'population':13, 'languages':['Punjabi','Hindi','English'], 'GB':'Good'},
      'Chennai':{'population':20,'languages':['Tamil','Telugu'], 'GB':'Good'}
      }

In [11]:
frame4 = DataFrame(inp)

In [14]:
frame4

Unnamed: 0,Punjab,Chennai
population,13,20
languages,"[Punjabi, Hindi, English]","[Tamil, Telugu]"
GB,Good,Good


In [15]:
frame4.T

Unnamed: 0,population,languages,GB
Punjab,13,"[Punjabi, Hindi, English]",Good
Chennai,20,"[Tamil, Telugu]",Good


In [123]:
# The keys in the inner dicts are combined to form the index in the result (the book says they are also sorted, but in the output above they keep their insertion order). This isn’t true if an explicit index is specified:

In [124]:
frame4

Unnamed: 0,Punjab,Chennai
population,13,20
languages,"[Punjabi, Hindi, English]","[Tamil, Telugu]"


In [16]:
DataFrame(inp, index=['languages','population','literacy'])

Unnamed: 0,Punjab,Chennai
languages,"[Punjabi, Hindi, English]","[Tamil, Telugu]"
population,13,20
literacy,,


In [126]:
# this is really good below:

In [127]:
 d = dict.fromkeys(range(100))

In [128]:
# list(d)    # r3eal

In [129]:
# However I have learnt something new: how to generate a random dataframe using pandas only.
# I will need to explore this module from pandas
# pd.util

In [None]:
dir(pd.util)

In [130]:
pd.util.testing.makeMixedDataFrame()

Unnamed: 0,A,B,C,D
0,0.0,0.0,foo1,2009-01-01
1,1.0,1.0,foo2,2009-01-02
2,2.0,0.0,foo3,2009-01-05
3,3.0,1.0,foo4,2009-01-06
4,4.0,0.0,foo5,2009-01-07


In [131]:
# dir(pd.util.testing)

In [37]:
def generate_random_df(n_rows=30, seed=None):
    """Return a DataFrame of standard-normal noise with summary-style column names.

    Built directly with NumPy instead of ``pd.util.testing.makeDataFrame``,
    which is deprecated and unavailable in recent pandas releases. The
    defaults keep the layout the rest of this notebook relies on: 30 rows,
    four float columns named 'avg', 'max', 'min', 'std', and an index of
    random 10-character alphanumeric strings.

    Parameters
    ----------
    n_rows : int, default 30
        Number of rows to generate.
    seed : int or None, default None
        Seed for the random generator; pass an int for reproducible output.

    Returns
    -------
    pandas.DataFrame
        Frame of shape ``(n_rows, 4)`` with float64 values.
    """
    import string  # local import so this cell does not depend on an import made in another cell

    rng = np.random.default_rng(seed)

    # Random alphanumeric row labels, 10 characters each (collisions are possible but very unlikely).
    alphabet = np.array(list(string.ascii_letters + string.digits))
    labels = [''.join(rng.choice(alphabet, size=10)) for _ in range(n_rows)]

    return pd.DataFrame(
        rng.standard_normal((n_rows, 4)),
        index=labels,
        columns=['avg', 'max', 'min', 'std'],
    )

In [161]:
# del frame5

In [162]:
frame5=generate_random_df()

In [163]:
frame5.head(3)

Unnamed: 0,avg,max,min,std
i5Ji0sgJeo,-0.446182,-0.63716,-0.616612,-0.242023
ZwWqTwCxbc,-1.238061,-0.824767,0.431741,-2.481099
VtazPktSmr,-0.391535,1.502678,0.777381,-1.187307


In [166]:
frame5.shape

(30, 4)

In [167]:
# Dicts of Series are treated in much the same way

In [168]:
frame6 = DataFrame({'first':frame5['avg'].head(10),
                   'second':frame5['std'].head(10)})

In [169]:
frame6

Unnamed: 0,first,second
i5Ji0sgJeo,-0.446182,-0.242023
ZwWqTwCxbc,-1.238061,-2.481099
VtazPktSmr,-0.391535,-1.187307
GesfDT1hR9,0.937705,-0.589153
TFfJCGZNOT,0.110043,0.840242
TlhsHlMVUG,0.5105,0.611343
HpNnAESrPI,0.284323,0.192256
hKXBbJIUZJ,-0.482126,-0.593057
UYoo7WOX0V,-1.368682,-2.021828
S5nyMGCaud,-2.055448,0.391382


In [170]:
frame6.name='numbers'

In [172]:
frame6.columns.name='names'

In [173]:
frame6

names,first,second
i5Ji0sgJeo,-0.446182,-0.242023
ZwWqTwCxbc,-1.238061,-2.481099
VtazPktSmr,-0.391535,-1.187307
GesfDT1hR9,0.937705,-0.589153
TFfJCGZNOT,0.110043,0.840242
TlhsHlMVUG,0.5105,0.611343
HpNnAESrPI,0.284323,0.192256
hKXBbJIUZJ,-0.482126,-0.593057
UYoo7WOX0V,-1.368682,-2.021828
S5nyMGCaud,-2.055448,0.391382


In [174]:
frame6.index.name='randomness'

In [175]:
frame6

names,first,second
randomness,Unnamed: 1_level_1,Unnamed: 2_level_1
i5Ji0sgJeo,-0.446182,-0.242023
ZwWqTwCxbc,-1.238061,-2.481099
VtazPktSmr,-0.391535,-1.187307
GesfDT1hR9,0.937705,-0.589153
TFfJCGZNOT,0.110043,0.840242
TlhsHlMVUG,0.5105,0.611343
HpNnAESrPI,0.284323,0.192256
hKXBbJIUZJ,-0.482126,-0.593057
UYoo7WOX0V,-1.368682,-2.021828
S5nyMGCaud,-2.055448,0.391382


In [178]:
# like we used Series.values above in the Series section, we can use the same attribute on a pandas dataframe as well
# it will give us a two-dimensional ndarray

In [179]:
frame6.values

array([[-0.44618207, -0.24202283],
       [-1.23806092, -2.48109887],
       [-0.39153479, -1.18730748],
       [ 0.93770544, -0.58915285],
       [ 0.1100427 ,  0.84024242],
       [ 0.51050021,  0.61134275],
       [ 0.28432266,  0.19225588],
       [-0.4821256 , -0.59305739],
       [-1.36868243, -2.02182778],
       [-2.05544831,  0.39138189]])

In [None]:
  #####################        Testing      ##########################################

In [193]:
l = [1,3]

In [194]:
m=[4,5]

In [196]:
DataFrame([l,m])

Unnamed: 0,0,1
0,1,3
1,4,5


In [17]:
np.array( [1,3]).T

array([1, 3])

<img src="part1.png" width=900 height=100 />

<img src="part2.png" width=800 height=100 />

In [198]:
######################################################################################

<h2>Index Objects</h2>

In [199]:
# pandas’s Index objects are responsible for holding the axis labels and other metadata (like the axis name or names). Any array or other sequence of labels you use when constructing a Series or DataFrame is internally converted to an Index:

In [200]:
import string

In [203]:
string.ascii_lowercase

'abcdefghijklmnopqrstuvwxyz'

In [207]:
obj= Series(range(4), index=list(string.ascii_lowercase[:4]))

In [208]:
obj

a    0
b    1
c    2
d    3
dtype: int64

In [210]:
obj.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [216]:
ind = obj.index

In [217]:
type(ind)

pandas.core.indexes.base.Index

In [218]:
# index type is immutable

In [219]:
ind[1]='k'

TypeError: Index does not support mutable operations

In [220]:
ind

Index(['a', 'b', 'c', 'd'], dtype='object')

In [222]:
ind[:2]    # can be used as  1-d array or a python list

Index(['a', 'b'], dtype='object')

In [223]:
# Immutability makes it safer to share Index objects among data structures:

In [224]:
type(ind)

pandas.core.indexes.base.Index

In [232]:
labels = pd.Index(range(3))

In [233]:
labels

RangeIndex(start=0, stop=3, step=1)

In [234]:
type(labels)    # see the difference between lables and ind, one is Index wheras other is RangeIndex

pandas.core.indexes.range.RangeIndex

In [235]:
labels2 = pd.Index(np.arange(3))

In [236]:
labels2

Int64Index([0, 1, 2], dtype='int64')

In [237]:
type(labels2)

pandas.core.indexes.numeric.Int64Index

In [239]:
dir(pd.core.indexes)   # see here diff types of indexes

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'accessors',
 'api',
 'base',
 'category',
 'datetimelike',
 'datetimes',
 'extension',
 'frozen',
 'interval',
 'multi',
 'numeric',
 'period',
 'range',
 'timedeltas']

In [240]:
labels2

Int64Index([0, 1, 2], dtype='int64')

In [249]:
labels3 = pd.Index(list(string.ascii_lowercase[:4]))

In [251]:
labels3

Index(['a', 'b', 'c', 'd'], dtype='object')

In [254]:
obj2 = Series(range(4), index=labels3, dtype=np.float64)

In [255]:
obj2

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [256]:
obj2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [257]:
labels3

Index(['a', 'b', 'c', 'd'], dtype='object')

In [258]:
obj2.index is labels3

True

In [260]:
# CAUTION
# Some users will not often take advantage of the capabilities provided by indexes, but because some operations will yield results containing indexed data, it’s important to understand how they work.

In [264]:
frame6.columns

Index(['first', 'second'], dtype='object', name='names')

In [266]:
frame6.index

Index(['i5Ji0sgJeo', 'ZwWqTwCxbc', 'VtazPktSmr', 'GesfDT1hR9', 'TFfJCGZNOT',
       'TlhsHlMVUG', 'HpNnAESrPI', 'hKXBbJIUZJ', 'UYoo7WOX0V', 'S5nyMGCaud'],
      dtype='object', name='randomness')

In [267]:
'first' in frame6.columns

True

In [268]:
# Unlike Python sets, a pandas Index can contain duplicate labels:

In [269]:
pd.Index(['foo','bar','foo','bar'])

Index(['foo', 'bar', 'foo', 'bar'], dtype='object')

Each Index has a number of methods and properties for set logic, which answer other common questions about the data it contains. Some useful ones are below

In [275]:
# %load '../to_display_full_columns.py'
def full_len():
    """Lift pandas' column-width display limit so long cell text is shown in full."""
    option_name = 'display.max_colwidth'
    # None means "no limit" for this option.
    pd.set_option(option_name, None)


full_len()


In [276]:
# NOTE(review): read_clipboard() parses whatever is currently on the system clipboard, so this
# cell only reproduces the table below if that table was copied right before running it.
indexProprties=pd.read_clipboard()

In [277]:
indexProprties.to_csv('indexProprties.csv')

In [278]:
indexProprties

Unnamed: 0,Method,Description
0,append,"Concatenate with additional Index objects, producing a new Index"
1,difference,Compute set difference as an Index
2,intersection,Compute set intersection
3,union,Compute set union
4,isin,Compute boolean array indicating whether each value is contained in the passed collection
5,delete,Compute new Index with element at index i deleted
6,drop,Compute new Index by deleting passed values
7,insert,Compute new Index by inserting element at index i
8,is_monotonic,Returns True if each element is greater than or equal to the previous element
9,is_unique,Returns True if the Index has no duplicate values


In [279]:
# let's test some methods from the above table

In [280]:
testIndex=pd.Index(list(string.ascii_lowercase[:4]))

In [281]:
testIndex

Index(['a', 'b', 'c', 'd'], dtype='object')

In [284]:
testIndex.append(pd.Index([4]))

Index(['a', 'b', 'c', 'd', 4], dtype='object')

In [287]:
frametest=frame5.head()

In [293]:
frametest.index=frametest.index.append(pd.Index([5]))

ValueError: Length mismatch: Expected axis has 5 elements, new values have 6 elements

In [290]:
frametest

Unnamed: 0,avg,max,min,std
i5Ji0sgJeo,-0.446182,-0.63716,-0.616612,-0.242023
ZwWqTwCxbc,-1.238061,-0.824767,0.431741,-2.481099
VtazPktSmr,-0.391535,1.502678,0.777381,-1.187307
GesfDT1hR9,0.937705,-0.335373,0.342641,-0.589153
TFfJCGZNOT,0.110043,-0.687782,-0.711292,0.840242


<h1>5.2 Essential Functionality</h1>

<h4>Reindexing</h4>

In [3]:
# reindex is a really powerful tool when manipulating and exploring data with pandas.

In [4]:
# let's first talk about Series

In [8]:
import string
import numpy as np

In [17]:
obj = Series(list(string.ascii_letters[:4]), index = np.arange(4))

In [18]:
obj

0    a
1    b
2    c
3    d
dtype: object

In [24]:
obj.reindex(np.arange(3,-1,-1))   # lets rearrange with reverse index

3    d
2    c
1    b
0    a
dtype: object

In [26]:
# For ordered data like time series, it may be desirable to do some interpolation or filling of values when reindexing. The method option allows us to do this, using a method such as ffill, which forward-fills the values:

In [27]:
obj = Series(['Apple','Banana','Orange','Pineapple'], index=np.arange(4))

In [28]:
obj

0        Apple
1       Banana
2       Orange
3    Pineapple
dtype: object

In [31]:
obj.reindex([4,2,0,1,3], method='ffill' )

4    Pineapple
2       Orange
0        Apple
1       Banana
3    Pineapple
dtype: object

In [34]:
obj.reindex([4,2,0,1,3], method='nearest' )

4    Pineapple
2       Orange
0        Apple
1       Banana
3    Pineapple
dtype: object

In [35]:
# reindexing starts here for DataFrames

In [36]:
# for dataframes, we can reindex the row index, the columns, or both
# if we pass only one list, it will by default be used for the rows

In [38]:
# lets first create a random dataframe

In [40]:
import warnings
warnings.filterwarnings('ignore')

In [41]:
frame = generate_random_df()

In [43]:
frame= frame.head()

In [48]:
frame

Unnamed: 0,avg,max,min,std
WXW99aiT9b,0.665444,-0.540849,0.076935,-0.474225
VuPBJfHVnQ,0.318133,1.814329,-0.344318,-1.354298
8KvaHiZzOF,0.496101,-0.85618,0.719137,0.033774
dxtKsQUGFD,-0.284466,-0.200268,-0.145471,0.423443
Dkv3fEz3Xp,-0.220162,1.518133,0.513775,-0.033472


In [53]:
frame.rename({'WXW99aiT9b':'one', 'VuPBJfHVnQ':'two',
             '8KvaHiZzOF':'third', 'dxtKsQUGFD':'fourth', 'Dkv3fEz3Xp':'fifth'}, inplace=True)

In [54]:
frame

Unnamed: 0,avg,max,min,std
one,0.665444,-0.540849,0.076935,-0.474225
two,0.318133,1.814329,-0.344318,-1.354298
third,0.496101,-0.85618,0.719137,0.033774
fourth,-0.284466,-0.200268,-0.145471,0.423443
fifth,-0.220162,1.518133,0.513775,-0.033472


In [56]:
# this is first, when we do not mention axis name etc. So by default it is taking row index
frame.reindex(['two','fifth','fourth','one','third'])

Unnamed: 0,avg,max,min,std
two,0.318133,1.814329,-0.344318,-1.354298
fifth,-0.220162,1.518133,0.513775,-0.033472
fourth,-0.284466,-0.200268,-0.145471,0.423443
one,0.665444,-0.540849,0.076935,-0.474225
third,0.496101,-0.85618,0.719137,0.033774


In [62]:
# this is second method, where we have mentioned 'columns'
columns=['max','avg','min','max']
frame.reindex(columns=columns)

Unnamed: 0,max,avg,min,max.1
one,-0.540849,0.665444,0.076935,-0.540849
two,1.814329,0.318133,-0.344318,1.814329
third,-0.85618,0.496101,0.719137,-0.85618
fourth,-0.200268,-0.284466,-0.145471,-0.200268
fifth,1.518133,-0.220162,0.513775,1.518133


In [63]:
# another thing is we can pass axis, see below:
frame.reindex(columns, axis=1)

Unnamed: 0,max,avg,min,max.1
one,-0.540849,0.665444,0.076935,-0.540849
two,1.814329,0.318133,-0.344318,1.814329
third,-0.85618,0.496101,0.719137,-0.85618
fourth,-0.200268,-0.284466,-0.145471,-0.200268
fifth,1.518133,-0.220162,0.513775,1.518133


In [64]:
# for both reindexing, we can use loc method, see below:

frame.loc[['two', 'fourth'], columns]

Unnamed: 0,max,avg,min,max.1
two,1.814329,0.318133,-0.344318,1.814329
fourth,-0.200268,-0.284466,-0.145471,-0.200268


In [65]:
# NOTE(review): read_clipboard() parses whatever is currently on the system clipboard, so this
# cell only reproduces the arguments table if that table was copied right before running it.
reindexFunctionArguments = pd.read_clipboard()

In [66]:
reindexFunctionArguments.to_csv('reindexFunctionArguments.csv')

In [75]:
# %load '../to_display_full_columns.py'
def full_len():
    """Lift pandas' column-width display limit so long cell text is shown in full."""
    option_name = 'display.max_colwidth'
    # None means "no limit" for this option.
    pd.set_option(option_name, None)


full_len()


In [76]:
reindexFunctionArguments

Unnamed: 0,Argument,Description
0,index,New sequence to use as index. Can be Index instance or any other sequence-like Python data structure. An Index will be used exactly as is without any copying.
1,method,"Interpolation (fill) method; 'ffill' fills forward, while 'bfill' fills backward."
2,fill_value,Substitute value to use when introducing missing data by reindexing.
3,limit,"When forward- or backfilling, maximum size gap (in number of elements) to fill."
4,tolerance,"When forward- or backfilling, maximum size gap (in absolute numeric distance) to fill for inexact matches."
5,level,Match simple Index on level of MultiIndex; otherwise select subset of.
6,copy,"If True, always copy underlying data even if new index is equivalent to old index; if False, do not copy the data when the indexes are equivalent."


In [77]:
frame.reindex(['two','fifth','fourth','one','third'])

Unnamed: 0,avg,max,min,std
two,0.318133,1.814329,-0.344318,-1.354298
fifth,-0.220162,1.518133,0.513775,-0.033472
fourth,-0.284466,-0.200268,-0.145471,0.423443
one,0.665444,-0.540849,0.076935,-0.474225
third,0.496101,-0.85618,0.719137,0.033774


In [79]:
frame.reindex(index=['two','fifth','fourth','one','third'])   #OR

Unnamed: 0,avg,max,min,std
two,0.318133,1.814329,-0.344318,-1.354298
fifth,-0.220162,1.518133,0.513775,-0.033472
fourth,-0.284466,-0.200268,-0.145471,0.423443
one,0.665444,-0.540849,0.076935,-0.474225
third,0.496101,-0.85618,0.719137,0.033774


In [84]:
frame.reindex(index=pd.Index(range(5)), fill_value='jeet')
# two things:
#     we can pass a pd.Index, but it gives NaN since none of those labels exist in the original frame
#     so I passed 'fill_value' to supply a value in place of NaN

Unnamed: 0,avg,max,min,std
0,jeet,jeet,jeet,jeet
1,jeet,jeet,jeet,jeet
2,jeet,jeet,jeet,jeet
3,jeet,jeet,jeet,jeet
4,jeet,jeet,jeet,jeet


In [98]:
# frame.reindex(['one','siz','tww'], method='ffill')

In [96]:
dataframe=pd.DataFrame({'Attendance': {0: 60, 1: 100, 2: 80,3: 75, 4: 95},
                    'Name': {0: 'Olivia', 1: 'John', 2: 'Laura',3: 'Ben',4: 'Kevin'},
                    'Obtained Marks': {0: 56, 1: 75, 2: 82, 3: 64, 4: 67}})

In [97]:
dataframe

Unnamed: 0,Attendance,Name,Obtained Marks
0,60,Olivia,56
1,100,John,75
2,80,Laura,82
3,75,Ben,64
4,95,Kevin,67


In [99]:
frame

Unnamed: 0,avg,max,min,std
one,0.665444,-0.540849,0.076935,-0.474225
two,0.318133,1.814329,-0.344318,-1.354298
third,0.496101,-0.85618,0.719137,0.033774
fourth,-0.284466,-0.200268,-0.145471,0.423443
fifth,-0.220162,1.518133,0.513775,-0.033472


In [106]:
dataframe.reindex([1,2,3,4,5], method='ffill')

Unnamed: 0,Attendance,Name,Obtained Marks
1,100,John,75
2,80,Laura,82
3,75,Ben,64
4,95,Kevin,67
5,95,Kevin,67


In [116]:
frame.reindex(['one','two','siz'], method='ffill')

TypeError: '<' not supported between instances of 'int' and 'str'

In [117]:
# the call above failed: reindex with method='ffill' needs an index that is monotonically increasing or decreasing, and the string labels were not, so I changed the string labels to numeric ones.
# IT WORKED!!!!

In [118]:
frame.rename(index={'one':1, 'two':2, 'third':3, 'fourth':4, 'fifth':5}, inplace=True)

In [126]:
frame.reindex([1,2,5,4,6,7,8,9,10,11], method='ffill') # this added values from '5' index

# but we need to test the paramter 'limit' and 'tolerance'

Unnamed: 0,avg,max,min,std
1,0.665444,-0.540849,0.076935,-0.474225
2,0.318133,1.814329,-0.344318,-1.354298
5,-0.220162,1.518133,0.513775,-0.033472
4,-0.284466,-0.200268,-0.145471,0.423443
6,-0.220162,1.518133,0.513775,-0.033472
7,-0.220162,1.518133,0.513775,-0.033472
8,-0.220162,1.518133,0.513775,-0.033472
9,-0.220162,1.518133,0.513775,-0.033472
10,-0.220162,1.518133,0.513775,-0.033472
11,-0.220162,1.518133,0.513775,-0.033472


In [139]:
frame.reindex([1,2,5,4,6,7,8,9,10,11], method='ffill', tolerance=2)  # see, it worked

Unnamed: 0,1,2,3,4
1,0.665444,-0.540849,0.076935,-0.474225
2,0.318133,1.814329,-0.344318,-1.354298
5,-0.220162,1.518133,0.513775,-0.033472
4,-0.284466,-0.200268,-0.145471,0.423443
6,-0.220162,1.518133,0.513775,-0.033472
7,-0.220162,1.518133,0.513775,-0.033472
8,,,,
9,,,,
10,,,,
11,,,,


In [130]:
frame.rename(columns={'avg':1, 'max':2, 'min':3, 'std':4}, inplace=True)

In [140]:
# frame.reindex(columns=[1,2,5,4,6,7,8,9,10,11], method='ffill', limit=1)

<h4>dropping entries from index</h4>

In [143]:
# let's create a series first

In [149]:
obj = Series(range(4), index=list(string.ascii_lowercase[:4]))

In [150]:
obj

a    0
b    1
c    2
d    3
dtype: int64

In [152]:
obj.drop('c')  # by default it takes row as axis

a    0
b    1
d    3
dtype: int64

In [170]:
# for multiple values 

obj.drop(['c','a'])

b    1
d    3
dtype: int64

<b>for Dataframes</b>

In [9]:
frame = DataFrame(np.arange(16).reshape(4,4), 
         index = ['one','two','three','fourth'],
         columns=['Punjab','Haryana','Delhi','HK'])

In [157]:
frame

Unnamed: 0,Punjab,Haryana,Delhi,HK
one,0,1,2,3
two,4,5,6,7
three,8,9,10,11
fourth,12,13,14,15


In [158]:
frame.drop(['one'])

Unnamed: 0,Punjab,Haryana,Delhi,HK
two,4,5,6,7
three,8,9,10,11
fourth,12,13,14,15


In [160]:
# for columns

frame.drop(['Punjab'], axis=1)

Unnamed: 0,Haryana,Delhi,HK
one,1,2,3
two,5,6,7
three,9,10,11
fourth,13,14,15


In [171]:
frame.drop(['Punjab','HK'], axis= 'columns')

Unnamed: 0,Haryana,Delhi
one,1,2
two,5,6
three,9,10
fourth,13,14


In [172]:
frame

Unnamed: 0,Punjab,Haryana,Delhi,HK
one,0,1,2,3
two,4,5,6,7
three,8,9,10,11
fourth,12,13,14,15


In [178]:
# for deleting both rows and columns

frame.drop(['Punjab'], axis='columns').drop('two', axis='rows')

Unnamed: 0,Haryana,Delhi,HK
one,1,2,3
three,9,10,11
fourth,13,14,15


In [None]:
# be careful while using inplace, it destroys any data that is dropped

In [10]:
frame.drop(['Punjab'], axis='columns').drop('two', axis='rows')

Unnamed: 0,Haryana,Delhi,HK
one,1,2,3
three,9,10,11
fourth,13,14,15


In [11]:
frame

Unnamed: 0,Punjab,Haryana,Delhi,HK
one,0,1,2,3
two,4,5,6,7
three,8,9,10,11
fourth,12,13,14,15


In [15]:
frame.drop(['Punjab'], axis='columns').drop('two', axis='rows', inplace=True)

KeyError: "['Punjab'] not found in axis"

In [16]:
frame

Unnamed: 0,Haryana,Delhi,HK
one,1,2,3
two,5,6,7
three,9,10,11
fourth,13,14,15


Indexing, Selection, and Filtering

In [17]:
# Indexing

In [18]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])

In [19]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [23]:
obj['b']

1.0

In [24]:
obj[1]

1.0

In [26]:
obj[0:2]

a    0.0
b    1.0
dtype: float64

In [27]:
obj[['a','b']]

a    0.0
b    1.0
dtype: float64

In [28]:
obj[[1,2]]

b    1.0
c    2.0
dtype: float64

In [29]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [30]:
obj[obj<3]

a    0.0
b    1.0
c    2.0
dtype: float64

In [32]:
# Now see one difference:
#     when we slice based on integer position, the upper bound is exclusive
#     but when we slice based on labels, the upper bound is inclusive

In [35]:
obj[0:3]    # index 3 is excluded

a    0.0
b    1.0
c    2.0
dtype: float64

In [36]:
obj['a':'d']

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [37]:
obj['a':'c'] =10   # behaves same way as numpy

In [38]:
obj

a    10.0
b    10.0
c    10.0
d     3.0
dtype: float64

In [39]:
# DataFrame starts here

In [40]:
# mainly in 2d array we need to see the row indexing, column indexing

In [41]:
# 4x4 demo frame: the integers 0..15 laid out row by row,
# with state names as row labels and ordinal names as column labels.
data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

In [42]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [43]:
data[:2]  # rows

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [44]:
data['Ohio':'Colorado']

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [48]:
data[['one','three']]

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [96]:
data['Ohio']  #The row selection syntax data[:2] is provided as a convenience. Passing a single element or a list to the [] operator selects columns.

KeyError: 0

In [49]:
# Another use case is boolean

In [50]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [52]:
data>2

Unnamed: 0,one,two,three,four
Ohio,False,False,False,True
Colorado,True,True,True,True
Utah,True,True,True,True
New York,True,True,True,True


In [54]:
data[data>2]=0

In [55]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,0
Colorado,0,0,0,0
Utah,0,0,0,0
New York,0,0,0,0


In [56]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
   .....:                     index=['Ohio', 'Colorado', 'Utah', 'New York'],
   .....:                     columns=['one', 'two', 'three', 'four'])

In [61]:
data[data<8]

Unnamed: 0,one,two,three,four
Ohio,0.0,1.0,2.0,3.0
Colorado,4.0,5.0,6.0,7.0
Utah,,,,
New York,,,,


In [62]:
data[data<8]=0.0

In [63]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,0,0,0
Utah,8,9,10,11
New York,12,13,14,15


In [64]:
# selection with loc and iloc

In [67]:
# with plain [] a label (or a list of labels) selects columns, not rows
#     e.g. data[['one', 'two']]

# So, to overcome this, we have two methods  -- loc and iloc

# iloc is for integer (position) index and loc is for label index

In [68]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,0,0,0
Utah,8,9,10,11
New York,12,13,14,15


In [70]:
data.one

Ohio         0
Colorado     0
Utah         8
New York    12
Name: one, dtype: int64

In [72]:
# data.Utah   #not working

In [73]:
data.loc[['Ohio']]

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0


In [77]:
data.loc[['Ohio'],'one']

Ohio    0
Name: one, dtype: int64

In [78]:
data.loc[['Ohio'],['one','two']]

Unnamed: 0,one,two
Ohio,0,0


In [84]:
# iloc starts here

In [85]:
data.iloc[[0]]

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0


In [82]:
data.iloc[[0],0]

Ohio    0
Name: one, dtype: int64

In [83]:
data.iloc[[0],[0,1]]

Unnamed: 0,one,two
Ohio,0,0


In [87]:
data.loc[:'Utah']

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,0,0,0
Utah,8,9,10,11


In [88]:
data.loc[:'Utah','one']

Ohio        0
Colorado    0
Utah        8
Name: one, dtype: int64

In [90]:
data.iloc[:2,:3]

Unnamed: 0,one,two,three
Ohio,0,0,0
Colorado,0,0,0


In [97]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,0,0,0
Utah,8,9,10,11
New York,12,13,14,15


In [101]:
data.iloc[:, data[data>1]]

IndexError: positional indexers are out-of-bounds

In [105]:
data.iloc[:,[True,False,True,True]]

Unnamed: 0,one,three,four
Ohio,0,0,0
Colorado,0,0,0
Utah,8,10,11
New York,12,14,15


In [111]:
data.iloc[:2]

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,0,0,0


In [113]:
# data.loc[:2]   # gives error

In [119]:
data.loc[:,[True,False,True]]

IndexError: Boolean index has wrong length: 3 instead of 4

In [120]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,0,0,0
Utah,8,9,10,11
New York,12,13,14,15


In [124]:
data.at['Utah','one']

8

In [125]:
# get same value using i(integer)

In [126]:
data.iat[2,0]

8

In [129]:
p=pd.get_dummies(data)

In [130]:
p

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,0,0,0
Utah,8,9,10,11
New York,12,13,14,15


In [131]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,0,0,0
Utah,8,9,10,11
New York,12,13,14,15


In [132]:
data['region']= ['Punjabi','Haryanavi','Punjabi','Haryanavi']

In [133]:
data

Unnamed: 0,one,two,three,four,region
Ohio,0,0,0,0,Punjabi
Colorado,0,0,0,0,Haryanavi
Utah,8,9,10,11,Punjabi
New York,12,13,14,15,Haryanavi


In [134]:
pd.get_dummies(data)

Unnamed: 0,one,two,three,four,region_Haryanavi,region_Punjabi
Ohio,0,0,0,0,0,1
Colorado,0,0,0,0,1,0
Utah,8,9,10,11,0,1
New York,12,13,14,15,1,0


In [135]:
# DataFrame.get_value is not available in this pandas version;
# .at is the label-based accessor for a single scalar value.
data.at['Ohio', 'two']

0

In [139]:
indexingOptionWithDataframe=pd.read_clipboard()

In [140]:
indexingOptionWithDataframe.to_csv('indexingOptionWithDataframe.csv')

In [141]:
indexingOptionWithDataframe

Unnamed: 0,Type,Notes
0,df[val],Select single column or sequence of columns fr...
1,df.loc[val],Selects single row or subset of rows from the ...
2,"df.loc[:, val]",Selects single column or subset of columns by ...
3,"df.loc[val1, val2]",Select both rows and columns by label
4,df.iloc[where],Selects single row or subset of rows from the ...
5,"df.iloc[:, where]",Selects single column or subset of columns by ...
6,"df.iloc[where_i, where_j]",Select both rows and columns by integer position
7,"df.at[label_i, label_j]",Select a single scalar value by row and column...
8,"df.iat[i, j]",Select a single scalar value by row and column...
9,reindex method,Select either rows or columns by labels


In [145]:
# %load '../to_display_full_columns.py'
def full_len():
    """Set pandas' ``display.max_colwidth`` option to ``None`` so long cell text is shown in full."""
    pd.options.display.max_colwidth = None
full_len()


In [146]:
indexingOptionWithDataframe

Unnamed: 0,Type,Notes
0,df[val],"Select single column or sequence of columns from the DataFrame; special case conveniences: boolean array (filter rows), slice (slice rows), or boolean DataFrame (set values based on some criterion)"
1,df.loc[val],Selects single row or subset of rows from the DataFrame by label
2,"df.loc[:, val]",Selects single column or subset of columns by label
3,"df.loc[val1, val2]",Select both rows and columns by label
4,df.iloc[where],Selects single row or subset of rows from the DataFrame by integer position
5,"df.iloc[:, where]",Selects single column or subset of columns by integer position
6,"df.iloc[where_i, where_j]",Select both rows and columns by integer position
7,"df.at[label_i, label_j]",Select a single scalar value by row and column label
8,"df.iat[i, j]",Select a single scalar value by row and column position (integers)
9,reindex method,Select either rows or columns by labels


In [4]:
# integer indexing
import numpy as np

In [9]:
ser = Series(np.arange(3.))

In [10]:
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [12]:
ser[0]

0.0

In [13]:
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])

In [14]:
ser2[-1]

2.0

In [15]:
ser2

a    0.0
b    1.0
c    2.0
dtype: float64

In [18]:
ser2[0]

0.0

In [19]:
ser_lis=list(np.arange(4))

In [20]:
ser_lis[-1]

3

In [21]:
# so what I have learnt is that while getting element from python list, index from backward works

In [22]:
ser_lis[-1]

3

In [23]:
# but that is not the case for Series/DataFrames with an integer index, because there is ambiguity there: is -1 a label or a position?

In [25]:
# ser[-1] # gives error

In [26]:
# now, this works perfectly fine when the index are label based

In [27]:
ser2

a    0.0
b    1.0
c    2.0
dtype: float64

In [28]:
ser2[-1]

2.0

In [29]:
ser2.loc['a']

0.0

In [30]:
ser.iloc[-1]

2.0

In [32]:
# so for more precise handling, we need to/ should use loc and iloc

In [33]:
# on the other hand, slicing with integers is always position based

ser[:2]

0    0.0
1    1.0
dtype: float64

In [34]:
# Arithmetic and data alignment

In [35]:
# lets dive in !!!

#  Note: in general, when you are adding together objects, if any index pairs are not the same, the respective index in the result will be the union of the index pairs. For users with database experience, this is similar to an automatic outer join on the index labels.

In [38]:
ser = Series(np.arange(3),index=list('abc'))

In [39]:
ser

a    0
b    1
c    2
dtype: int64

In [40]:
ser2 = Series([-3,4,9],index=list('abd'))

In [41]:
ser2

a   -3
b    4
d    9
dtype: int64

In [42]:
ser+ser2

a   -3.0
b    5.0
c    NaN
d    NaN
dtype: float64

In [43]:
# In the case of DataFrame, alignment is performed on both the rows and the columns:

In [45]:
frame1 = DataFrame(np.arange(9).reshape(3,3)
                  , index=list('abc')
                  , columns=['Punjab','Haryana','Delhi'])

In [47]:
frame2 = DataFrame(np.arange(12.).reshape(4,3)
                  , index=list('abcd')
                  ,columns=['Punjab','JK','Haryana'])

In [48]:
frame1

Unnamed: 0,Punjab,Haryana,Delhi
a,0,1,2
b,3,4,5
c,6,7,8


In [49]:
frame2

Unnamed: 0,Punjab,JK,Haryana
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
d,9.0,10.0,11.0


In [50]:
frame1+frame2

Unnamed: 0,Delhi,Haryana,JK,Punjab
a,,3.0,,0.0
b,,9.0,,6.0
c,,15.0,,12.0
d,,,,


In [52]:
# If you add DataFrame objects with no column or row labels in common, the result will contain all nulls:

In [57]:
fr1 = DataFrame({'a':[1,2]})

In [58]:
fr2 = DataFrame({'a':[4,3]})

In [59]:
fr3 = DataFrame({'b':[4,3]})

In [60]:
fr1+fr2

Unnamed: 0,a
0,5
1,5


In [61]:
fr1+fr3

Unnamed: 0,a,b
0,,
1,,


Arithmetic operations with fill value

In [62]:
frame1+frame2

Unnamed: 0,Delhi,Haryana,JK,Punjab
a,,3.0,,0.0
b,,9.0,,6.0
c,,15.0,,12.0
d,,,,


In [63]:
# if we need some value instead of NaN, we need to use the arithmetic method (e.g. add) with fill_value, see below:

In [75]:
# frame1.add(frame2, fill_value=0.)

In [70]:
#### to show all outputs in parallel (side by side)  #####

In [4]:
from IPython.display import display, HTML

CSS = """
.output {
    flex-direction: row;
}
"""

HTML('<style>{}</style>'.format(CSS))

In [74]:
display(frame1)
display(frame2)

Unnamed: 0,Punjab,Haryana,Delhi
a,0,1,2
b,3,4,5
c,6,7,8


Unnamed: 0,Punjab,JK,Haryana
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
d,9.0,10.0,11.0


In [77]:
frame1.add(frame2, fill_value=0. )

# point to note: fill_value is not a default for NaN results; it stands in for a value that is missing in only one of the frames before the add is performed, so a cell missing from both frames (e.g. row d / Delhi) is still NaN

Unnamed: 0,Delhi,Haryana,JK,Punjab
a,2.0,3.0,1.0,0.0
b,5.0,9.0,4.0,6.0
c,8.0,15.0,7.0,12.0
d,,11.0,10.0,9.0


In [80]:
# %load '../to_display_full_columns.py'
def full_len():
    """Set pandas' ``display.max_colwidth`` option to ``None`` so long cell text is shown in full."""
    pd.options.display.max_colwidth = None
full_len()


In [81]:
flexibleArithmeticMethods=pd.read_clipboard()

In [82]:
flexibleArithmeticMethods.to_csv('flexibleArithmeticMethods.csv')

In [83]:
flexibleArithmeticMethods

Unnamed: 0,Method,Description
0,"add, radd",Methods for addition (+)
1,"sub, rsub",Methods for subtraction (-)
2,"div, rdiv",Methods for division (/)
3,"floordiv, rfloordiv",Methods for floor division (//)
4,"mul, rmul",Methods for multiplication (*)
5,"pow, rpow",Methods for exponentiation (**)


In [84]:
frame1

Unnamed: 0,Punjab,Haryana,Delhi
a,0,1,2
b,3,4,5
c,6,7,8


In [86]:
1/frame1

Unnamed: 0,Punjab,Haryana,Delhi
a,inf,1.0,0.5
b,0.333333,0.25,0.2
c,0.166667,0.142857,0.125


In [87]:
# above is same as:
frame1.rdiv(1)

Unnamed: 0,Punjab,Haryana,Delhi
a,inf,1.0,0.5
b,0.333333,0.25,0.2
c,0.166667,0.142857,0.125


In [88]:
# similarly other methods

In [89]:
frame1.mul(2)

Unnamed: 0,Punjab,Haryana,Delhi
a,0,2,4
b,6,8,10
c,12,14,16


In [93]:
display(frame1)
display(frame1.rmul(2))
display(frame1.mul(2))      
# just an observation

Unnamed: 0,Punjab,Haryana,Delhi
a,0,1,2
b,3,4,5
c,6,7,8


Unnamed: 0,Punjab,Haryana,Delhi
a,0,2,4
b,6,8,10
c,12,14,16


Unnamed: 0,Punjab,Haryana,Delhi
a,0,2,4
b,6,8,10
c,12,14,16


# Operation between dataframe and Series

In [95]:
# As with NumPy arrays of different dimensions, arithmetic between DataFrame and Series is also defined.

In [96]:
columns = ['Punjab','JK','Haryana','Delhi']

In [97]:
frame = DataFrame(np.arange(16).reshape(4,4)
                  , columns =columns
                 , index=list('abcd'))

In [98]:
frame

Unnamed: 0,Punjab,JK,Haryana,Delhi
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15


In [106]:
ser= frame.iloc[0]

In [107]:
ser

Punjab     0
JK         1
Haryana    2
Delhi      3
Name: a, dtype: int64

In [109]:
display(frame)
display(ser)
display(frame-ser)

Unnamed: 0,Punjab,JK,Haryana,Delhi
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15


Punjab     0
JK         1
Haryana    2
Delhi      3
Name: a, dtype: int64

Unnamed: 0,Punjab,JK,Haryana,Delhi
a,0,0,0,0
b,4,4,4,4
c,8,8,8,8
d,12,12,12,12


In [34]:
import sys

In [35]:
sys.path.append('../randomss/')

In [36]:
from generateStates import genertes

In [38]:
genertes(4)

array(['Uttar Pradesh', 'Andaman and Nicobar Islands', 'Daman and Diu',
       'Andhra Pradesh'], dtype='<U35')

In [115]:
ser2

a    0
b    1
d    2
e    3
dtype: int64

In [113]:
frame-ser2

Unnamed: 0,Delhi,Haryana,JK,Punjab,a,b,d,e
a,,,,,,,,
b,,,,,,,,
c,,,,,,,,
d,,,,,,,,


Function Application and Mapping

In [1]:
# NumPy ufuncs (element-wise array methods) also work with pandas objects:

In [39]:
import numpy as np
def genertes(n, path='states.txt'):
    """Pick ``n`` distinct state names at random from a text file.

    Parameters
    ----------
    n : int
        Number of names to draw.
    path : str, optional
        Text file holding one state name per line. Defaults to
        ``'states.txt'``, the file the original version always read.

    Returns
    -------
    numpy.ndarray
        ``n`` names with all spaces removed, sampled without replacement
        using NumPy's global random state.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when ``n`` exceeds the number of
        names available in the file.
    """
    # Read the lines with plain file I/O rather than
    # np.loadtxt(..., delimiter="\n"): this does not rely on loadtxt accepting
    # a newline delimiter and also copes with a file that has a single line.
    with open(path, encoding='utf-8') as handle:
        cleaned = [line.replace(' ', '').strip() for line in handle]
    # Blank lines carry no name, so they are dropped before sampling.
    states = np.array([name for name in cleaned if name])
    return np.random.choice(states, n, replace=False)

In [43]:
frame = DataFrame(np.random.randn(4,4)
                 ,index= genertes(4)
                 ,columns=list('abcd'))

In [44]:
frame

Unnamed: 0,a,b,c,d
Bihar,-0.429453,-0.114281,-2.002858,1.374004
Mizoram,-0.827583,-0.565632,0.313063,-0.425758
Chandigarh,0.657578,-1.158674,-1.3577,-0.063732
TamilNadu,-1.221454,-0.900075,0.164019,-0.505107


In [45]:
np.abs(frame)

Unnamed: 0,a,b,c,d
Bihar,0.429453,0.114281,2.002858,1.374004
Mizoram,0.827583,0.565632,0.313063,0.425758
Chandigarh,0.657578,1.158674,1.3577,0.063732
TamilNadu,1.221454,0.900075,0.164019,0.505107


In [46]:
# the apply method applies a function to each column by default (axis=0), i.e. it works down the rows

In [50]:
def f(values):
    """Return the spread of ``values``: its maximum minus its minimum.

    ``values`` is any object that provides ``max()`` and ``min()`` methods
    (here: the Series handed over by ``DataFrame.apply``).
    """
    return values.max() - values.min()

In [51]:
frame.apply(f)

a    1.879032
b    1.044393
c    2.315921
d    1.879112
dtype: float64

In [52]:
frame

Unnamed: 0,a,b,c,d
Bihar,-0.429453,-0.114281,-2.002858,1.374004
Mizoram,-0.827583,-0.565632,0.313063,-0.425758
Chandigarh,0.657578,-1.158674,-1.3577,-0.063732
TamilNadu,-1.221454,-0.900075,0.164019,-0.505107


In [57]:
frame.loc[:,['a']].max()

a    0.657578
dtype: float64

In [58]:
frame.loc[:,['a']].min()

a   -1.221454
dtype: float64

In [60]:
frame.loc[:,['a']].max() - frame.loc[:,['a']].min()

a    1.879032
dtype: float64

In [61]:
# similarly, we can use the 'apply' method with axis='columns' to apply the function once per row

In [62]:
frame

Unnamed: 0,a,b,c,d
Bihar,-0.429453,-0.114281,-2.002858,1.374004
Mizoram,-0.827583,-0.565632,0.313063,-0.425758
Chandigarh,0.657578,-1.158674,-1.3577,-0.063732
TamilNadu,-1.221454,-0.900075,0.164019,-0.505107


In [65]:
display(frame.apply(f))
display(frame.apply(f, axis='columns'))

a    1.879032
b    1.044393
c    2.315921
d    1.879112
dtype: float64

Bihar         3.376863
Mizoram       1.140646
Chandigarh    2.015278
TamilNadu     1.385474
dtype: float64

In [66]:
# The function passed to apply need not return a scalar value; it can also return a Series with multiple values:

In [67]:
frame

Unnamed: 0,a,b,c,d
Bihar,-0.429453,-0.114281,-2.002858,1.374004
Mizoram,-0.827583,-0.565632,0.313063,-0.425758
Chandigarh,0.657578,-1.158674,-1.3577,-0.063732
TamilNadu,-1.221454,-0.900075,0.164019,-0.505107


In [71]:
frame.apply(lambda x: Series([x.min(), x.max(), x.mean(), x.std()], index=['min', 'max', 'avg', 'std']) )

Unnamed: 0,a,b,c,d
min,-1.221454,-1.158674,-2.002858,-0.505107
max,0.657578,-0.114281,0.313063,1.374004
avg,-0.455228,-0.684666,-0.720869,0.094852
std,0.80927,0.451144,1.140335,0.874141


In [73]:
# when we want to apply a function element-wise, the 'applymap' method comes to our rescue

In [74]:
frame

Unnamed: 0,a,b,c,d
Bihar,-0.429453,-0.114281,-2.002858,1.374004
Mizoram,-0.827583,-0.565632,0.313063,-0.425758
Chandigarh,0.657578,-1.158674,-1.3577,-0.063732
TamilNadu,-1.221454,-0.900075,0.164019,-0.505107


In [78]:
frame.applymap(lambda x: abs(float('%.2f'%x)))

Unnamed: 0,a,b,c,d
Bihar,0.43,0.11,2.0,1.37
Mizoram,0.83,0.57,0.31,0.43
Chandigarh,0.66,1.16,1.36,0.06
TamilNadu,1.22,0.9,0.16,0.51


In [79]:
#like we have map method for iterables (list), similarly we can use this method on Series but NOT on dataframe

In [80]:
frame.loc['Bihar']

a   -0.429453
b   -0.114281
c   -2.002858
d    1.374004
Name: Bihar, dtype: float64

In [81]:
frame.loc['Bihar'].map(lambda x: abs(float('%.2f'%x)))

a    0.43
b    0.11
c    2.00
d    1.37
Name: Bihar, dtype: float64

In [90]:
# sorting and ranking

# what we are going to study is, sorting the series, frames based on the index or values
# for ranking, we will rank the values using the different tie-breaking methods

In [94]:
ser = Series(genertes(6), index=[4,1,3,5,2,6])

In [95]:
ser

4                     Haryana
1             JammuandKashmir
3                       Assam
5                      Punjab
2    AndamanandNicobarIslands
6                   Meghalaya
dtype: object

In [96]:
ser.sort_index()

1             JammuandKashmir
2    AndamanandNicobarIslands
3                       Assam
4                     Haryana
5                      Punjab
6                   Meghalaya
dtype: object

In [98]:
ser.sort_index(ascending=False)

6                   Meghalaya
5                      Punjab
4                     Haryana
3                       Assam
2    AndamanandNicobarIslands
1             JammuandKashmir
dtype: object

In [99]:
# lets sort based on values now

In [100]:
ser

4                     Haryana
1             JammuandKashmir
3                       Assam
5                      Punjab
2    AndamanandNicobarIslands
6                   Meghalaya
dtype: object

In [101]:
ser.sort_values()

2    AndamanandNicobarIslands
3                       Assam
4                     Haryana
1             JammuandKashmir
6                   Meghalaya
5                      Punjab
dtype: object

In [102]:
ser.sort_values(ascending=False)

5                      Punjab
6                   Meghalaya
1             JammuandKashmir
4                     Haryana
3                       Assam
2    AndamanandNicobarIslands
dtype: object

In [103]:
# lets do it on dataframes now

In [105]:
frame = DataFrame(np.arange(16).reshape(4,4)
                 ,columns=list('adbc')
                 ,index= genertes(4))

In [106]:
frame

Unnamed: 0,a,d,b,c
Telangana,0,1,2,3
Maharashtra,4,5,6,7
AndhraPradesh,8,9,10,11
Rajasthan,12,13,14,15


In [107]:
frame.sort_index()

Unnamed: 0,a,d,b,c
AndhraPradesh,8,9,10,11
Maharashtra,4,5,6,7
Rajasthan,12,13,14,15
Telangana,0,1,2,3


In [113]:
frame.sort_index(axis='columns').sort_index()   # done both here-- first ordred based on columns then on rows

Unnamed: 0,a,b,c,d
AndhraPradesh,8,10,11,9
Maharashtra,4,6,7,5
Rajasthan,12,14,15,13
Telangana,0,2,3,1


In [114]:
# Any missing values are sorted to the end of the Series by default:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])

In [116]:
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [110]:
frame.sort_values(by='a')  # its like Order by in sql

Unnamed: 0,a,d,b,c
Telangana,0,1,2,3
Maharashtra,4,5,6,7
AndhraPradesh,8,9,10,11
Rajasthan,12,13,14,15


In [111]:
frame.sort_values(by=['d','c'])

Unnamed: 0,a,d,b,c
Telangana,0,1,2,3
Maharashtra,4,5,6,7
AndhraPradesh,8,9,10,11
Rajasthan,12,13,14,15


Ranking starts here

The rank methods for Series and DataFrame are the place to look; by default rank breaks ties by assigning each group the mean rank:



In [82]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])

In [85]:
display(obj)
display(obj.rank())
display(obj.rank(method='first'))

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [89]:
display(obj.rank( method='max'))
display(obj.rank(ascending=False, method='max'))
display(obj.rank(ascending=False, method='first'))

0    7.0
1    1.0
2    7.0
3    5.0
4    3.0
5    2.0
6    5.0
dtype: float64

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

0    1.0
1    7.0
2    2.0
3    3.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [117]:
tieBreakingMethdsWithRank=pd.read_clipboard()

In [120]:
# %load '../randomss/to_display_full_columns.py'
def full_len():
    """Set pandas' ``display.max_colwidth`` option to ``None`` so long cell text is shown in full."""
    pd.options.display.max_colwidth = None
full_len()


In [121]:
tieBreakingMethdsWithRank

Unnamed: 0,Method,Description
0,'average',Default: assign the average rank to each entry in the equal group
1,'min',Use the minimum rank for the whole group
2,'max',Use the maximum rank for the whole group
3,'first',Assign ranks in the order the values appear in the data
4,'dense',"Like method='min', but ranks always increase by 1 in between groups rather than the number of equal elements in a group"


In [None]:
# Persist the table as CSV, as done for the other reference tables in this notebook;
# to_clipboard() does not take a file name (its first argument is the `excel` flag).
tieBreakingMethdsWithRank.to_csv('tieBreakingMethdsWithRank.csv')