Today we are going to learn about Pandas.
Pandas contains data structures and data manipulation tools designed for the purpose of cleaning data and analysing data fast.
Pandas is used along with NumPy to visualize or manipulate the data

Lets begin

In [1]:
import pandas as pd # Importing pandas

In [2]:
from pandas import Series,DataFrame # Or Import frequently used libraries 

There are two types of Data Structure in Pandas namely,
Series and DataFrame

In [3]:
# Series is a 1D array-like object
obj = pd.Series([1,2,3,4])

In [4]:
obj

0    1
1    2
2    3
3    4
dtype: int64

In [5]:
obj = pd.Series([1,2,3,4], index = ['a','b','c','d'])

In [6]:
obj

a    1
b    2
c    3
d    4
dtype: int64

In [7]:
obj.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [8]:
obj.values

array([1, 2, 3, 4], dtype=int64)

In [9]:
obj['a']

1

In [10]:
obj[['a','b']]

a    1
b    2
dtype: int64

In [11]:
# You can assign condition 
obj[obj>2]

c    3
d    4
dtype: int64

In [12]:
# Or do mathematical Operation
obj/2

a    0.5
b    1.0
c    1.5
d    2.0
dtype: float64

In [13]:
import numpy as np
np.exp(obj)

a     2.718282
b     7.389056
c    20.085537
d    54.598150
dtype: float64

In [14]:
'b' in obj

True

In [15]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [16]:
obj = pd.Series(sdata) # This passes dict to Data Frame

In [17]:
obj

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [18]:
# You can override the dict key for a data structure if its value exists
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj1 = pd.Series(sdata, index=states)

In [19]:
obj

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

Three values found in sdata were placed in the appropriate locations, but since no value for 'California' was found, it appears as NaN (not a number), which is considered
in pandas to mark missing or NA values. Since 'Utah' was not included in states, it is excluded from the resulting object.

In [20]:
# To check the missing values
pd.isnull(obj)

Ohio      False
Texas     False
Oregon    False
Utah      False
dtype: bool

In [21]:
pd.notnull(obj)

Ohio      True
Texas     True
Oregon    True
Utah      True
dtype: bool

In [22]:
# Performing arithmetic operations
# It automatically aligns by indexing the label
obj +obj1

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [23]:
# Assigning Titles and Index Names
obj1.name = 'Population'
obj1.index.name = 'state'

In [24]:
obj1

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: Population, dtype: float64

In [25]:
objec = pd.Series([1,2,3,4])

In [26]:
objec.index = ['Bob','Steve','You','Me']

In [27]:
objec

Bob      1
Steve    2
You      3
Me       4
dtype: int64

## Now lets explore DataFrame
DataFrame represents rectangular table of data which contains an ordered collection of columns, each of which can be a different value type like a dict
`Always pass a column contained in a dict`

In [28]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
'year': [2000, 2001, 2002, 2001, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

In [29]:
frame = pd.DataFrame(data)

In [30]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [31]:
frame.head() # Returns 5 value

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [32]:
frame.tail() # Returns last 5

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [33]:
# You can specify a sequence of columns also
pd.DataFrame(data,columns = ['pop','year','state'])

Unnamed: 0,pop,year,state
0,1.5,2000,Ohio
1,1.7,2001,Ohio
2,3.6,2002,Ohio
3,2.4,2001,Nevada
4,2.9,2002,Nevada
5,3.2,2003,Nevada


In [34]:
frame.columns

Index(['state', 'year', 'pop'], dtype='object')

In [35]:
frame['state']

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [36]:
frame.loc[1] # 'loc' retrieves postion of an element in a row 

state    Ohio
year     2001
pop       1.7
Name: 1, dtype: object

In [37]:
frame['new_val'] = np.arange(6)

In [38]:
frame

Unnamed: 0,state,year,pop,new_val
0,Ohio,2000,1.5,0
1,Ohio,2001,1.7,1
2,Ohio,2002,3.6,2
3,Nevada,2001,2.4,3
4,Nevada,2002,2.9,4
5,Nevada,2003,3.2,5


In [39]:
val = pd.Series([-1.2, -1.5, -1.7], index=[2, 4, 5])

In [40]:
frame['new_val'] = val # Realigns accordingly but missing values will be NaN

In [41]:
frame

Unnamed: 0,state,year,pop,new_val
0,Ohio,2000,1.5,
1,Ohio,2001,1.7,
2,Ohio,2002,3.6,-1.2
3,Nevada,2001,2.4,
4,Nevada,2002,2.9,-1.5
5,Nevada,2003,3.2,-1.7


In [42]:
del frame['new_val']

In [43]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [44]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [45]:
new = pd.Series(pop)

In [46]:
new

Nevada               {2001: 2.4, 2002: 2.9}
Ohio      {2000: 1.5, 2001: 1.7, 2002: 3.6}
dtype: object

In [47]:
new = pd.DataFrame(new) 

In [48]:
new # Bad representation of Data

Unnamed: 0,0
Nevada,"{2001: 2.4, 2002: 2.9}"
Ohio,"{2000: 1.5, 2001: 1.7, 2002: 3.6}"


In [49]:
new_ = pd.DataFrame(pop)

In [50]:
new_

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [51]:
new_.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [52]:
pd.DataFrame(pop,index = [2001,2002,2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [53]:
new_.index.name = 'Year';new_.columns.name = 'State'

In [54]:
new_

State,Nevada,Ohio
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


![image.png](attachment:image.png)

In [55]:
obj = pd.Series(range(3), index =['a','b','c'])
index = obj.index

In [56]:
index

Index(['a', 'b', 'c'], dtype='object')

In [57]:
index[1:]

Index(['b', 'c'], dtype='object')

In [58]:
obj = pd.Series(['blue','purple','yellow'],index = [0,2,4])

In [59]:
obj

0      blue
2    purple
4    yellow
dtype: object

In [60]:
obj.reindex(range(10),method = 'bfill') # Fills missing values with NaN

0      blue
1    purple
2    purple
3    yellow
4    yellow
5       NaN
6       NaN
7       NaN
8       NaN
9       NaN
dtype: object

In [61]:
obj.reindex(range(6), method='ffill') # Which forward-fills the values

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [62]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])

In [63]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [64]:
frame = frame.reindex(['a', 'b', 'c', 'd'])

In [65]:
frame

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [66]:
states =['Texas','Utah','California']
f = frame.reindex(columns= states)

In [67]:
f

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


In [68]:
frame # You can see that bothe elements or changes are unaffected

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


![image.png](attachment:image.png)

## Dropping Entries 

In [69]:
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [70]:
new_obj = obj.drop('c')

In [71]:
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [72]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

In [73]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [74]:
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [75]:
data.drop(['two', 'four'], axis='columns')

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [76]:
# Indexing
data['Ohio':'Utah']

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11


In [77]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

In [78]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [79]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [80]:
data[['three','one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [81]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [82]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [83]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [84]:
# Assigning values
data[data< 5] = 0

In [85]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [86]:
# Selection using loc and iloc
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int32

In [87]:
data.iloc[2, [3, 0, 1]] # Same process but with integers

four    11
one      8
two      9
Name: Utah, dtype: int32

In [88]:
data.iloc[:, :3][data.three > 0]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


![image.png](attachment:image.png)
![image.png](attachment:image.png)

### Function Application and Mapping

In [89]:
frame1 = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [90]:
frame1

Unnamed: 0,b,d,e
Utah,-0.624564,-0.831541,-0.818088
Ohio,0.013247,0.160569,1.815917
Texas,2.194017,-0.179676,0.807752
Oregon,1.280233,1.502499,-0.915407


In [91]:
x = np.abs(frame1)

In [92]:
m = lambda x: x.max() - x.min()



In [93]:
frame1.apply(m)

b    2.818581
d    2.334040
e    2.731325
dtype: float64

Another frequent operation is applying a function on one-dimensional arrays to each column or row. DataFrame’s apply method does exactly this

In [94]:
# Let create the same in a fuction
def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])

In [95]:
frame.apply(f)

Unnamed: 0,Ohio,Texas,California
min,0.0,1.0,2.0
max,6.0,7.0,8.0


In [96]:
format = lambda x : '%.2f' %x

Suppose you wanted to compute a formatted string from each floating-point value in frame. You can do this with apply map

In [97]:
frame.applymap(format)

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


Map method can also do element-wise function

In [98]:
frame1['e'].map(format)

Utah      -0.82
Ohio       1.82
Texas      0.81
Oregon    -0.92
Name: e, dtype: object

In [99]:
# Sorting and Ranking
obj = pd.Series(range(4), index = ['d','a','b','c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [100]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])

In [101]:
frame.sort_index() # Colomn wise sorting

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [102]:
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [103]:
frame.sort_index(axis=1) # Row wise

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [104]:
frame.sort_index(axis=0) # Colomn wise

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [105]:
obj = pd.Series([4, 7, -3, 2])

In [106]:
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [107]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})

In [108]:
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [109]:
frame.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [110]:
frame.sort_values(by=['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [111]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])

In [112]:
obj.index.is_unique # Finds whether it is unique or not

False

In [113]:
frame.sum() # colomn wise sum

b    10
a     2
dtype: int64

In [114]:
frame.sum(axis = 'columns') # Sums Values on Colomn/ Row Wise

0    4
1    8
2   -3
3    3
dtype: int64

In [115]:
frame.mean(axis = 'columns', skipna =  False)

0    2.0
1    4.0
2   -1.5
3    1.5
dtype: float64

In [116]:
# Lets understand the above skipna with another example
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])

In [117]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [118]:
df.mean(axis = 'columns', skipna = False) 

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [119]:
df.mean(axis ='columns', skipna = True) # NaN is ingnored in this case

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

![image.png](attachment:image.png)

In [120]:
df.idxmax() # First returns max; then min

one    b
two    d
dtype: object

In [121]:
df.idxmin() # Vice versa to idxmax

one    d
two    b
dtype: object

In [122]:
# We can obtain the statistical summary of a data frame
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


## Correlation and Convariance
Lets understand this by seeing an example of a stock prices and volumes obtained from Yahoo !

In [123]:
# For Running this library one must install pandas_datareader
# You can download it by conda install pandas_datareader or if you're in linux environ - pip install pandas_datareader
# If you conda environ does not work then try pip install pandas_datareader implying that you already have pip installed
import pandas_datareader.data as web
all_data = {ticker: web.get_data_yahoo(ticker)
            for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

  from pandas.util.testing import assert_frame_equal


In [124]:
price = pd.DataFrame({ticker: data['Adj Close']
                      for ticker, data in all_data.items()})
volume = pd.DataFrame({ticker: data['Volume']
                       for ticker, data in all_data.items()})

In [125]:
returns = price.pct_change()

In [126]:
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-05-08,0.023802,0.014518,0.005882,0.011519
2020-05-11,0.015735,-0.003252,0.011154,0.010725
2020-05-12,-0.011428,-0.019006,-0.022652,-0.019611
2020-05-13,-0.012074,-0.037668,-0.015122,-0.019197
2020-05-14,0.006143,0.010542,0.004339,0.00504


In [127]:
returns['MSFT'].corr(returns['IBM']) #correlation

0.6004855284577707

In [128]:
returns['MSFT'].cov(returns['IBM']) # convariance

0.00016228809404216508

In [129]:
returns.MSFT.corr(returns.IBM) # Another way

0.6004855284577707

In [130]:
returns.corr()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,1.0,0.532697,0.710988,0.642625
IBM,0.532697,1.0,0.600486,0.530369
MSFT,0.710988,0.600486,1.0,0.750899
GOOG,0.642625,0.530369,0.750899,1.0


In [131]:
returns.cov()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,0.000328,0.000152,0.000222,0.000199
IBM,0.000152,0.000247,0.000162,0.000143
MSFT,0.000222,0.000162,0.000296,0.000221
GOOG,0.000199,0.000143,0.000221,0.000293


In [132]:
# To return correlation with respect to an element 
returns.corrwith(returns.IBM)

AAPL    0.532697
IBM     1.000000
MSFT    0.600486
GOOG    0.530369
dtype: float64

### Unique Values, Value counts 

In [133]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()

In [134]:
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [135]:
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [136]:
pd.value_counts(obj.values, sort=False)

a    3
b    2
d    1
c    3
dtype: int64

In [137]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [138]:
mask = obj.isin(['b','c'])

In [139]:
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [140]:
# Lets do an example to understand apply
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})

In [141]:
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [142]:
# Fill 0's for every value_counts 
result = data.apply(pd.value_counts).fillna(0) 

In [143]:
"""the row labels in the result are the distinct values occurring in all of the columns.
The values are the respective counts of these values in each column"""
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
