# Pandas

- It is basically an advanced version of Excel.
- It is a library used for data manipulation and analysis.

In [3]:
# import pandas and numpy
import pandas as pd
import numpy as np

## There are several data types in Pandas : 
- First and the most basic of which is a Pandas Series.
- A Pandas Series is similar to a NumPy array.
- A Pandas Series, unlike a NumPy array, can be accessed using its axis labels and not only the integer indices.

## Creating a Series
- A series can be created using a List, NumPy Array or even a Python Dictionary
- Doc [here](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html)

In [4]:
# Sample inputs reused by the Series-creation examples that follow
my_list = list(range(1, 4))
labels = list('abc')
array = np.array((10, 20, 30))
d = dict(a=1, b=2, c=3)

In [6]:
# Build a Series straight from a plain Python list
# (no index given, so the output shows the default 0..n-1 labels)
series_using_list = pd.Series(my_list)
series_using_list

0    1
1    2
2    3
dtype: int64

In [7]:
# Same list, but this time each value gets its own label from `labels`
series_with_labels = pd.Series(my_list, index=labels)
series_with_labels

a    1
b    2
c    3
dtype: int64

In [9]:
# A NumPy array works as the data source as well
series2 = pd.Series(array)
series2

0    10
1    20
2    30
dtype: int64

In [13]:
# NumPy array as data, with explicit labels
indices = ['A', 'B', 'C']

# Here the labels are spelled out inline rather than taken from `indices`
series3 = pd.Series(array, index=list('ABC'))
print('Series 3 : ')
series3

Series 3 : 


A    10
B    20
C    30
dtype: int64

In [14]:
# Same Series as before, this time passing the `indices` list as the index
print('Series 4 : ')
series4 = pd.Series(array, indices)
series4

Series 4 : 


A    10
B    20
C    30
dtype: int64

In [15]:
# From a dictionary: the keys become the index labels, the values the data
series5 = pd.Series(d)
print('Series 5 : ')
series5

Series 5 : 


a    1
b    2
c    3
dtype: int64

## Note : 
- A Pandas Series can hold a variety of Python Objects

In [16]:
# Index labels do not have to be strings and values do not have to be numbers:
# here the index is made of integers and the data of strings (dtype: object).
# A dedicated name is used for the integer labels so that the string `labels`
# list defined earlier is not overwritten; re-running the earlier cells then
# still produces the same output.
int_labels = [1, 2, 3]
vals = ['A', 'B', 'C']

series_exp = pd.Series(data=vals, index=int_labels)
series_exp

1    A
2    B
3    C
dtype: object

In [19]:
def func1():
    """Return the constant 1; serves as a sample user-defined function."""
    return 1

In [20]:
# A Series can even hold function objects (built-ins and our own func1)
callables = [sum, print, len, func1]
series_with_functions = pd.Series(callables)
series_with_functions

0            <built-in function sum>
1          <built-in function print>
2            <built-in function len>
3    <function func1 at 0x12a307ec0>
dtype: object

## Using information stored in a Pandas Series

In [21]:
# First labelled Series: integers indexed by fruit names
ser1 = pd.Series({'Apple': 1, 'Banana': 2, 'Mango': 3, 'Orange': 4})
ser1

Apple     1
Banana    2
Mango     3
Orange    4
dtype: int64

In [22]:
# Second Series with the same fruit labels but different values
ser2 = pd.Series({'Apple': 1, 'Banana': 6, 'Mango': 7, 'Orange': 8})
ser2

Apple     1
Banana    6
Mango     7
Orange    8
dtype: int64

In [24]:
# Add the two Series; values that share a label are summed
ser = ser1.add(ser2)

In [25]:
# Show the result of ser1 + ser2
ser

Apple      2
Banana     8
Mango     10
Orange    12
dtype: int64

In [26]:
# Look up a single value by its index label
ser['Apple']

2

In [27]:
# Same label-based lookup for another entry
ser['Banana']

8

## DataFrames

- A very simple definition of a Pandas DataFrame is that it is a bunch of Pandas Series put together and they share the same index.
- Doc [here](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html)

In [28]:
# randn supplies the random values for the example DataFrame created next
from numpy.random import randn
# Seed NumPy's global random generator so the numbers are the same on every run
np.random.seed(42)

In [29]:
# Example DataFrame: 5 rows labelled A-E, 4 columns labelled W-Z,
# filled with random values from randn
row_labels = list('ABCDE')
column_labels = list('WXYZ')
df = pd.DataFrame(data=randn(5, 4), index=row_labels, columns=column_labels)
df

Unnamed: 0,W,X,Y,Z
A,0.496714,-0.138264,0.647689,1.52303
B,-0.234153,-0.234137,1.579213,0.767435
C,-0.469474,0.54256,-0.463418,-0.46573
D,0.241962,-1.91328,-1.724918,-0.562288
E,-1.012831,0.314247,-0.908024,-1.412304


In [30]:
# Grabbing a Series from a DataFrame:
# indexing with a single column name returns that column as a Series
df['W']

A    0.496714
B   -0.234153
C   -0.469474
D    0.241962
E   -1.012831
Name: W, dtype: float64

In [31]:
# Grabbing multiple columns (the WRONG way, shown on purpose).
# df['W', 'Z'] is looked up as the single tuple key ('W', 'Z'), which is not
# a column, so pandas raises KeyError. The error is caught and printed so the
# notebook still runs top to bottom with "Restart & Run All".
try:
    df['W', 'Z']
except KeyError as err:
    print(f'KeyError: {err}')

KeyError: ('W', 'Z')

### Note:
- When we have to grab multiple columns, we pass them as a list. Otherwise pandas interprets `'W', 'Z'` as a single tuple key `('W', 'Z')`, finds no such column, and raises a `KeyError`.

In [33]:
# Correct way: pass a list of column names (note the double brackets);
# the result is a DataFrame containing just those columns
df[['W', 'Z']]

Unnamed: 0,W,Z
A,0.496714,1.52303
B,-0.234153,0.767435
C,-0.469474,-0.46573
D,0.241962,-0.562288
E,-1.012831,-1.412304


In [35]:
# Attribute-style (dot) access to a column, reminiscent of SQL's table.column notation.
# Not used as much in practice; stick to the key syntax df['W'].
df.W

A    0.496714
B   -0.234153
C   -0.469474
D    0.241962
E   -1.012831
Name: W, dtype: float64

In [36]:
# Confirm that a single column taken from a DataFrame is a pandas Series
type(df['W'])

pandas.core.series.Series

In [37]:
# Creating new columns:
# assigning to a column name that does not exist yet adds that column.
# Here V holds the row-wise difference W - Z.
df['V'] = df['W'].sub(df['Z'])
df

Unnamed: 0,W,X,Y,Z,V
A,0.496714,-0.138264,0.647689,1.52303,-1.026316
B,-0.234153,-0.234137,1.579213,0.767435,-1.001588
C,-0.469474,0.54256,-0.463418,-0.46573,-0.003745
D,0.241962,-1.91328,-1.724918,-0.562288,0.80425
E,-1.012831,0.314247,-0.908024,-1.412304,0.399473


### Dropping columns / rows
- Doc [here](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html)

In [41]:
# Removing columns
# drop(columns='V') is the keyword spelling of drop('V', axis=1)
#   axis = 0 -> labels refer to the index (rows)
#   axis = 1 -> labels refer to the columns
# drop returns a new DataFrame, which is stored in `copy`
copy = df.drop(columns='V')
copy

Unnamed: 0,W,X,Y,Z
A,0.496714,-0.138264,0.647689,1.52303
B,-0.234153,-0.234137,1.579213,0.767435
C,-0.469474,0.54256,-0.463418,-0.46573
D,0.241962,-1.91328,-1.724918,-0.562288
E,-1.012831,0.314247,-0.908024,-1.412304


In [42]:
# The value returned by drop() is itself a DataFrame
print(f'Copy is a {type(copy)}')

Copy is a <class 'pandas.core.frame.DataFrame'>


In [43]:
# df itself still has column V: the result of drop() was stored in `copy`,
# and df was left unmodified
df

Unnamed: 0,W,X,Y,Z,V
A,0.496714,-0.138264,0.647689,1.52303,-1.026316
B,-0.234153,-0.234137,1.579213,0.767435,-1.001588
C,-0.469474,0.54256,-0.463418,-0.46573,-0.003745
D,0.241962,-1.91328,-1.724918,-0.562288,0.80425
E,-1.012831,0.314247,-0.908024,-1.412304,0.399473


In [44]:
# using inplace = True: the column is removed from df itself and nothing is returned.
# errors='ignore' makes the cell safe to re-run: once V is gone, a second
# execution would otherwise raise KeyError because the label no longer exists.
df.drop('V', axis=1, inplace=True, errors='ignore')

In [45]:
# After the in-place drop, column V is gone from df itself
df

Unnamed: 0,W,X,Y,Z
A,0.496714,-0.138264,0.647689,1.52303
B,-0.234153,-0.234137,1.579213,0.767435
C,-0.469474,0.54256,-0.463418,-0.46573
D,0.241962,-1.91328,-1.724918,-0.562288
E,-1.012831,0.314247,-0.908024,-1.412304


In [46]:
# Dropping rows
# EXERCISE : Drop Row A to C
# index=[...] is the keyword spelling of drop([...], axis=0).
# The result is only displayed, not assigned, so df keeps all of its rows.
df.drop(index=['A', 'B', 'C'])

Unnamed: 0,W,X,Y,Z
D,0.241962,-1.91328,-1.724918,-0.562288
E,-1.012831,0.314247,-0.908024,-1.412304


### loc and iloc
- Doc [here](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html)

In [47]:
# Selecting Rows
# .loc selects by index label; a single row comes back as a Series
# whose index is the column names
df.loc['A']

W    0.496714
X   -0.138264
Y    0.647689
Z    1.523030
Name: A, dtype: float64

In [49]:
# .iloc selects by integer position; position 0 is the same row as label 'A'
df.iloc[0]

W    0.496714
X   -0.138264
Y    0.647689
Z    1.523030
Name: A, dtype: float64

In [52]:
# Show df again as a reference for the subset selections that follow
df

Unnamed: 0,W,X,Y,Z
A,0.496714,-0.138264,0.647689,1.52303
B,-0.234153,-0.234137,1.579213,0.767435
C,-0.469474,0.54256,-0.463418,-0.46573
D,0.241962,-1.91328,-1.724918,-0.562288
E,-1.012831,0.314247,-0.908024,-1.412304


In [50]:
# Selecting subsets
# .loc[row_label, column_label] returns the single value at that row and column
df.loc['B', 'Y']

1.5792128155073915

In [51]:
# Lists of row labels and column labels select a smaller DataFrame
df.loc[['A', 'B'], ['W', 'Y']]

Unnamed: 0,W,Y
A,0.496714,0.647689
B,-0.234153,1.579213
