In [1]:
!pip install pandera

Collecting pandera
  Downloading pandera-0.18.0-py3-none-any.whl (209 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m204.8/209.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.0/209.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multimethod (from pandera)
  Downloading multimethod-1.10-py3-none-any.whl (9.9 kB)
Collecting typeguard>=3.0.2 (from pandera)
  Downloading typeguard-4.1.5-py3-none-any.whl (34 kB)
Collecting typing-inspect>=0.6.0 (from pandera)
  Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting typing-extensions>=4.7.0 (from typeguard>=3.0.2->pandera)
  Downloading typing_extensions-4.9.0-py3-none-any.whl (32 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect>=0.6.0->pandera)
  Downloading mypy_extensions-1.0.0-py3-none-a

# **Pandas**
Pandas contains data structures and data manipulation tools designed to make data cleaning and analysis fast and convenient in Python. There are two basic types in pandas
- Series
- Dataframe

In [2]:
import numpy as np
import pandas as pd
import pandera as pa
from typing import List, Dict, Tuple, Union

## Series
A **series** is one dimensional array-like object containing a sequence of values of the same type and an associated array of data label called *index*

In [None]:
ser = pd.Series([1, 2, 3, 4, 5]) #creating a series
schema = pa.SeriesSchema(pa.Int) #validating the input type of the series.

schema(ser)#applying the schema. If any data other than int is entered, the
            #schema will raise error

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [None]:
ser = pd.Series(['Dog', 'Cat', 'Cow', 'Goat']) #creating a series
schema = pa.SeriesSchema(pa.String) #creating schema. Only string entries allowed
schema(ser) #applying schema on the series

0     Dog
1     Cat
2     Cow
3    Goat
dtype: object

In [None]:
ser = pd.Series([1, 'apple', 2, 'mango', 3, 'strawberry', True]) #creating a series
schema = pa.SeriesSchema(pa.Object) #will now allow mix entries to the series
schema(ser)

0             1
1         apple
2             2
3         mango
4             3
5    strawberry
6          True
dtype: object

In [None]:
ser = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
schema = pa.SeriesSchema(pa.Float64);
schema(ser)

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
dtype: float64

In [None]:
# We can also get array representation of the series
ser.array

<PandasArray>
[1.0, 2.0, 3.0, 4.0, 5.0]
Length: 5, dtype: float64

We can also declare indexes of the series

In [None]:
#We can also specify index of the series
ser = pd.Series({"a":10,
                 "b":20,
                 "d":30,
                 "e":40,
                 "f":50,
                 })
schema = pa.SeriesSchema(pa.Int, index=pa.Index(pa.String))
          #schema for both indexes and entries

# Validate the Series against the schema
schema(ser)


a    10
b    20
d    30
e    40
f    50
dtype: int64

Data can be passed as a dictionary

In [None]:
ser = pd.Series({1:'Dog',
                 3: 'Cat',
                 4: 'Cow',
                 2: 'Hen',
                 5: 'Goat',
                 })
schema = pa.SeriesSchema(pa.String, index=pa.Index(pa.Int))
schema(ser)

1     Dog
3     Cat
4     Cow
2     Hen
5    Goat
dtype: object

The data and can be passed as list

In [None]:
#another way to specify index
from typing import List

index: List[str] = ['a', 'd', 'b', 'e', 'c']
fruits: List[str] = ['mango', 'strawberry', 'pineapple', 'avocado', 'apple']

ser = pd.Series(fruits, index=index)
schema = pa.SeriesSchema(pa.String, index=pa.Index(pa.String))
schema(ser)


a         mango
d    strawberry
b     pineapple
e       avocado
c         apple
dtype: object

The index and data can also be passed using arrays

In [None]:
#can be done with array
index: np.ndarray = np.array(['a', 'd', 'b', 'e', 'c'])
fruits: np.ndarray = np.array(['mango', 'strawberry', 'pineapple', 'avocado', 'apple'])
ser = pd.Series(fruits, index=index)
schema = pa.SeriesSchema(pa.String, index=pa.Index(pa.String))
schema(ser)

a         mango
d    strawberry
b     pineapple
e       avocado
c         apple
dtype: object

In [None]:
display(ser['d'])
display(ser[['d', 'b', 'e']])

'strawberry'

d    strawberry
b     pineapple
e       avocado
dtype: object

We can have multiple indexes

In [None]:
from typing import Union
index: Union[List[int],List[str]] =[[1, 1, 1, 2, 2],['a', 'd', 'b', 'e', 'c']]
fruits: List[str] = ['mango', 'strawberry', 'pineapple', 'avocado', 'apple']

ser = pd.Series(fruits, index=pd.MultiIndex.from_arrays(index))

# Define a schema for the Series
schema = pa.SeriesSchema(pa.String, index=pa.MultiIndex([pa.Index(pa.Int), pa.Index(pa.String)]))

# Validate the Series against the schema
schema(ser)

1  a         mango
   d    strawberry
   b     pineapple
2  e       avocado
   c         apple
dtype: object

In [None]:
index: Union[List[str], List[int]] = [['BS', 'BS', 'MS', 'MS', 'BS'],[1, 2, 3, 4, 5]];
students : List[str] = ['Sam', 'Eliza', 'Jack', 'Will', 'Mary']

ser = pd.Series(students, index=pd.MultiIndex.from_arrays(index))
schema = pa.SeriesSchema(pa.String, index=pa.MultiIndex([pa.Index(pa.String),pa.Index(pa.Int)]))
schema(ser)

BS  1      Sam
    2    Eliza
MS  3     Jack
    4     Will
BS  5     Mary
dtype: object

We can specify name of the series and a name for the index


In [None]:
ser = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0], name='Float Series')
schema = pa.SeriesSchema(pa.Float64);
ser.index.name='Index position'
schema(ser)

Index position
0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
Name: Float Series, dtype: float64

We can convert the index of the series

Series can be converted to dictionary

In [None]:
ser.index = ['Bob', 'Steve', 'Jeff', 'Ryan', "Alex"]
schema(ser)

Bob      1.0
Steve    2.0
Jeff     3.0
Ryan     4.0
Alex     5.0
Name: Float Series, dtype: float64

In [None]:
ser.to_dict()#convert series to dictionary

{0: 1.0, 1: 2.0, 2: 3.0, 3: 4.0, 4: 5.0}

Data may have **nul** or **isna** numbers that need to be removed or replaced
with some number


In [None]:
ser = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0], name='Float Series')
schema = pa.SeriesSchema(pa.Float64);
schema(ser)
pd.isna(schema(ser))#check if any of the entries is not a number
pd.notna(schema(ser)) #checks if any of the entries is not a none number

0    True
1    True
2    True
3    True
4    True
Name: Float Series, dtype: bool

## DataFrame
A dataframe represents a rectangular table of data and contains an ordered, named collection of columns, each of which can be of different value type.

There are different way to create dataframe. The most common way is to use dictionary

In [None]:

data: Dict[str, List[Union[str, int]]] = {'name': ['Alice', 'Bob', 'Charlie', 'David'],
        'age': [25, 30, 22, 35]}

df = pd.DataFrame(data)

# Define a Pandera schema for the DataFrame
schema = pa.DataFrameSchema({
    'name': pa.Column(pa.String),#the first column is name with string values
    'age': pa.Column(pa.Int) #the second column is age with int values
})

# Validate the DataFrame against the schema
schema.validate(df)

Unnamed: 0,name,age
0,Alice,25
1,Bob,30
2,Charlie,22
3,David,35


In [None]:
data : Dict[str, List[Union[str, int, float]]] = {
    "state": ["Ohio", "Ohion", "Ohio", "Nevada", "Nevada", "Nevada"],
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
}

data_frame = pd.DataFrame(data)

schema = pa.DataFrameSchema({
    'state': pa.Column(pa.String),
    'year': pa.Column(pa.Int),
    'pop': pa.Column(pa.Float)
})

schema.validate(data_frame)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohion,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


You can select to display only top 5 entries by using **pd.head()** command

In [None]:
data_frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohion,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


To check the column of the dataframe, you can use **dataframe.columns**

In [None]:
data_frame.columns

Index(['state', 'year', 'pop'], dtype='object')

A column of the dataframe can be accessed using **dataframe['column_name']** or **dataframe.column_name**

In [None]:
data_frame['pop']

0    1.5
1    1.7
2    3.6
3    2.4
4    2.9
5    3.2
Name: pop, dtype: float64

In [None]:
data_frame.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

New column can be added to the dataframe using **dataframe['column_name'] = [data]**

In [None]:
data_frame['debt'] = [False, True, True, False, False, True]
data_frame

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,False
1,Ohion,2001,1.7,True
2,Ohio,2002,3.6,True
3,Nevada,2001,2.4,False
4,Nevada,2002,2.9,False
5,Nevada,2003,3.2,True


Values of a column can also be changed

In [None]:
data_frame['debt'] = np.arange(6)
data_frame

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,0
1,Ohion,2001,1.7,1
2,Ohio,2002,3.6,2
3,Nevada,2001,2.4,3
4,Nevada,2002,2.9,4
5,Nevada,2003,3.2,5


A column can be removed using **del dataframe['column_name']**

In [None]:
del data_frame['debt']
data_frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohion,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


Index of dataframe can be changed by passing the same length list as new index

In [None]:
new_index: List[str] = ['a', 'b', 'c', 'd', 'e', 'f']

# Set the new custom index
data_frame.index = new_index

# Print the DataFrame with the new index
data_frame

Unnamed: 0,state,year,pop
a,Ohio,2000,1.5
b,Ohion,2001,1.7
c,Ohio,2002,3.6
d,Nevada,2001,2.4
e,Nevada,2002,2.9
f,Nevada,2003,3.2


## Essential Functionalities

Indexes are generally immutable. If a new index is created with no corresponding value, NaN will appear for the data entry. Index can be altered using **reindex** method, which is applicable for both Series and Dataframe

In [None]:
new_index : List[str] = ['a', 'b', 'g', 'c', 'd', 'e', 'f']
data_frame = data_frame.reindex(index=new_index)
data_frame

Unnamed: 0,state,year,pop
a,Ohio,2000.0,1.5
b,Ohion,2001.0,1.7
g,,,
c,Ohio,2002.0,3.6
d,Nevada,2001.0,2.4
e,Nevada,2002.0,2.9
f,Nevada,2003.0,3.2


Columns can be reindexed with the **column** keyword

In [None]:
states : List[str] = ['Ohio', 'Utah', 'California']
data_frame.reindex(columns = states)
data_frame

Unnamed: 0,state,year,pop
a,Ohio,2000.0,1.5
b,Ohion,2001.0,1.7
g,,,
c,Ohio,2002.0,3.6
d,Nevada,2001.0,2.4
e,Nevada,2002.0,2.9
f,Nevada,2003.0,3.2


### Dropping Entries from an Axis
Entries can be dropped from an axis using **.drop()** method

In [None]:
ser = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
schema = pa.SeriesSchema(pa.Float)

ser = ser.drop('c') #removing the entry 'c'
schema(ser)

ser = ser.drop(['a', 'e'])
schema(ser)

b    1.0
d    3.0
dtype: float64

With dataframe, index values can be deleted from either axis.

In [None]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=["Ohio", "Colorado", "Utah", "New York"],
                    columns=["one", "two", "three", "four"])
schema = pa.DataFrameSchema({
    'one':pa.Column(pa.Int),
    'two':pa.Column(pa.Int),
    'three':pa.Column(pa.Int),
    'four':pa.Column(pa.Int),
})
display(schema(data))

data = data.drop(index=['Ohio', 'New York']) #removing the index Ohio and New York
schema(data)

data = data.drop(columns=['three']) #removing the column three
#schema(data) #this will give error because the column mentioned is removed
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


Unnamed: 0,one,two,four
Colorado,4,5,7
Utah,8,9,11


## Indexing, Selection, and Filtering

In [11]:
ser = pd.Series(np.arange(4.), index=["a", "b", "c", "d"])
schema = pa.SeriesSchema(pa.Float)
schema(ser)

print("Value at index c: ", ser['c'])
print("Value at index 1: ", ser[1])
display("Value at index 0 and 1", ser[0:2])
display("Value at index a, b, and c", ser[["a", "b", "c"]])

Value at index c:  2.0
Value at index 1:  1.0


'Value at index 0 and 1'

a    0.0
b    1.0
dtype: float64

'Value at index a, b, and c'

a    0.0
b    1.0
c    2.0
dtype: float64

There are many way to select data but the preferred way to select index value is **loc**. In Pandas, **loc** accessor is used to access a group of rows and columns by labels or boolean array. It allows you to select data based on the labels of rows and columns.

In [18]:
#loc operator indexes exclusivley with labels

print("Value at index c: ", ser.loc['c'])
display("Value at index a and b: ", ser.loc[['a', 'b']])
# print("Value at index 0 and 1", ser.loc[0:2])# this will give error because index
#doesnt contain integers

display("Value and index c and d: ", ser.loc['c': "d"])

Value at index c:  2.0


'Value at index a and b: '

a    0.0
b    1.0
dtype: float64

'Value and index c and d: '

c    2.0
d    3.0
dtype: float64

In Pandas, the **iloc** indexer is used to select data in Pandas by integer location-based indexing. <br/>
The difference between **loc** and **iloc** is:
- *iloc* uses integer-based indexing while *loc* use label-based indexing
- *iloc* is exclusive on the right end of the range whereas *loc* is inclusive on both ends
- *iloc* is useful when you want to access data based on its numerical position, especially whem the index is not integers
- *loc* is useful when you want to access data based on labels and when you are working with a labeled index

In [20]:
print("Value at index 0: ", ser.iloc[0])
display("Values at index 0 and 1: ", ser.iloc[0:2])

Value at index 0:  0.0


'Values at index 0 and 1: '

a    0.0
b    1.0
dtype: float64

In case of dataframe

In [28]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=["Ohio", "Colorado", "Utah", "New York"],
                    columns=["one", "two", "three", "four"])
# schema = pa.DataFrameSchema({
#     'one':pa.Column(pa.Int),
#     'two':pa.Column(pa.Int),
#     'three':pa.Column(pa.Int),
#     'four':pa.Column(pa.Int),
# })

schema = pa.DataFrameSchema({
    'one' : pa.Column(pa.Int),
    'two' : pa.Column(pa.Int),
    'three' : pa.Column(pa.Int),
    'four' : pa.Column(pa.Int)
})
display(schema(data))
print('----------------------')
display('Data in row Ohio: ', data.loc['Ohio'])
display('Data in row Ohio and Colorado: ', data.loc[['Ohio', 'Colorado']])
display('Data in row Ohio and column one and two', data.loc["Ohio", ["two", "three"]])

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


----------------------


'Data in row Ohio: '

one      0
two      1
three    2
four     3
Name: Ohio, dtype: int64

'Data in column Ohio and Colorado: '

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
