In [None]:
# We use 'pandas' for overall data manipulation in this session
# Pandas is a very famous python library for data analysis. We can efficiently manipulate, filter and create
# data using Pandas.
# DataFrames are the backbone of Pandas
# Dataframes are the two dimensional data structures in pandas


In [3]:
# Let's represent a two-dimensional data structure using Python dictionary
people = {
    "name": ["Jon", "Jane", "Ken", "Kevin"],
    "email": ["jon@email.com", "jane@email.com", "ken@email.com", "kevin@email.com"],
    "address": ["KTM", "BKT", "PKR", "LTP"]
}

In [4]:
import pandas as pd
df = pd.DataFrame(people)
df

Unnamed: 0,name,email,address
0,Jon,jon@email.com,KTM
1,Jane,jane@email.com,BKT
2,Ken,ken@email.com,PKR
3,Kevin,kevin@email.com,LTP


In [8]:
df.shape  # (rows, column)

(4, 3)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     4 non-null      object
 1   email    4 non-null      object
 2   address  4 non-null      object
dtypes: object(3)
memory usage: 224.0+ bytes


In [7]:
df.head(2)

Unnamed: 0,name,email,address
0,Jon,jon@email.com,KTM
1,Jane,jane@email.com,BKT


In [None]:
df.tail(2)

Unnamed: 0,name,email,address
2,Ken,ken@email.com,PKR
3,Kevin,kevin@email.com,LTP


In [None]:
df['email']

0      jon@email.com
1     jane@email.com
2      ken@email.com
3    kevin@email.com
Name: email, dtype: object

In [None]:
type(df['email']) # Series
# Series is a single column of a dataframe
# Dataframe is a collection of such series

pandas.core.series.Series

In [None]:
# We can also access the column / series using a dot (.) operator
df.email
# But using '.' is not recommended because it may conflict with the dataframe attributes

0      jon@email.com
1     jane@email.com
2      ken@email.com
3    kevin@email.com
Name: email, dtype: object

In [None]:
r = df[["name", "email"]]
type(r)   # dataframe
r

Unnamed: 0,name,email
0,Jon,jon@email.com
1,Jane,jane@email.com
2,Ken,ken@email.com
3,Kevin,kevin@email.com


In [None]:
df.columns

Index(['name', 'email', 'address'], dtype='object')

In [None]:
df.iloc[0]  # It gives the first row of the dataframe
# iloc stands for integer-location.
# We can give rows and coumns in the iloc
# If we want to provide multiple rows and multiple columns then they should be enclosed in [].

name                 Jon
email      jon@email.com
address              KTM
Name: 0, dtype: object

In [None]:
df.iloc[[0, 1]]

Unnamed: 0,name,email,address
0,Jon,jon@email.com,KTM
1,Jane,jane@email.com,BKT


In [None]:
df.iloc[0, 1]  # here 0 index is for row and 1 index is for column

'jon@email.com'

In [None]:
df.iloc[[0,1], 1]  # here [0, 1] are rows and 1 is for column

0     jon@email.com
1    jane@email.com
Name: email, dtype: object

In [None]:
df.iloc[[0, 1], [0, 1]]  # here [0, 1] are rows and [0, 1] are columns

Unnamed: 0,name,email
0,Jon,jon@email.com
1,Jane,jane@email.com


In [None]:
df.iloc[0:2, 0:2]

Unnamed: 0,name,email
0,Jon,jon@email.com
1,Jane,jane@email.com


In [None]:
# we also have a 'loc' method for a dataframe
df.loc[[0, 1], ["name", "email"]]

Unnamed: 0,name,email
0,Jon,jon@email.com
1,Jane,jane@email.com


In [None]:
df.loc[0:3, "name":"email"]

Unnamed: 0,name,email
0,Jon,jon@email.com
1,Jane,jane@email.com
2,Ken,ken@email.com
3,Kevin,kevin@email.com


In [None]:
# We can also set the index of our rows
df.set_index("email")

Unnamed: 0_level_0,name,address
email,Unnamed: 1_level_1,Unnamed: 2_level_1
jon@email.com,Jon,KTM
jane@email.com,Jane,BKT
ken@email.com,Ken,PKR
kevin@email.com,Kevin,LTP


In [None]:
df

Unnamed: 0,name,email,address
0,Jon,jon@email.com,KTM
1,Jane,jane@email.com,BKT
2,Ken,ken@email.com,PKR
3,Kevin,kevin@email.com,LTP


In [None]:
new_df = df.set_index('email')
df.set_index("email", inplace=True)
df

Unnamed: 0_level_0,name,address
email,Unnamed: 1_level_1,Unnamed: 2_level_1
jon@email.com,Jon,KTM
jane@email.com,Jane,BKT
ken@email.com,Ken,PKR
kevin@email.com,Kevin,LTP


In [None]:
df

Unnamed: 0_level_0,name,address
email,Unnamed: 1_level_1,Unnamed: 2_level_1
jon@email.com,Jon,KTM
jane@email.com,Jane,BKT
ken@email.com,Ken,PKR
kevin@email.com,Kevin,LTP


In [None]:
df.loc[["jon@email.com", "kevin@email.com"], ["name", "address"]]

Unnamed: 0_level_0,name,address
email,Unnamed: 1_level_1,Unnamed: 2_level_1
jon@email.com,Jon,KTM
kevin@email.com,Kevin,LTP


In [None]:
df.iloc[[0, 1], [0, 1]]

Unnamed: 0_level_0,name,address
email,Unnamed: 1_level_1,Unnamed: 2_level_1
jon@email.com,Jon,KTM
jane@email.com,Jane,BKT


In [None]:
df.reset_index(inplace=True)   # This removes the custom indexing and reset to default

In [None]:
df

Unnamed: 0,email,name,address
0,jon@email.com,Jon,KTM
1,jane@email.com,Jane,BKT
2,ken@email.com,Ken,PKR
3,kevin@email.com,Kevin,LTP


In [None]:
df.sort_index()

Unnamed: 0_level_0,name,address
email,Unnamed: 1_level_1,Unnamed: 2_level_1
jane@email.com,Jane,BKT
jon@email.com,Jon,KTM
ken@email.com,Ken,PKR
kevin@email.com,Kevin,LTP


In [None]:
df.sort_index(ascending=False)

Unnamed: 0_level_0,name,address
email,Unnamed: 1_level_1,Unnamed: 2_level_1
kevin@email.com,Kevin,LTP
ken@email.com,Ken,PKR
jon@email.com,Jon,KTM
jane@email.com,Jane,BKT


In [None]:
# Filtering
df["address"] == "PKR"  # it is just a masking
filt = (df["address"] == "PKR")
df[filt]

Unnamed: 0_level_0,name,address
email,Unnamed: 1_level_1,Unnamed: 2_level_1
ken@email.com,Ken,PKR


In [None]:
df.loc[filt]

Unnamed: 0_level_0,name,address
email,Unnamed: 1_level_1,Unnamed: 2_level_1
ken@email.com,Ken,PKR


In [None]:
df.loc[filt, "address"]

email
ken@email.com    PKR
Name: address, dtype: object

In [None]:
df.loc[filt, ["address", "name"]]

Unnamed: 0_level_0,address,name
email,Unnamed: 1_level_1,Unnamed: 2_level_1
ken@email.com,PKR,Ken


In [None]:
df

Unnamed: 0_level_0,name,address
email,Unnamed: 1_level_1,Unnamed: 2_level_1
jon@email.com,Jon,KTM
jane@email.com,Jane,BKT
ken@email.com,Ken,PKR
kevin@email.com,Kevin,LTP


In [None]:
filt = ((df["address"] == "BKT") | (df["name"] == "Kevin"))
df.loc[filt]

Unnamed: 0_level_0,name,address
email,Unnamed: 1_level_1,Unnamed: 2_level_1
jane@email.com,Jane,BKT
kevin@email.com,Kevin,LTP


In [None]:
df.loc[~filt]   # ~ refers to negate/ exclude

Unnamed: 0_level_0,name,address
email,Unnamed: 1_level_1,Unnamed: 2_level_1
jon@email.com,Jon,KTM
ken@email.com,Ken,PKR
