In [2]:
import numpy as np 
import pandas as pd

In [3]:
# Toy roster in which every column has at least one missing entry (np.nan),
# so the cells below have something to detect, fill and drop.
data = dict(
    Name=['Tom', 'James', 'Ricky', 'Messi', 'Ronaldo', np.nan, 'Jack'],
    Age=[25, 26, np.nan, 23, np.nan, 29, 23],
    Rating=[4.23, 3.24, 3.98, np.nan, 4.6, np.nan, 4.32],
)

# Column order follows the key order of `data`: Name, Age, Rating.
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Rating
0,Tom,25.0,4.23
1,James,26.0,3.24
2,Ricky,,3.98
3,Messi,23.0,
4,Ronaldo,,4.6
5,,29.0,
6,Jack,23.0,4.32


    isnull() and isna() are aliases of each other: both return a boolean mask that is
    "True" wherever a value is missing (NaN / None). An empty string "" is not treated as missing.

In [4]:
# isnull()
df.isnull()

Unnamed: 0,Name,Age,Rating
0,False,False,False
1,False,False,False
2,False,True,False
3,False,False,True
4,False,True,False
5,True,False,True
6,False,False,False


In [25]:
# Count the missing values in each column (True counts as 1 when summed).
df.isnull().sum()

Name      1
Age       2
Rating    2
dtype: int64

In [26]:
df.isna()

Unnamed: 0,Name,Age,Rating
0,False,False,False
1,False,False,False
2,False,True,False
3,False,False,True
4,False,True,False
5,True,False,True
6,False,False,False


In [27]:
# Same per-column count of missing values, this time via isna().
df.isna().sum()

Name      1
Age       2
Rating    2
dtype: int64

In [28]:
# notnull() is the inverse of isnull(): True wherever a value is present.
df.notnull()

Unnamed: 0,Name,Age,Rating
0,True,True,True
1,True,True,True
2,True,False,True
3,True,True,False
4,True,False,True
5,False,True,False
6,True,True,True


In [29]:
# Count the non-missing values in each column.
df.notnull().sum()

Name      6
Age       5
Rating    5
dtype: int64

In [30]:
df.notna()

Unnamed: 0,Name,Age,Rating
0,True,True,True
1,True,True,True
2,True,False,True
3,True,True,False
4,True,False,True
5,False,True,False
6,True,True,True


In [31]:
df.notna().sum()

Name      6
Age       5
Rating    5
dtype: int64

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    6 non-null      object 
 1   Age     5 non-null      float64
 2   Rating  5 non-null      float64
dtypes: float64(2), object(1)
memory usage: 296.0+ bytes


In [33]:
df.isna()
# True only where the value is missing (NaN); every present value gives False.

Unnamed: 0,Name,Age,Rating
0,False,False,False
1,False,False,False
2,False,True,False
3,False,False,True
4,False,True,False
5,True,False,True
6,False,False,False


In [34]:
df.isna().sum()

Name      1
Age       2
Rating    2
dtype: int64

## Cleaning and Filling Missing Data

In [35]:
df

Unnamed: 0,Name,Age,Rating
0,Tom,25.0,4.23
1,James,26.0,3.24
2,Ricky,,3.98
3,Messi,23.0,
4,Ronaldo,,4.6
5,,29.0,
6,Jack,23.0,4.32


In [36]:
# Fill the missing name and assign the filled column back to the frame.
# The previous form, df['Name'].fillna(..., inplace=True), is an in-place call on
# a chained selection: recent pandas versions warn about it, and with
# Copy-on-Write enabled it no longer updates df at all.
df['Name'] = df['Name'].fillna('Neymar')

In [37]:
df

Unnamed: 0,Name,Age,Rating
0,Tom,25.0,4.23
1,James,26.0,3.24
2,Ricky,,3.98
3,Messi,23.0,
4,Ronaldo,,4.6
5,Neymar,29.0,
6,Jack,23.0,4.32


## Handle The Missing Value

In [38]:
# By default dropna() drops every row that contains at least one NaN.
# It returns a new DataFrame; df itself is not modified.
df.dropna()

Unnamed: 0,Name,Age,Rating
0,Tom,25.0,4.23
1,James,26.0,3.24
6,Jack,23.0,4.32


In [43]:
# axis=0 (the default) drops rows; axis=1 drops columns.
df.dropna(axis=1)
# The result omits every column that contains at least one NaN,
# so only 'Name' survives here. df itself is left unchanged.

Unnamed: 0,Name
0,Tom
1,James
2,Ricky
3,Messi
4,Ronaldo
5,Neymar
6,Jack


In [44]:
df

Unnamed: 0,Name,Age,Rating
0,Tom,25.0,4.23
1,James,26.0,3.24
2,Ricky,,3.98
3,Messi,23.0,
4,Ronaldo,,4.6
5,Neymar,29.0,
6,Jack,23.0,4.32


## Replacing the Missing Value

In [41]:
# Display the frame with every NaN shown as the string 'Blank'.
# The result is not assigned, so df keeps its NaN values.
df.replace(np.nan,'Blank')

Unnamed: 0,Name,Age,Rating
0,Tom,25.0,4.23
1,James,26.0,3.24
2,Ricky,Blank,3.98
3,Messi,23.0,Blank
4,Ronaldo,Blank,4.6
5,Neymar,29.0,Blank
6,Jack,23.0,4.32


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    7 non-null      object 
 1   Age     5 non-null      float64
 2   Rating  5 non-null      float64
dtypes: float64(2), object(1)
memory usage: 296.0+ bytes


In [45]:
df.isnull().sum()

Name      0
Age       2
Rating    2
dtype: int64

In [46]:
# Reverse mapping: turn the string 'Blank' back into NaN.
# df still holds NaN (not 'Blank') at this point, so the output matches df.
df.replace('Blank',np.nan)

Unnamed: 0,Name,Age,Rating
0,Tom,25.0,4.23
1,James,26.0,3.24
2,Ricky,,3.98
3,Messi,23.0,
4,Ronaldo,,4.6
5,Neymar,29.0,
6,Jack,23.0,4.32


In [47]:
# Fill the missing values of every numeric column (Age and Rating) with that
# column's mean. fillna() with a Series matches its index against the column
# labels, so each column receives its own mean.
# numeric_only=True keeps the string 'Name' column out of the mean computation:
# without it older pandas emits a FutureWarning and pandas 2.x raises TypeError.
# The result is a new DataFrame; df is not modified.
df.fillna(df.mean(numeric_only=True))

  df.replace(np.nan,df.mean())


Unnamed: 0,Name,Age,Rating
0,Tom,25.0,4.23
1,James,26.0,3.24
2,Ricky,25.2,3.98
3,Messi,23.0,4.074
4,Ronaldo,25.2,4.6
5,Neymar,29.0,4.074
6,Jack,23.0,4.32


In [49]:
# Bracket Notation
df['Age'].mean()

25.2

In [50]:
# Dot Notation
df.Age.mean()

25.2

In [51]:
df['Rating'].mean()

4.074

In [52]:
df.Rating.mean()

4.074

In [53]:
df.Rating.median()

4.23

In [54]:
df.Age.median()

25.0

## Drop Functions

In [55]:
df

Unnamed: 0,Name,Age,Rating
0,Tom,25.0,4.23
1,James,26.0,3.24
2,Ricky,,3.98
3,Messi,23.0,
4,Ronaldo,,4.6
5,Neymar,29.0,
6,Jack,23.0,4.32


In [57]:
# Drop the "Age" column.
# When passing labels positionally, add axis=1 so they are looked up among the columns:
# axis = 0 for rows
# axis = 1 for columns
df.drop(['Age'], axis=1)

Unnamed: 0,Name,Rating
0,Tom,4.23
1,James,3.24
2,Ricky,3.98
3,Messi,
4,Ronaldo,4.6
5,Neymar,
6,Jack,4.32


In [58]:
# Deleting multiple columns
df.drop(['Age','Rating'], axis=1)

Unnamed: 0,Name
0,Tom
1,James
2,Ricky
3,Messi
4,Ronaldo
5,Neymar
6,Jack


In [60]:
# Alternative to axis=1: name the column to drop with the columns= keyword.
# Single column:
df.drop(columns='Age')

Unnamed: 0,Name,Rating
0,Tom,4.23
1,James,3.24
2,Ricky,3.98
3,Messi,
4,Ronaldo,4.6
5,Neymar,
6,Jack,4.32


In [61]:
# Multiple columns: pass a list to columns=.
df.drop(columns=['Age','Rating'])

Unnamed: 0,Name
0,Tom
1,James
2,Ricky
3,Messi
4,Ronaldo
5,Neymar
6,Jack


In [59]:
df

Unnamed: 0,Name,Age,Rating
0,Tom,25.0,4.23
1,James,26.0,3.24
2,Ricky,,3.98
3,Messi,23.0,
4,Ronaldo,,4.6
5,Neymar,29.0,
6,Jack,23.0,4.32


***Let's try to drop rows by their index labels***

In [62]:
df

Unnamed: 0,Name,Age,Rating
0,Tom,25.0,4.23
1,James,26.0,3.24
2,Ricky,,3.98
3,Messi,23.0,
4,Ronaldo,,4.6
5,Neymar,29.0,
6,Jack,23.0,4.32


In [65]:
# The list holds the index labels of the rows to drop
# (labels, not positional row/column numbers).
df.drop([2,0])
# The rows labelled 0 and 2 are missing from the result, but df itself is unchanged.
# To keep the change, assign the result back (df = df.drop([2, 0])) or pass inplace=True.


Unnamed: 0,Name,Age,Rating
1,James,26.0,3.24
3,Messi,23.0,
4,Ronaldo,,4.6
5,Neymar,29.0,
6,Jack,23.0,4.32


In [66]:
df

Unnamed: 0,Name,Age,Rating
0,Tom,25.0,4.23
1,James,26.0,3.24
2,Ricky,,3.98
3,Messi,23.0,
4,Ronaldo,,4.6
5,Neymar,29.0,
6,Jack,23.0,4.32


In [67]:
# Now drop the rows whose index labels are 1, 3, 4 and 6.
df.drop([1,3,4,6])
# The result omits exactly the rows whose labels were passed in the list.

Unnamed: 0,Name,Age,Rating
0,Tom,25.0,4.23
2,Ricky,,3.98
5,Neymar,29.0,


In [68]:
# Alternative: pass the row labels through the index= keyword.
df.drop(index=[1,3,5])

Unnamed: 0,Name,Age,Rating
0,Tom,25.0,4.23
2,Ricky,,3.98
4,Ronaldo,,4.6
6,Jack,23.0,4.32


In [69]:
df

Unnamed: 0,Name,Age,Rating
0,Tom,25.0,4.23
1,James,26.0,3.24
2,Ricky,,3.98
3,Messi,23.0,
4,Ronaldo,,4.6
5,Neymar,29.0,
6,Jack,23.0,4.32


In [70]:
df.drop(index = 1, columns='Age')

Unnamed: 0,Name,Rating
0,Tom,4.23
2,Ricky,3.98
3,Messi,
4,Ronaldo,4.6
5,Neymar,
6,Jack,4.32


In [72]:
df.drop(index=1)

Unnamed: 0,Name,Age,Rating
0,Tom,25.0,4.23
2,Ricky,,3.98
3,Messi,23.0,
4,Ronaldo,,4.6
5,Neymar,29.0,
6,Jack,23.0,4.32


In [73]:
# index= and columns= can be combined in a single call: this drops the rows
# labelled 1, 3 and 4 and the 'Age' and 'Name' columns, leaving only 'Rating'.
df.drop(index=[1,3,4],columns=['Age','Name'])

Unnamed: 0,Rating
0,4.23
2,3.98
5,
6,4.32


In [74]:
df.drop(columns=['Age','Rating'])

Unnamed: 0,Name
0,Tom
1,James
2,Ricky
3,Messi
4,Ronaldo
5,Neymar
6,Jack


In [82]:
# Sort the rows by the 'Age' column.
df.sort_values(by = ['Age'])
# Ages come out in ascending order (the default); rows with a missing Age are placed last.


Unnamed: 0,Name,Age,Rating
3,Messi,23.0,
6,Jack,23.0,4.32
0,Tom,25.0,4.23
1,James,26.0,3.24
5,Neymar,29.0,
2,Ricky,,3.98
4,Ronaldo,,4.6


In [83]:
# alternate way
df.Age.sort_values()

3    23.0
6    23.0
0    25.0
1    26.0
5    29.0
2     NaN
4     NaN
Name: Age, dtype: float64

In [84]:
df['Age'].sort_values()

3    23.0
6    23.0
0    25.0
1    26.0
5    29.0
2     NaN
4     NaN
Name: Age, dtype: float64

In [85]:
df.sort_values('Age')

Unnamed: 0,Name,Age,Rating
3,Messi,23.0,
6,Jack,23.0,4.32
0,Tom,25.0,4.23
1,James,26.0,3.24
5,Neymar,29.0,
2,Ricky,,3.98
4,Ronaldo,,4.6


In [88]:
df['Age'].sort_values(ascending=False)

5    29.0
1    26.0
0    25.0
3    23.0
6    23.0
2     NaN
4     NaN
Name: Age, dtype: float64

In [92]:
df.sort_values(by=["Age"],ascending=False)

Unnamed: 0,Name,Age,Rating
5,Neymar,29.0,
1,James,26.0,3.24
0,Tom,25.0,4.23
3,Messi,23.0,
6,Jack,23.0,4.32
2,Ricky,,3.98
4,Ronaldo,,4.6


In [89]:
df.sort_values(by=["Age"],ascending=False)

Unnamed: 0,Name,Age,Rating
5,Neymar,29.0,
1,James,26.0,3.24
0,Tom,25.0,4.23
3,Messi,23.0,
6,Jack,23.0,4.32
2,Ricky,,3.98
4,Ronaldo,,4.6


In [93]:
df.sort_values(by=['Age','Rating'])
# Rows are ordered by 'Age' first; 'Rating' only breaks ties between equal ages.


Unnamed: 0,Name,Age,Rating
6,Jack,23.0,4.32
3,Messi,23.0,
0,Tom,25.0,4.23
1,James,26.0,3.24
5,Neymar,29.0,
2,Ricky,,3.98
4,Ronaldo,,4.6


In [95]:
df.sort_values(by=['Rating','Age'],ascending=False)

Unnamed: 0,Name,Age,Rating
4,Ronaldo,,4.6
6,Jack,23.0,4.32
0,Tom,25.0,4.23
2,Ricky,,3.98
1,James,26.0,3.24
5,Neymar,29.0,
3,Messi,23.0,


In [94]:
df.sort_index()

Unnamed: 0,Name,Age,Rating
0,Tom,25.0,4.23
1,James,26.0,3.24
2,Ricky,,3.98
3,Messi,23.0,
4,Ronaldo,,4.6
5,Neymar,29.0,
6,Jack,23.0,4.32


In [96]:
df.sort_index(ascending=False)

Unnamed: 0,Name,Age,Rating
6,Jack,23.0,4.32
5,Neymar,29.0,
4,Ronaldo,,4.6
3,Messi,23.0,
2,Ricky,,3.98
1,James,26.0,3.24
0,Tom,25.0,4.23


In [98]:
df.sort_index(axis=1)
# axis=1 sorts the column labels, here in ascending alphabetical order: Age, Name, Rating.

Unnamed: 0,Age,Name,Rating
0,25.0,Tom,4.23
1,26.0,James,3.24
2,,Ricky,3.98
3,23.0,Messi,
4,,Ronaldo,4.6
5,29.0,Neymar,
6,23.0,Jack,4.32


In [100]:
df.sort_index(axis=1,ascending=False)
# With ascending=False the column labels are sorted in descending alphabetical order: Rating, Name, Age.

Unnamed: 0,Rating,Name,Age
0,4.23,Tom,25.0
1,3.24,James,26.0
2,3.98,Ricky,
3,,Messi,23.0
4,4.6,Ronaldo,
5,,Neymar,29.0
6,4.32,Jack,23.0


In [101]:
df.sort_index(axis=0)

Unnamed: 0,Name,Age,Rating
0,Tom,25.0,4.23
1,James,26.0,3.24
2,Ricky,,3.98
3,Messi,23.0,
4,Ronaldo,,4.6
5,Neymar,29.0,
6,Jack,23.0,4.32


In [102]:
# pd.read_clipboard() parses the tabular text currently on the system clipboard.
# Copy a table (e.g. from a web page or a spreadsheet) before running this cell.
# NOTE: the result depends on the clipboard contents, so this cell is not
# reproducible on a fresh run, and it rebinds df, replacing the roster DataFrame used above.
df = pd.read_clipboard()
df

Unnamed: 0,Year,Winner,Runner Up,Venue,Number of teams,Player of the Match
0,2022,Gujarat Titans,Rajasthan Royals,Ahmedabad,10,--
1,2021,Chennai Super Kings,Kolkata Knight Riders,Dubai,8,Faf du Plessis
2,2020,Mumbai Indians,Delhi Capitals,Dubai,8,Trent Boult
3,2019,Mumbai Indians,Chennai Super Kings,Hyderabad,8,Jasprit Bumrah
4,2018,Chennai Super Kings,Sunrisers Hyderabad,Mumbai,8,Shane Watson
5,2017,Mumbai Indians,Rising Pune Supergiants,Hyderabad,8,Krunal Pandya
6,2016,Sunrisers Hyderabad,Royal Challengers Bangalore,Bangalore,8,Ben Cutting
7,2015,Mumbai Indians,Chennai Super Kings,Kolkata,8,Rohit Sharma
8,2014,Kolkata Knight Riders,Kings XI Punjab,Bangalore,8,Manish Pandey
9,2013,Mumbai Indians,Chennai Super Kings,Kolkata,9,Kieron Pollard


In [106]:
# Swap the '--' placeholder for 'xyz'. regex=False makes this a literal
# substring replacement regardless of the pandas version (the default of
# `regex` changed from True to False in pandas 2.0).
df['Player of the Match'] = df['Player of the Match'].str.replace('--', 'xyz', regex=False)

In [107]:
df.head()

Unnamed: 0,Year,Winner,Runner Up,Venue,Number of teams,Player of the Match
0,2022,Gujarat Titans,Rajasthan Royals,Ahmedabad,10,xyz
1,2021,Chennai Super Kings,Kolkata Knight Riders,Dubai,8,Faf du Plessis
2,2020,Mumbai Indians,Delhi Capitals,Dubai,8,Trent Boult
3,2019,Mumbai Indians,Chennai Super Kings,Hyderabad,8,Jasprit Bumrah
4,2018,Chennai Super Kings,Sunrisers Hyderabad,Mumbai,8,Shane Watson


In [108]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15 entries, 0 to 14
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Year                 15 non-null     int64 
 1   Winner               15 non-null     object
 2   Runner Up            15 non-null     object
 3   Venue                15 non-null     object
 4   Number of teams      15 non-null     int64 
 5   Player of the Match  15 non-null     object
dtypes: int64(2), object(4)
memory usage: 840.0+ bytes
