Pandas dtype | Python type | Usage
-------------|-------------|--------------------------
object       | str         | Text or mixed types
int64        | int         | Integer numbers
float64      | float       | Floating-point numbers
bool         | bool        | Boolean (True/False) values
datetime64[ns] | datetime  | Datetime objects (nanosecond resolution)
timedelta64[ns] | timedelta | Time differences
category     | N/A         | Categorical data (fixed set of values)

In [1]:
import pandas as pd

In [24]:
# Load the sample dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv("data_cleaning_sample.csv")
# Display the raw frame so the missing values and repeated rows are visible
# before any cleaning is applied.
df

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,,28.0,Delhi,F,eve@domain.com,
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,,Delhi,M,charlie@example,20-07-2021
9,Alice,25.0,New York,F,alice@example.com,01-05-2021


In [4]:
# Element-wise missing-value mask: True marks a cell whose value is missing.
df.isnull()

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,False,False,False,False,False,False
1,False,True,False,False,False,False
2,False,False,False,False,False,False
3,False,True,False,False,False,False
4,False,False,False,False,False,False
5,True,False,False,False,False,True
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,True,False,False,False,False


In [5]:
# Count missing values per column (each True in the mask counts as 1).
df.isnull().sum()

Name         1
Age          3
City         0
Gender       0
Email        0
Join Date    1
dtype: int64

In [6]:
# Drop every row that contains at least one missing value. The result is a
# new frame; df itself is not changed because the result is not assigned.
df.dropna()

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021


In [10]:
# axis=1 switches from rows to columns: every column that contains a missing
# value is removed, so only the fully populated columns remain.
df.dropna(axis=1)

Unnamed: 0,City,Gender,Email
0,New York,F,alice@example.com
1,Delhi,M,charlie@example
2,Los Angeles,M,bob@example.com
3,Delhi,M,charlie@example
4,Mumbai,M,david@example.com
5,Delhi,F,eve@domain.com
6,New York,F,alice@example.com
7,New York,F,alice@example.com
8,Delhi,M,charlie@example


In [11]:
# Replace every missing value with 0. Note that this also writes 0 into the
# text columns (Name, Join Date), not only into the numeric Age column.
df.fillna(0)

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,0.0,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,0.0,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,0,28.0,Delhi,F,eve@domain.com,0
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,0.0,Delhi,M,charlie@example,20-07-2021


In [16]:
# Fill missing ages with the mean of the known ages. The result is a new
# Series; the Age column in df is not changed unless it is assigned back.
df["Age"].fillna(df["Age"].mean())

0    25.000000
1    25.833333
2    30.000000
3    25.833333
4    22.000000
5    28.000000
6    25.000000
7    25.000000
8    25.833333
Name: Age, dtype: float64

In [17]:
# Forward fill: carry the last valid value downward into each gap.
# DataFrame.ffill() replaces fillna(method="ffill"), whose `method` argument
# is deprecated and emits a FutureWarning in recent pandas releases.
df.ffill()


Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,25.0,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,30.0,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,David,28.0,Delhi,F,eve@domain.com,12-11-2019
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,25.0,Delhi,M,charlie@example,20-07-2021


In [18]:
# Backward fill: pull the next valid value upward into each gap. A gap with
# no later valid value below it stays missing.
# DataFrame.bfill() replaces fillna(method="bfill"), whose `method` argument
# is deprecated and emits a FutureWarning in recent pandas releases.
df.bfill()


Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,30.0,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,22.0,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,Alice,28.0,Delhi,F,eve@domain.com,01-05-2021
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,,Delhi,M,charlie@example,20-07-2021


In [25]:
# Flag rows that repeat an earlier row across all columns. The first
# occurrence is reported as False; only the later repeats are True.
df.duplicated()

0     False
1     False
2     False
3      True
4     False
5     False
6      True
7      True
8      True
9      True
10    False
dtype: bool

In [27]:
# Remove repeated rows, keeping the first occurrence of each. The original
# index labels are preserved, so the result's index has gaps.
df.drop_duplicates()

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,,28.0,Delhi,F,eve@domain.com,
10,Alice,35.0,Delhi,F,alicqe@example.com,01-05-2021


In [28]:
# Compare only the Name and City columns when looking for repeats; the other
# columns are ignored for the purpose of deciding what counts as a duplicate.
df.duplicated(subset=["Name","City"])

0     False
1     False
2     False
3      True
4     False
5     False
6      True
7      True
8      True
9      True
10    False
dtype: bool

In [31]:
# Lower-case every name via the .str accessor; missing names stay NaN.
df["Name"].str.lower()

0       alice
1     charlie
2         bob
3     charlie
4       david
5         NaN
6       alice
7       alice
8     charlie
9       alice
10      alice
Name: Name, dtype: object

In [32]:
# Upper-case every name via the .str accessor; missing names stay NaN.
df["Name"].str.upper()

0       ALICE
1     CHARLIE
2         BOB
3     CHARLIE
4       DAVID
5         NaN
6       ALICE
7       ALICE
8     CHARLIE
9       ALICE
10      ALICE
Name: Name, dtype: object

In [33]:
# Boolean mask: True where the city name contains "delhi". case=False makes
# the match case-insensitive, so "Delhi" matches as well.
df["City"].str.contains("delhi", case=False)

0     False
1      True
2     False
3      True
4     False
5      True
6     False
7     False
8      True
9     False
10     True
Name: City, dtype: bool

In [34]:
# Split each e-mail address at "@", giving a list of parts per row
# (here: the local part and the domain).
df["Email"].str.split("@")

0      [alice, example.com]
1        [charlie, example]
2        [bob, example.com]
3        [charlie, example]
4      [david, example.com]
5         [eve, domain.com]
6      [alice, example.com]
7      [alice, example.com]
8        [charlie, example]
9      [alice, example.com]
10    [alicqe, example.com]
Name: Email, dtype: object

In [52]:
# Drop incomplete rows, then cast Age to int within the same chain. Age can
# only be converted once its missing values are gone, because NaN cannot be
# represented in a plain integer column.
# Chaining avoids assigning a column onto the row-filtered result of
# dropna(), a pattern pandas can flag with SettingWithCopyWarning.
df = df.dropna().astype({"Age": int})

df

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25,New York,F,alice@example.com,01-05-2021
2,Bob,30,Los Angeles,M,bob@example.com,15-06-2020
4,David,22,Mumbai,M,david@example.com,12-11-2019
6,Alice,25,New York,F,alice@example.com,01-05-2021
7,Alice,25,New York,F,alice@example.com,01-05-2021
9,Alice,25,New York,F,alice@example.com,01-05-2021
10,Alice,35,Delhi,F,alicqe@example.com,01-05-2021


In [69]:
# Parse the day-first date strings (e.g. 15-06-2020 -> 2020-06-15) and store
# Gender as a categorical column. assign() builds a new frame instead of
# writing columns in place into the frame that came out of dropna(), which
# pandas can flag with SettingWithCopyWarning. Column order is unchanged
# because assign() overwrites existing columns where they already sit.
df = df.assign(
    **{"Join Date": pd.to_datetime(df["Join Date"], dayfirst=True)},
    Gender=df["Gender"].astype("category"),
)
df

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25,New York,F,alice@example.com,2021-05-01
2,Bob,30,Los Angeles,M,bob@example.com,2020-06-15
4,David,22,Mumbai,M,david@example.com,2019-11-12
6,Alice,25,New York,F,alice@example.com,2021-05-01
7,Alice,25,New York,F,alice@example.com,2021-05-01
9,Alice,25,New York,F,alice@example.com,2021-05-01
10,Alice,35,Delhi,F,alicqe@example.com,2021-05-01


In [71]:
# Inspect the dtype of Gender to confirm the conversion to categorical and
# to see which categories were found.
df["Gender"].dtype

CategoricalDtype(categories=['F', 'M'], ordered=False, categories_dtype=object)