<a href="https://colab.research.google.com/github/lovnishverma/Python-Getting-Started/blob/main/Pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data Cleaning with Pandas

```
!pip install pandas
```

In [9]:
# Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np

In [10]:
# Step 2: Load Data from CSV (or use sample data)
# Dataset page: https://github.com/lovnishverma/datasets/blob/main/pandas_tutorial1.csv
# The raw-file URL below is what is actually read into the DataFrame.

df = pd.read_csv("https://raw.githubusercontent.com/lovnishverma/datasets/refs/heads/main/pandas_tutorial1.csv")

# Display the original dataset
print("Original Data:")
print(df)
print(df.head())  # First 5 rows
print(df.tail())  # Last 5 rows
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()              # Data types & missing values
print(df.describe())   # Summary statistics


# Selecting & Filtering Data (Selecting Columns)

# print(df['Name'])  # Selecting a single column
# print(df[['Name', 'Age']])  # Selecting multiple columns


# You can drop a column from the DataFrame using the .drop() method.
# df.drop(columns=['City'], inplace=True)

# If you want to remove multiple columns, pass a list:
# df.drop(columns=['City', 'Salary'], inplace=True)



Original Data:
             Name   Age        City   Salary
0     aman Kapoor  25.0       Delhi  50000.0
1    Bharat Kumar   NaN     Kolkata  60000.0
2    charu thakur  30.0     Chennai      NaN
3    dinesh singh  35.0  Hoshiarpur  45000.0
4          Emilly  40.0      Mumbai  70000.0
5     Rajat gupta  28.0         NaN      NaN
6  Anamika Kumari   NaN    Dehradun  55000.0
7    dinesh singh  35.0  Hoshiarpur  45000.0
8      Amar kumar   NaN       Ropar  62000.0
           Name   Age        City   Salary
0   aman Kapoor  25.0       Delhi  50000.0
1  Bharat Kumar   NaN     Kolkata  60000.0
2  charu thakur  30.0     Chennai      NaN
3  dinesh singh  35.0  Hoshiarpur  45000.0
4        Emilly  40.0      Mumbai  70000.0
             Name   Age        City   Salary
4          Emilly  40.0      Mumbai  70000.0
5     Rajat gupta  28.0         NaN      NaN
6  Anamika Kumari   NaN    Dehradun  55000.0
7    dinesh singh  35.0  Hoshiarpur  45000.0
8      Amar kumar   NaN       Ropar  62000.0
<class 

In [11]:
# Step 3: Identify Missing Values
# Tally the null entries in each column to see where cleaning is needed.
missing_per_column = df.isna().sum()
print("\nMissing Values Count:")
print(missing_per_column)


Missing Values Count:
Name      0
Age       3
City      1
Salary    2
dtype: int64


In [12]:
# Step 4: Handle Missing Values
# Turn any literal 'NaN' text into a real missing value so pandas treats it as null.
nan_aliases = {'NaN': np.nan}
df.replace(nan_aliases, inplace=True)

# Salary has to be numeric before a median can be taken; entries that cannot
# be parsed are coerced to NaN.
df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce')

# Work out one replacement value per column:
#   Age    -> mean of the known ages
#   Salary -> median of the known salaries
#   City   -> the most frequent city (first entry of the mode)
age_mean = df['Age'].mean()
salary_median = df['Salary'].median()
most_common_city = df['City'].mode()[0]

# Assign the filled columns back instead of filling in place.
df['Age'] = df['Age'].fillna(age_mean)
df['Salary'] = df['Salary'].fillna(salary_median)
df['City'] = df['City'].fillna(most_common_city)

# Show the frame after imputation.
print("Cleaned Data:\n", df)

# Safety net: discard any row that still contains a missing value.
df_cleaned = df.dropna()
print("\n Final Data After Dropping Missing Rows (if any left):\n", df_cleaned)

# Per-column count of nulls still present in df.
print("\n Missing Values Summary:\n", df.isnull().sum())

Cleaned Data:
              Name        Age        City   Salary
0     aman Kapoor  25.000000       Delhi  50000.0
1    Bharat Kumar  32.166667     Kolkata  60000.0
2    charu thakur  30.000000     Chennai  55000.0
3    dinesh singh  35.000000  Hoshiarpur  45000.0
4          Emilly  40.000000      Mumbai  70000.0
5     Rajat gupta  28.000000  Hoshiarpur  55000.0
6  Anamika Kumari  32.166667    Dehradun  55000.0
7    dinesh singh  35.000000  Hoshiarpur  45000.0
8      Amar kumar  32.166667       Ropar  62000.0

 Final Data After Dropping Missing Rows (if any left):
              Name        Age        City   Salary
0     aman Kapoor  25.000000       Delhi  50000.0
1    Bharat Kumar  32.166667     Kolkata  60000.0
2    charu thakur  30.000000     Chennai  55000.0
3    dinesh singh  35.000000  Hoshiarpur  45000.0
4          Emilly  40.000000      Mumbai  70000.0
5     Rajat gupta  28.000000  Hoshiarpur  55000.0
6  Anamika Kumari  32.166667    Dehradun  55000.0
7    dinesh singh  35.000000

In [13]:
# Step 5: Remove Duplicates
# Print the frame as it stands so the repeated row is visible before removal.
print(df)
# Keep the first occurrence of each fully identical row and drop later repeats,
# modifying df in place (surviving rows keep their original index labels).
df.drop_duplicates(keep='first', inplace=True)


             Name        Age        City   Salary
0     aman Kapoor  25.000000       Delhi  50000.0
1    Bharat Kumar  32.166667     Kolkata  60000.0
2    charu thakur  30.000000     Chennai  55000.0
3    dinesh singh  35.000000  Hoshiarpur  45000.0
4          Emilly  40.000000      Mumbai  70000.0
5     Rajat gupta  28.000000  Hoshiarpur  55000.0
6  Anamika Kumari  32.166667    Dehradun  55000.0
7    dinesh singh  35.000000  Hoshiarpur  45000.0
8      Amar kumar  32.166667       Ropar  62000.0


In [14]:
# Step 6: Standardize Text Columns
# Names: title-case every word (e.g. "aman Kapoor" -> "Aman Kapoor").
name_titled = df['Name'].str.title()
# Cities: title-case first, then trim surrounding whitespace.
city_titled = df['City'].str.title()
df['Name'] = name_titled
df['City'] = city_titled.str.strip()

In [15]:
# Step 7: Convert Data Types
# Convert Age and Salary to integer type.
# astype(int) on its own truncates toward zero, so a mean-imputed age such as
# 32.9 would be stored as 32. Rounding first stores the nearest whole number.
# Both columns must be free of NaN at this point, otherwise astype(int) raises.
df['Age'] = df['Age'].round().astype(int)
df['Salary'] = df['Salary'].round().astype(int)

# Display the cleaned dataset
print("\nData After Cleaning:")
print(df)


Data After Cleaning:
             Name  Age        City  Salary
0     Aman Kapoor   25       Delhi   50000
1    Bharat Kumar   32     Kolkata   60000
2    Charu Thakur   30     Chennai   55000
3    Dinesh Singh   35  Hoshiarpur   45000
4          Emilly   40      Mumbai   70000
5     Rajat Gupta   28  Hoshiarpur   55000
6  Anamika Kumari   32    Dehradun   55000
8      Amar Kumar   32       Ropar   62000


In [16]:
# Step 8: Save the Cleaned Data to a CSV File
# The path is resolved against the kernel's current working directory.
output_path = "cleaned_data.csv"
# index=False leaves the row labels out of the file, so only the data columns are written.
df.to_csv(output_path, index=False)
print(f"\nCleaned data saved as {output_path}")


Cleaned data saved as cleaned_data.csv
