<a href="https://colab.research.google.com/github/lovnishverma/Python-Getting-Started/blob/main/Pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data Cleaning with Pandas

```
!pip install pandas
```

In [53]:
# Step 1: Import Necessary Libraries
import pandas as pd

In [62]:
# Step 2: Load Data from CSV (or use sample data)
# Dataset page: https://github.com/lovnishverma/datasets/blob/main/pandas_tutorial1.csv
# The URL passed to read_csv below is the raw-file address of that same dataset.

df = pd.read_csv("https://raw.githubusercontent.com/lovnishverma/datasets/refs/heads/main/pandas_tutorial1.csv")

# Display the original dataset
print("Original Data:")
print(df)
print(df.head())  # First 5 rows
print(df.tail())  # Last 5 rows
# info() writes its report (dtypes and non-null counts) straight to stdout and
# returns None, so it is called directly; wrapping it in print() would add a
# stray "None" line to the output.
df.info()
print(df.describe())   # Summary statistics


# Selecting & Filtering Data (Selecting Columns)

# print(df['Name'])  # Selecting a single column
# print(df[['Name', 'Age']])  # Selecting multiple columns


# You can drop a column from the DataFrame using the .drop() method.
# df.drop(columns=['City'], inplace=True)

# If you want to remove multiple columns, pass a list:
# df.drop(columns=['City', 'Salary'], inplace=True)



Original Data:
             Name   Age        City   Salary
0     aman Kapoor  25.0       Delhi  50000.0
1    Bharat Kumar   NaN     Kolkata  60000.0
2    charu thakur  30.0     Chennai      NaN
3    dinesh singh  35.0  Hoshiarpur  45000.0
4          Emilly  40.0      Mumbai  70000.0
5     Rajat gupta  28.0         NaN      NaN
6  Anamika Kumari   NaN    Dehradun  55000.0
7    dinesh singh  35.0  Hoshiarpur  45000.0
8      Amar kumar   NaN       Ropar  62000.0
           Name   Age        City   Salary
0   aman Kapoor  25.0       Delhi  50000.0
1  Bharat Kumar   NaN     Kolkata  60000.0
2  charu thakur  30.0     Chennai      NaN
3  dinesh singh  35.0  Hoshiarpur  45000.0
4        Emilly  40.0      Mumbai  70000.0
             Name   Age        City   Salary
4          Emilly  40.0      Mumbai  70000.0
5     Rajat gupta  28.0         NaN      NaN
6  Anamika Kumari   NaN    Dehradun  55000.0
7    dinesh singh  35.0  Hoshiarpur  45000.0
8      Amar kumar   NaN       Ropar  62000.0
<class 

In [55]:
# Step 3: Identify Missing Values
# Tally the null entries of every column and show the tally under a heading.
print("\nMissing Values Count:", df.isnull().sum(), sep="\n")


Missing Values Count:
Name      0
Age       3
City      1
Salary    2
dtype: int64


In [56]:
# Step 4: Handle Missing Values
# Swap any literal 'NaN' strings for None, which pandas counts as missing
df.replace({'NaN': None}, inplace=True)

# Coerce 'Salary' to a numeric dtype; entries that cannot be parsed become NaN
df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce')

# Work out the per-column fill values first, then apply them in one assign()
age_mean = df['Age'].mean()             # missing Age      -> column mean
salary_median = df['Salary'].median()   # missing Salary   -> column median
df = df.assign(
    Age=df['Age'].fillna(age_mean),
    City=df['City'].fillna("Unknown"),  # missing City     -> "Unknown"
    Salary=df['Salary'].fillna(salary_median),
)

# Confirm that no nulls remain in any column
print("\nMissing Values After Handling:", df.isnull().sum(), sep="\n")



Missing Values After Handling:
Name      0
Age       0
City      0
Salary    0
dtype: int64


In [57]:
# Step 5: Remove Duplicates
# Show the frame as it stands before de-duplication
print(df)
# Discard rows that repeat an earlier row in every column; the first
# occurrence is the one that stays. The frame is modified in place.
df.drop_duplicates(keep="first", inplace=True)


             Name        Age        City   Salary
0     aman Kapoor  25.000000       Delhi  50000.0
1    Bharat Kumar  32.166667     Kolkata  60000.0
2    charu thakur  30.000000     Chennai  55000.0
3    dinesh singh  35.000000  Hoshiarpur  45000.0
4          Emilly  40.000000      Mumbai  70000.0
5     Rajat gupta  28.000000     Unknown  55000.0
6  Anamika Kumari  32.166667    Dehradun  55000.0
7    dinesh singh  35.000000  Hoshiarpur  45000.0
8      Amar kumar  32.166667       Ropar  62000.0


In [58]:
# Step 6: Standardize Text Columns
# Names: title-case every word
title_cased_names = df['Name'].str.title()
df['Name'] = title_cased_names

# Cities: title-case every word, then trim leading/trailing whitespace
title_cased_cities = df['City'].str.title()
df['City'] = title_cased_cities.str.strip()

In [59]:
# Step 7: Convert Data Types
# Cast each numeric column to integer type, one column at a time
for numeric_column in ('Age', 'Salary'):
    df[numeric_column] = df[numeric_column].astype(int)

# Show the cleaned dataset under its heading
print("\nData After Cleaning:", df, sep="\n")


Data After Cleaning:
             Name  Age        City  Salary
0     Aman Kapoor   25       Delhi   50000
1    Bharat Kumar   32     Kolkata   60000
2    Charu Thakur   30     Chennai   55000
3    Dinesh Singh   35  Hoshiarpur   45000
4          Emilly   40      Mumbai   70000
5     Rajat Gupta   28     Unknown   55000
6  Anamika Kumari   32    Dehradun   55000
8      Amar Kumar   32       Ropar   62000


In [60]:
# Step 8: Save the Cleaned Data to a CSV File
# The output file name is defined once so that the file actually written and
# the confirmation message below can never disagree.
CLEANED_CSV_PATH = "cleaned_data.csv"

# index=False leaves the row index out of the file
df.to_csv(CLEANED_CSV_PATH, index=False)
print(f"\nCleaned data saved as {CLEANED_CSV_PATH}")


Cleaned data saved as cleaned_data.csv
