Steps for Data Cleaning and Preprocessing

In [None]:
# Load the Data

import pandas as pd
# Read the raw dataset into a DataFrame ('your_data.csv' is a placeholder path).
df = pd.read_csv('your_data.csv')

# Basic Inspection

# First rows, to eyeball the column names and sample values.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
# Summary statistics of the columns.
print(df.describe())

# Check for Missing Values (NaNs)

# Number of missing entries in each column.
print(df.isna().sum())

# Handle Missing Values
#
# The strategies below are alternatives: keep the one that suits the data.
# Run top to bottom as written, the first dropna() already removes every row
# containing a NaN, so the later steps have nothing left to fill.

# Remove Missing Values:

df.dropna(inplace=True)  # Remove rows with missing values
df.dropna(axis=1, inplace=True)  # Remove columns with missing values

# Fill with a Specific Value:

df.fillna(0, inplace=True)  # Fill missing values with 0

# Fill with Mean/Median/Mode:

# Assign the filled Series back to the column. Calling
# fillna(..., inplace=True) on df['column_name'] is chained assignment: it may
# act on a temporary object, and under pandas Copy-on-Write it never updates df.
df['column_name'] = df['column_name'].fillna(df['column_name'].mean())  # Fill with mean
df['column_name'] = df['column_name'].fillna(df['column_name'].median())  # Fill with median
df['column_name'] = df['column_name'].fillna(df['column_name'].mode()[0])  # Fill with mode

# Forward Fill/Backward Fill:

# ffill()/bfill() replace the deprecated fillna(method=...) spelling.
df.ffill(inplace=True)  # Forward fill
df.bfill(inplace=True)  # Backward fill

# Linear interpolation:

df.interpolate(method='linear', inplace=True)

# Check Data Types and Conversion

# Show the dtype of every column, then cast the example column to integers.
column_types = df.dtypes
print(column_types)
as_integers = df['column_name'].astype('int')
df['column_name'] = as_integers

# Check for Duplicates

# Report how many rows are exact repeats of an earlier row, then drop them.
duplicate_flags = df.duplicated()
print(duplicate_flags.sum())
df.drop_duplicates(inplace=True)

# Outlier Detection and Handling

import seaborn as sns
import matplotlib.pyplot as plt

# Box plot of the column, to spot outliers visually.
sns.boxplot(x=df['column_name'])
plt.show()

# Upper outlier fence from the 1.5 * IQR rule (the convention box-plot
# whiskers use by default). Replace it with a domain-specific limit if one
# exists; the filter below needs `threshold` to be defined before it runs.
q1 = df['column_name'].quantile(0.25)
q3 = df['column_name'].quantile(0.75)
threshold = q3 + 1.5 * (q3 - q1)

# Keep only rows strictly below the threshold. Rows where the column is NaN
# are dropped as well, because a comparison with NaN evaluates to False.
df = df[df['column_name'] < threshold]

# Data Visualization

# Draw histograms of the DataFrame's columns with 50 bins on a (20, 15) figure.
histogram_options = {'bins': 50, 'figsize': (20, 15)}
df.hist(**histogram_options)
plt.show()

# Save the Cleaned Data

# Write the cleaned frame to CSV, leaving the row index out of the file.
output_path = 'cleaned_data.csv'
df.to_csv(output_path, index=False)