In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw CSV into a DataFrame.
# Point the path below at your own file before running.
data = pd.read_csv(filepath_or_buffer='your_dataset.csv')

# 1. Dataset overview: dimensions, per-column info, and a sample of rows
print("Dataset Shape:", data.shape)

print("\nDataset Info:")
data.info()  # prints its own report; there is no return value to print

print("\nFirst 5 Rows of the Dataset:")
print(data.head(5))

# 2. Missing-value audit
# Build the boolean "is missing" mask once and reuse it for both the
# per-column counts and the plot.
missing_mask = data.isnull()
print("\nMissing Values:")
print(missing_mask.sum())

# Heatmap of the mask: rows x columns, coloured by whether the cell is missing
sns.heatmap(missing_mask, cmap="viridis", cbar=False)
plt.title("Missing Data Heatmap")
plt.show()

# Impute missing values column by column (drop them instead if that suits
# your data better): numeric columns are filled with their mean, text
# (object) columns with their most frequent value.
#
# The filled Series is assigned back to the DataFrame rather than calling
# fillna(inplace=True) on data[column]. That form is chained assignment: it
# emits a FutureWarning on pandas 2.x and leaves `data` unchanged once
# Copy-on-Write is active.
for column in data.select_dtypes(include=np.number).columns:
    data[column] = data[column].fillna(data[column].mean())
for column in data.select_dtypes(include='object').columns:
    most_frequent = data[column].mode()
    # mode() is empty when every value in the column is missing, so there is
    # nothing to fill with; leave the column untouched instead of crashing.
    if not most_frequent.empty:
        data[column] = data[column].fillna(most_frequent.iloc[0])

# 3. Duplicate rows: report how many there are, then remove them in place
duplicate_count = data.duplicated().sum()
print("\nNumber of Duplicates:", duplicate_count)
data.drop_duplicates(inplace=True)

# 4. Descriptive statistics (describe() with its default arguments)
print("\nStatistical Summary:")
summary = data.describe()
print(summary)

# 5. Identify the text (object-dtype) columns and integer-encode them
categorical_cols = data.select_dtypes(include='object').columns
print("\nCategorical Columns:")
print(categorical_cols)

# Replace each text column with the integer codes of its categorical form.
# Any value still missing at this point is encoded as -1 by `cat.codes`.
for col_name in categorical_cols:
    as_category = data[col_name].astype('category')
    data[col_name] = as_category.cat.codes

# 6. Outlier detection
# One boxplot per numeric column. numeric_cols is computed after the
# encoding step, so integer-encoded categorical columns are included too.
numeric_cols = data.select_dtypes(include=np.number).columns
for column in numeric_cols:
    plt.figure(figsize=(8, 4))
    # Pass the values as x= explicitly. A bare positional argument is
    # deprecated in seaborn 0.11 and is read as `data` from 0.12 onwards,
    # so the plot could differ depending on the installed version.
    sns.boxplot(x=data[column])
    plt.title(f"Boxplot for {column}")
    plt.show()

# 7. Correlation matrix
# Restrict to numeric/boolean columns before correlating. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# if any remain (e.g. category-dtype columns, which the encoding of object
# columns does not touch).
numeric_frame = data.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_frame.corr()
if correlation_matrix.empty:
    # An annotated heatmap cannot be drawn from an empty matrix.
    print("\nSkipping correlation matrix: no numeric columns available.")
else:
    plt.figure(figsize=(12, 6))
    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
    plt.title("Correlation Matrix")
    plt.show()

# 8. Distribution plots
# One 30-bin histogram with a KDE curve overlaid per numeric column, each
# drawn on its own explicitly created axes.
for column in numeric_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(data[column], kde=True, bins=30, ax=ax)
    ax.set_title(f"Distribution of {column}")
    plt.show()

# 9. Scatterplot of the relationship between two numeric columns
# By default the first two numeric columns are plotted; set x_col / y_col
# to the names of the columns you are interested in.
if len(numeric_cols) >= 2:
    x_col, y_col = numeric_cols[0], numeric_cols[1]
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=data[x_col], y=data[y_col])
    plt.title(f"Scatterplot between {x_col} and {y_col}")
    plt.show()
else:
    # With fewer than two numeric columns, numeric_cols[1] would raise
    # IndexError; skip the plot instead of aborting before the save step.
    print("\nSkipping scatterplot: fewer than two numeric columns available.")

# 10. Persist the cleaned DataFrame
# index=False leaves the row index out of the written file.
data.to_csv(path_or_buf='cleaned_dataset.csv', index=False)
print("\nCleaned dataset saved as 'cleaned_dataset.csv'")
