# Data Cleaning & Analysis Template
This notebook provides a structured approach to data cleaning and preprocessing for both tabular and geospatial datasets.

In [13]:
import pandas as pd
import numpy as np
import geopandas as gpd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

## 1. Load Data

In [None]:
# ----------------------
# 1. Load Data
# ----------------------
DEFAULT_DATA_DIR = r"YOUR PATH TO ALL DATASETS"

def load_data(file_name, is_geospatial=False, data_dir=None):
    """Load a dataset by file name and return it as a DataFrame or GeoDataFrame.

    Parameters
    ----------
    file_name : str
        Name of the file; it is joined onto the data directory with
        ``os.path.join``.
    is_geospatial : bool, default False
        When True the file is read with ``gpd.read_file``; otherwise it is
        read with ``pd.read_csv``. Mirrors the flag taken by ``save_data``.
    data_dir : str or None, default None
        Directory to load from. ``None`` falls back to ``DEFAULT_DATA_DIR``.

    Returns
    -------
    pandas.DataFrame or geopandas.GeoDataFrame
        Whatever the selected reader returns.
    """
    # Resolve the directory at call time so edits to DEFAULT_DATA_DIR in a
    # later cell are picked up without redefining this function.
    base_dir = DEFAULT_DATA_DIR if data_dir is None else data_dir
    file_path = os.path.join(base_dir, file_name)
    if is_geospatial:
        return gpd.read_file(file_path)
    return pd.read_csv(file_path)




In [None]:
# ----------------------
# 2. Load Geospatial Data (Optional)
# ----------------------
def load_geodata(file_name):
    """Read a geospatial file from ``DEFAULT_DATA_DIR`` into a GeoDataFrame.

    ``file_name`` is joined onto ``DEFAULT_DATA_DIR`` and the resulting path
    is handed to ``gpd.read_file``; its result is returned unchanged.
    """
    return gpd.read_file(os.path.join(DEFAULT_DATA_DIR, file_name))

In [None]:
# ----------------------
# 3. Example Usage
# ----------------------
if __name__ == "__main__":
    # Result key -> file name. Both mappings are declared up front so the
    # full list of inputs is visible in one place.
    datasets = {
        "df_1": "dataset_1.csv",
        "df_2": "dataset_2.csv",
        "df_3": "dataset_3.csv",
        "df_4": "dataset_4.csv",
    }
    # Geospatial inputs (optional).
    geodatasets = {
        "geo_df_1": "geodata_1.shp",
        "geo_df_2": "geodata_2.geojson",
    }

    # Tabular files: one DataFrame per entry, stored under the same key.
    dfs = {label: load_data(csv_name) for label, csv_name in datasets.items()}
    print("Datasets loaded successfully.")

    # Geospatial files: one GeoDataFrame per entry, stored under the same key.
    geodfs = {
        label: load_geodata(geo_name)
        for label, geo_name in geodatasets.items()
    }
    print("Geospatial datasets loaded successfully.")

## 2. Basic Inspection

In [None]:
def inspect_data(df):
    """Print a quick textual overview of a DataFrame.

    Prints, in order: the ``df.info()`` report, ``df.describe(include='all')``,
    the per-column count of missing values, and the first five rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is only read, never modified.

    Returns
    -------
    None
    """
    print("\nDataset Info:")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    df.info()
    print("\nSummary Statistics:")
    print(df.describe(include='all'))
    print("\nMissing Values:")
    print(df.isnull().sum())
    print("\nFirst 5 Rows:")
    print(df.head())

## 3. Handle Missing Values

In [None]:
def handle_missing_values(df, method='ffill'):
    """Return a copy of ``df`` with missing values dropped or filled.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    method : {'ffill', 'pad', 'bfill', 'backfill', 'drop'}, default 'ffill'
        ``'drop'`` removes every row containing a missing value.
        ``'ffill'``/``'pad'`` propagate the last valid value forward.
        ``'bfill'``/``'backfill'`` propagate the next valid value backward.
        Forward fill leaves leading gaps untouched and backward fill leaves
        trailing gaps untouched, because there is no value to propagate.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == 'drop':
        return df.dropna()
    # DataFrame.fillna(method=...) is deprecated in recent pandas releases;
    # the dedicated ffill()/bfill() methods are the supported spelling.
    if method in ('ffill', 'pad'):
        return df.ffill()
    if method in ('bfill', 'backfill'):
        return df.bfill()
    raise ValueError(
        f"Unsupported method {method!r}; expected 'drop', 'ffill', 'pad', "
        "'bfill' or 'backfill'."
    )

## 4. Handle Duplicates

In [None]:
def remove_duplicates(df):
    """Return ``df`` without repeated rows.

    Rows are compared across all columns; the first occurrence of each
    distinct row is kept and its original index label is preserved.
    """
    deduplicated = df.drop_duplicates(subset=None, keep='first', ignore_index=False)
    return deduplicated

## 5. Data Type Conversion

In [None]:
def convert_data_types(df, date_columns=None, category_columns=None):
    """Return a copy of ``df`` with the named columns cast to new dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first, so the caller's object is left
        untouched and re-running the cell always starts from the same input.
    date_columns : str or list of str, optional
        Column(s) parsed with ``pd.to_datetime``.
    category_columns : str or list of str, optional
        Column(s) cast with ``astype('category')``.

    Returns
    -------
    pandas.DataFrame
        The converted copy.
    """
    converted = df.copy()

    # A bare string would otherwise be iterated character by character.
    if isinstance(date_columns, str):
        date_columns = [date_columns]
    if isinstance(category_columns, str):
        category_columns = [category_columns]

    if date_columns:
        for col in date_columns:
            converted[col] = pd.to_datetime(converted[col])
    if category_columns:
        for col in category_columns:
            converted[col] = converted[col].astype('category')
    return converted

## 6. Detect Outliers

In [None]:
def detect_outliers(df, column):
    """Show a horizontal box plot of ``df[column]`` for eyeballing outliers.

    Nothing is computed or returned; the function only draws an 8x4 inch
    figure titled with the column name and displays it.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x=df[column], ax=ax)
    ax.set_title(f'Outlier Detection - {column}')
    plt.show()

## 7. Feature Engineering

In [None]:
def create_features(df, numerator='column1', denominator='column2',
                    feature_name='new_feature', epsilon=1e-9):
    """Return a copy of ``df`` with a ratio feature added when possible.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first, so the caller's object is not
        modified.
    numerator, denominator : str
        Names of the columns to divide. The defaults are the template's
        placeholder names.
    feature_name : str, default 'new_feature'
        Name of the column that receives ``numerator / (denominator + epsilon)``.
    epsilon : float, default 1e-9
        Small offset added to the denominator to avoid division by zero.

    Returns
    -------
    pandas.DataFrame
        The copy, with ``feature_name`` added if both source columns exist;
        otherwise an unchanged copy.
    """
    enriched = df.copy()
    if numerator in enriched.columns and denominator in enriched.columns:
        # epsilon keeps a zero denominator from raising or producing inf.
        enriched[feature_name] = enriched[numerator] / (enriched[denominator] + epsilon)
    return enriched

## 8. Normalize/Scale Data

In [None]:
def normalize_data(df, columns):
    """Return a copy of ``df`` with ``columns`` standardized by ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first, so the raw values remain available
        to the caller after scaling.
    columns : str or sequence of str
        Column(s) to scale. An empty sequence returns an unchanged copy.

    Returns
    -------
    pandas.DataFrame
        The copy, with the selected columns replaced by the scaler's
        ``fit_transform`` output.
    """
    scaled = df.copy()

    # A bare string would select a 1-D Series, which the scaler rejects.
    columns = [columns] if isinstance(columns, str) else list(columns)
    if not columns:
        # Nothing to scale; fitting a scaler on zero features would raise.
        return scaled

    scaler = StandardScaler()
    scaled[columns] = scaler.fit_transform(scaled[columns])
    return scaled

## 9. Geospatial Data Cleaning

In [None]:
def clean_geospatial_data(gdf):
    """Reproject to EPSG:4326, drop rows without geometry, and add centroids.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Input frame with a CRS set. It is not modified; every step below
        works on a new object.

    Returns
    -------
    geopandas.GeoDataFrame
        Frame in EPSG:4326 with missing-geometry rows removed and a
        ``'centroid'`` column, also expressed in EPSG:4326.

    Raises
    ------
    ValueError
        If ``gdf`` has no CRS, since it cannot be reprojected.
    """
    if gdf.crs is None:
        raise ValueError(
            "GeoDataFrame has no CRS; assign one with set_crs() before cleaning."
        )

    gdf = gdf.to_crs(epsg=4326)
    # Use the active geometry column's real name rather than assuming 'geometry'.
    gdf = gdf.dropna(subset=[gdf.geometry.name])

    if gdf.geometry.is_empty.all():
        # No rows, or only empty geometries: there is no extent from which to
        # estimate a projected CRS, and every centroid is empty anyway.
        gdf['centroid'] = gdf.geometry.centroid
        return gdf

    # Centroids computed directly on longitude/latitude degrees are planar
    # approximations that GeoPandas warns about. Compute them in a projected
    # (UTM) CRS estimated from the data, then express them in EPSG:4326 again.
    projected = gdf.geometry.to_crs(gdf.estimate_utm_crs())
    gdf['centroid'] = projected.centroid.to_crs(epsg=4326)
    return gdf

## 10. Save Cleaned Data

In [None]:
def save_data(df, file_path, is_geospatial=False):
    """Write ``df`` to ``file_path`` as CSV or, for geospatial data, GeoJSON.

    Parameters
    ----------
    df : pandas.DataFrame or geopandas.GeoDataFrame
        Data to write. It is not modified.
    file_path : str
        Destination path.
    is_geospatial : bool, default False
        False writes a CSV without the index. True writes GeoJSON through
        ``to_file``; geometry columns other than the active one (for example
        an added ``'centroid'`` column) are written as WKT text.

    Returns
    -------
    None
    """
    if not is_geospatial:
        df.to_csv(file_path, index=False)
        return

    # to_file supports a single geometry column, so any additional geometry
    # columns are serialized to WKT strings on a copy before writing.
    active_geometry = df.geometry.name
    extra_geometry = [
        col for col in df.columns
        if col != active_geometry and str(df[col].dtype) == 'geometry'
    ]
    if extra_geometry:
        df = df.copy()
        for col in extra_geometry:
            df[col] = df[col].to_wkt()
    df.to_file(file_path, driver='GeoJSON')

## 11. Example Usage: Full Cleaning Pipeline

In [None]:
if __name__ == "__main__":
    file_path = "your_dataset.csv"
    is_geospatial = False

    # Pick the reader that matches the data kind: load_geodata for
    # geospatial files, load_data for CSV files.
    if is_geospatial:
        df = load_geodata(file_path)
    else:
        df = load_data(file_path)

    inspect_data(df)
    df = handle_missing_values(df, method='ffill')
    df = remove_duplicates(df)
    df = convert_data_types(df, date_columns=['date_column'], category_columns=['category_column'])
    df = create_features(df)
    df = normalize_data(df, columns=['numeric_column'])

    if is_geospatial:
        df = clean_geospatial_data(df)

    # save_data writes GeoJSON for geospatial data, so give that output a
    # matching extension instead of ".csv".
    output_path = "cleaned_data.geojson" if is_geospatial else "cleaned_data.csv"
    save_data(df, output_path, is_geospatial)
    print("Data cleaning complete and saved.")