In [1]:
# ===============================
# üìä Data Cleaning + Visualization + Notebook Report
# ===============================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nbformat
from nbformat.v4 import new_notebook, new_markdown_cell, new_code_cell
import os


In [2]:

# --- 1Ô∏è‚É£ Load the CSV file ---
input_path = "DATA.csv"  
output_clean_csv = "cleaned_data.csv"
notebook_path = "data_cleaning_analysis.ipynb"


In [3]:
encodings = ["utf-8", "latin1", "cp1252"]
for enc in encodings:
    try:
        df = pd.read_csv(input_path, encoding=enc)
        print(f"‚úÖ Loaded with encoding: {enc}")
        break
    except Exception:
        df = None
if df is None:
    raise RuntimeError("‚ùå Could not read the file with any common encoding.")


‚úÖ Loaded with encoding: utf-8


In [4]:

# --- 2Ô∏è‚É£ Basic Cleaning ---
print("üîπ Original shape:", df.shape)

# Normalize column names
df.columns = [str(c).strip().replace("\n", " ").strip() for c in df.columns]

# Drop duplicates
df.drop_duplicates(inplace=True)

# Strip spaces + convert empty strings to NaN
for col in df.select_dtypes(include=["object"]).columns:
    df[col] = df[col].astype(str).str.strip().replace({"": np.nan, "NA": np.nan, "N/A": np.nan, "none": np.nan})

# Detect date columns
date_cols = [c for c in df.columns if "date" in c.lower() or "time" in c.lower()]
for c in date_cols:
    df[c] = pd.to_datetime(df[c], errors="coerce")

# Convert numeric-looking object columns
for c in df.columns:
    if df[c].dtype == "object":
        coerced = pd.to_numeric(df[c].str.replace(",", "").str.replace("$", ""), errors="coerce")
        if coerced.notna().mean() > 0.6:
            df[c] = coerced

# Fill missing numeric with median, categorical with mode
for c in df.select_dtypes(include=[np.number]).columns:
    df[c].fillna(df[c].median(), inplace=True)

for c in df.select_dtypes(exclude=[np.number]).columns:
    if df[c].mode(dropna=True).shape[0] > 0:
        df[c].fillna(df[c].mode()[0], inplace=True)

# Fill date NaT with earliest date
for c in date_cols:
    if df[c].notna().any():
        df[c].fillna(df[c].min(), inplace=True)

print("‚úÖ Cleaning done. New shape:", df.shape)

üîπ Original shape: (10959, 7)
‚úÖ Cleaning done. New shape: (10959, 7)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[c].fillna(df[c].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[c].fillna(df[c].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always 

In [None]:
# --- 3Ô∏è‚É£ Exploratory Visualizations ---
os.makedirs("figures", exist_ok=True)
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Histograms
for col in numeric_cols[:6]:
    plt.figure(figsize=(6,4))
    df[col].hist(bins=30)
    plt.title(f"Distribution of {col}")
    plt.savefig(f"figures/hist_{col}.png")
    plt.close()

# Boxplots
for col in numeric_cols[:6]:
    plt.figure(figsize=(6,2.5))
    plt.boxplot(df[col].dropna(), vert=False)
    plt.title(f"Boxplot of {col}")
    plt.savefig(f"figures/box_{col}.png")
    plt.close()

# Bar chart for top categorical
cat_cols = [c for c in df.columns if c not in numeric_cols and c not in date_cols]
for c in cat_cols[:3]:
    plt.figure(figsize=(6,4))
    df[c].value_counts().head(10).plot(kind='bar')
    plt.title(f"Top 10 values in {c}")
    plt.savefig(f"figures/bar_{c}.png")
    plt.close()

# Correlation matrix
if len(numeric_cols) >= 2:
    corr = df[numeric_cols].corr()
    plt.figure(figsize=(6,5))
    plt.imshow(corr, cmap="coolwarm", aspect="auto")
    plt.colorbar()
    plt.xticks(range(len(numeric_cols)), numeric_cols, rotation=90)
    plt.yticks(range(len(numeric_cols)), numeric_cols)
    plt.title("Correlation Matrix")
    plt.tight_layout()
    plt.savefig("figures/corr_matrix.png")
    plt.close()

# Save cleaned file
df.to_csv(output_clean_csv, index=False)
print(f"üíæ Cleaned CSV saved as {output_clean_csv}")

üíæ Cleaned CSV saved as cleaned_data.csv


In [6]:
# --- 4Ô∏è‚É£ Auto-create Jupyter Notebook with report ---
nb = new_notebook()

nb.cells = [
    new_markdown_cell("# üßπ Data Cleaning & EDA Report"),
    new_code_cell(f"import pandas as pd\nimport matplotlib.pyplot as plt\ndf = pd.read_csv('{output_clean_csv}')\ndf.head()"),
    new_code_cell(f"df.info()"),
    new_code_cell("# Basic stats\nprint(df.describe())"),
    new_code_cell("# Show saved figures\nimport os\nos.listdir('figures')")
]

with open(notebook_path, "w", encoding="utf-8") as f:
    nbformat.write(nb, f)

print(f"üìò Jupyter Notebook created: {notebook_path}")


üìò Jupyter Notebook created: data_cleaning_analysis.ipynb
