In [5]:
import pandas as pd
import numpy as np
from tabulate import tabulate

In [6]:
# Load the invoice table from CSV, decoding the file as latin-1.
Invoices = pd.read_csv(
    "../datasets/Invoices.csv",
    encoding="latin-1",
)

# Map each table's variable name to the columns that should be profiled.
columns = dict(Invoices=["Quantity", "UnitPrice"])

def profile_quantitative(df, file_name, col_name, outlier_sigma=3):
    """Build a one-row summary profile for a numeric column of ``df``.

    Args:
        df: DataFrame that contains ``col_name``.
        file_name: Label stored under the "Tập tin" key of the result.
        col_name: Name of the column to profile.
        outlier_sigma: Number of standard deviations away from the mean
            beyond which a value is counted as an outlier. Defaults to 3.

    Returns:
        A dict with the row count, missing percentage, distinct count,
        distinct/row ratio, min, max, mean, median, standard deviation and
        outlier percentage. Min/max/mean/median/std are ``None`` when the
        column has no non-null values. All ratios are relative to the total
        row count (missing rows included) and are reported as 0.0 when the
        frame has no rows.
    """
    total = len(df)
    series = df[col_name]
    missing = series.isna().sum()
    distinct = series.nunique(dropna=True)

    def _ratio(count, scale, digits):
        # An empty frame has no rows to divide by; report 0.0 rather than
        # raising ZeroDivisionError.
        if not total:
            return 0.0
        return round(count / total * scale, digits)

    values = series.dropna()
    if values.empty:
        min_val = max_val = mean_val = median_val = std_val = None
    else:
        min_val = values.min()
        max_val = values.max()
        mean_val = values.mean()
        median_val = values.median()
        std_val = values.std()

    # Outliers are only meaningful with a usable spread: skip when std is
    # None (no data), 0 (constant column) or NaN (single observation).
    if std_val and not np.isnan(std_val):
        spread = outlier_sigma * std_val
        is_outlier = (values < (mean_val - spread)) | (values > (mean_val + spread))
        outliers = is_outlier.sum()
    else:
        outliers = 0

    return {
        "Tập tin": file_name,
        "Cột": col_name,
        "Tổng": total,
        "Thiếu (%)": _ratio(missing, 100, 2),
        "Giá trị riêng biệt": distinct,
        "Độ phân biệt": _ratio(distinct, 1, 3),
        "Min": min_val,
        "Max": max_val,
        "Trung bình": round(mean_val, 2) if mean_val is not None else None,
        "Trung vị": round(median_val, 2) if median_val is not None else None,
        "Độ lệch chuẩn": round(std_val, 2) if std_val is not None else None,
        "Giá trị ngoại lai (%)": _ratio(outliers, 100, 3),
    }

# Collect one profile record per (table, column) pair declared in `columns`.
results = []
for file, cols in columns.items():
    # Resolve the DataFrame from its global variable name once per table.
    # A namespace lookup is used instead of eval() so the key can only ever
    # name an existing global, never execute an arbitrary expression.
    df = globals()[file]
    for col in cols:
        results.append(profile_quantitative(df, file, col))

quantitative_profile = pd.DataFrame(results)

# Render the profile as a grid-style text table and print it.
table_text = tabulate(quantitative_profile, headers='keys', tablefmt='grid', showindex=False)
print(table_text)

+-----------+-----------+--------+-------------+----------------------+----------------+----------+-------+--------------+------------+-----------------+-------------------------+
| Tập tin   | Cột       |   Tổng |   Thiếu (%) |   Giá trị riêng biệt |   Độ phân biệt |      Min |   Max |   Trung bình |   Trung vị |   Độ lệch chuẩn |   Giá trị ngoại lai (%) |
| Invoices  | Quantity  | 541909 |           0 |                  722 |          0.001 | -80995   | 80995 |         9.55 |       3    |          218.08 |                   0.064 |
+-----------+-----------+--------+-------------+----------------------+----------------+----------+-------+--------------+------------+-----------------+-------------------------+
| Invoices  | UnitPrice | 541909 |           0 |                 1630 |          0.003 | -11062.1 | 38970 |         4.61 |       2.08 |           96.76 |                   0.069 |
+-----------+-----------+--------+-------------+----------------------+----------------+----------+-