In [1]:
import pandas as pd

# Locations of the two CSV snapshots that the rest of the notebook compares.
csv1, csv2 = "file1.csv", "file2.csv"

# Read each snapshot into its own DataFrame.
data1 = pd.read_csv(csv1)
data2 = pd.read_csv(csv2)

# Sanity check: the two frames must expose the same set of column names
# (column order is ignored). An empty symmetric difference means they match.
assert not set(data1.columns) ^ set(data2.columns), "Column names do not match."


FileNotFoundError: [Errno 2] No such file or directory: 'file1.csv'

In [None]:
from scipy.stats import ks_2samp

# Compare distributions of numerical features with the two-sample
# Kolmogorov-Smirnov test, one column at a time.
numerical_columns = data1.select_dtypes(include=["float64", "int64"]).columns
for column in numerical_columns:
    # Drop missing values before testing. NaNs handed to ks_2samp lead to a
    # NaN or distorted p-value, and `NaN < 0.05` is False, so real drift
    # would be silently reported as "no drift".
    sample1 = data1[column].dropna()
    sample2 = data2[column].dropna()

    # The test is not meaningful when either side has no observations left.
    if sample1.empty or sample2.empty:
        print(f"Feature: {column}, skipped (no non-missing values in at least one dataset).")
        continue

    stat, p_value = ks_2samp(sample1, sample2)
    print(f"Feature: {column}, KS Statistic: {stat}, p-value: {p_value}")
    if p_value < 0.05:
        print(f"--> Drift detected in column '{column}' (p-value < 0.05).")


In [None]:
from scipy.stats import chi2_contingency

# Compare distributions of categorical features with a chi-square test on a
# (category x dataset) contingency table, one column at a time.
categorical_columns = data1.select_dtypes(include=["object", "category"]).columns
for column in categorical_columns:
    # chi2_contingency expects observed frequencies (raw counts). Feeding it
    # proportions makes every dataset look like it has ~1 observation in
    # total, which yields a meaningless statistic and p-value.
    contingency_table = pd.concat(
        [data1[column].value_counts(),
         data2[column].value_counts()],
        axis=1,
        keys=["Dataset1", "Dataset2"]
    ).fillna(0)  # a category absent from one dataset was observed 0 times

    # If a dataset has no non-missing values for this column, its column of
    # the table sums to zero and the expected frequencies cannot be computed.
    if contingency_table.empty or (contingency_table.sum(axis=0) == 0).any():
        print(f"Feature: {column}, skipped (no non-missing values in at least one dataset).")
        continue

    stat, p_value, _, _ = chi2_contingency(contingency_table)
    print(f"Feature: {column}, Chi-square Statistic: {stat}, p-value: {p_value}")
    if p_value < 0.05:
        print(f"--> Drift detected in column '{column}' (p-value < 0.05).")


In [None]:
import pandas as pd
from evidently.report import Report
from evidently.metrics import DataDriftMetric

# Read the reference snapshot (file1.csv) and the current snapshot (file2.csv),
# in that order.
data1, data2 = (pd.read_csv(path) for path in ("file1.csv", "file2.csv"))

# Build the drift report and evaluate the current data against the reference.
# NOTE(review): confirm that `DataDriftMetric` is exported by the installed
# evidently version; the recorded output of this cell mentions
# `evidently.dashboard`, which this code does not import, so it looks stale.
report = Report(metrics=[DataDriftMetric()])
report.run(reference_data=data1, current_data=data2)

# Persist the result as a standalone HTML page and tell the reader where it is.
report.save_html("data_drift_report.html")
print("Data drift report saved as 'data_drift_report.html'")


ModuleNotFoundError: No module named 'evidently.dashboard'

In [None]:
from deepchecks.tabular import Dataset
from deepchecks.tabular.checks import TrainTestFeatureDrift

# Wrap both DataFrames as unlabeled Deepchecks datasets (ds1 from data1,
# ds2 from data2).
ds1, ds2 = (Dataset(frame, label=None) for frame in (data1, data2))

# Instantiate the feature drift check, run it on the pair, and render the result.
check = TrainTestFeatureDrift()
result = check.run(ds1, ds2)
result.show()


In [None]:
import matplotlib.pyplot as plt

# Overlay the two datasets' histograms for every numerical feature.
for column in numerical_columns:
    # Derive one value range from both datasets so the two histograms share
    # identical bin edges; independently chosen bins make overlaid bars
    # impossible to compare.
    pooled = pd.concat([data1[column], data2[column]]).dropna()
    if pooled.empty:
        continue  # nothing to draw for this column
    shared_range = (pooled.min(), pooled.max())

    fig, ax = plt.subplots()
    for label, frame in (("Dataset1", data1), ("Dataset2", data2)):
        values = frame[column].dropna()
        if values.empty:
            continue  # avoid normalising an empty histogram
        # density=True keeps the comparison fair when the datasets differ in
        # size, matching the normalised proportions used for categoricals.
        values.hist(ax=ax, alpha=0.5, label=label, bins=30,
                    range=shared_range, density=True)
    ax.set_title(f"Histogram for {column}")
    ax.set_xlabel(column)
    ax.set_ylabel("Density")
    ax.legend()
    plt.show()


In [None]:
# Draw side-by-side bars of category proportions for every categorical feature.
for column in categorical_columns:
    # Share of each category within each dataset; a category that is absent
    # from one dataset has proportion 0 there.
    proportions = pd.concat([data1[column].value_counts(normalize=True),
                             data2[column].value_counts(normalize=True)],
                            axis=1,
                            keys=["Dataset1", "Dataset2"]
                            ).fillna(0)
    if proportions.empty:
        continue  # no non-missing values in either dataset

    # DataFrame.plot opens its own figure unless it is given an Axes, so a
    # preceding bare plt.figure() would leave an empty extra figure behind.
    # Create the axes once and draw into them explicitly.
    fig, ax = plt.subplots()
    proportions.plot(kind="bar", ax=ax)
    ax.set_title(f"Bar Chart for {column}")
    ax.set_xlabel(column)
    ax.set_ylabel("Proportion")
    plt.show()
