This script will:
Load each dataset (customers.csv, transactions.csv, etc.)

Check for:

❌ Missing values (null, NaN)

❌ Duplicate rows

❌ Invalid values (e.g., zero or negative amounts and revenue; date-format checks are mentioned as an example but are not implemented in the code below)

Save a quality report (one row per dataset) to data_quality_report.csv

In [1]:
# Step 1: Import libraries and define file list
import pandas as pd

# List of dataset files to check
files = ['customers.csv', 'transactions.csv', 'employees.csv', 'reports.csv']

# Column order of the report. Fixing it up front keeps the header row in the
# saved CSV even when none of the datasets could be loaded.
REPORT_COLUMNS = [
    'dataset',
    'total_rows',
    'null_values',
    'duplicate_rows',
    'invalid_amounts',
    'invalid_revenue',
]


def dataset_name_from(file):
    """Return *file* with a trailing ``.csv`` extension removed.

    Only the suffix is stripped; a ``.csv`` occurring elsewhere in the name
    is left alone.
    """
    suffix = '.csv'
    return file[:-len(suffix)] if file.endswith(suffix) else file


def count_non_positive(column):
    """Count the entries of *column* that are not valid positive numbers.

    An entry is counted when it is ``<= 0`` or when it is present but cannot
    be parsed as a number (e.g. the text ``"abc"``). Missing entries are not
    counted here; they are already covered by the null count.

    Args:
        column: pandas Series holding the values to check.

    Returns:
        Number of invalid entries as a plain ``int``.
    """
    # Coerce instead of comparing directly: a column read as text would make
    # `column <= 0` raise TypeError and the dataset would be lost entirely.
    numeric = pd.to_numeric(column, errors='coerce')
    unparseable = numeric.isna() & column.notna()
    non_positive = numeric <= 0
    return int((unparseable | non_positive).sum())


def summarize_dataset(df, dataset_name):
    """Run the quality checks on *df* and return one report row.

    Args:
        df: DataFrame holding the dataset.
        dataset_name: Label stored in the ``dataset`` field of the row.

    Returns:
        Dict with the keys listed in ``REPORT_COLUMNS``. The custom checks
        default to 0 when the dataset has no ``amount`` / ``total_revenue``
        column.
    """
    invalid_amounts = 0
    if 'amount' in df.columns:
        invalid_amounts = count_non_positive(df['amount'])

    invalid_revenue = 0
    if 'total_revenue' in df.columns:
        invalid_revenue = count_non_positive(df['total_revenue'])

    return {
        'dataset': dataset_name,
        'total_rows': len(df),
        # int(...) turns the numpy scalars into plain Python integers.
        'null_values': int(df.isnull().sum().sum()),
        'duplicate_rows': int(df.duplicated().sum()),
        'invalid_amounts': invalid_amounts,
        'invalid_revenue': invalid_revenue,
    }


# Create an empty list to collect the results
quality_report = []

# Step 2: Loop through each file and run checks
for file in files:
    # Only loading is best-effort: a missing, unreadable, empty or malformed
    # file is reported and skipped. pandas' EmptyDataError and ParserError
    # are ValueError subclasses, so they are covered here.
    try:
        df = pd.read_csv(file)
    except (OSError, ValueError) as e:
        print(f"Failed to process {file}: {e}")
        continue

    quality_report.append(summarize_dataset(df, dataset_name_from(file)))

# Step 3: Save and show the report
report_df = pd.DataFrame(quality_report, columns=REPORT_COLUMNS)

# Show the report. A bare `report_df` expression is only rendered by the
# notebook when it is the last statement of the cell, so print it explicitly.
print(report_df)

report_df.to_csv('data_quality_report.csv', index=False)
print("Data quality report saved as data_quality_report.csv")

Data quality report saved as data_quality_report.csv
