# Data Validation & Quality Checks

In [1]:
import pandas as pd
import numpy as np

# Small sample frame used by every check below. It contains a negative age,
# an age of 200, and one missing salary so the validation cells have
# something to flag.
sample_columns = dict(
    id=[1, 2, 3, 4],
    age=[25, 30, -5, 200],
    salary=[50000, 60000, None, 70000],
    city=['Delhi', 'Mumbai', 'Delhi', 'Delhi'],
)
df = pd.DataFrame(sample_columns)

df

Unnamed: 0,id,age,salary,city
0,1,25,50000.0,Delhi
1,2,30,60000.0,Mumbai
2,3,-5,,Delhi
3,4,200,70000.0,Delhi


## Schema validation

In [None]:
# Show the dtype pandas inferred for each column, as a Series indexed by column name.
df.dtypes

In [None]:
# Expected dtype name for each column of `df`.
expected_types = {
    'id': 'int64',
    'age': 'int64',
    'salary': 'float64',
    'city': 'object'
}

# Per-column schema check: True where the actual dtype name matches the expected one.
# The dict is wrapped in a Series because comparing a Series directly against a
# plain dict does not match entries by column name. `.eq` aligns on the column
# labels, so a column missing from either side shows up as False instead of
# raising on mismatched labels.
actual_types = df.dtypes.astype(str)
actual_types.eq(pd.Series(expected_types))

## Value range checks

In [None]:
# Boolean mask: True where age lies in the closed interval [0, 120].
df['age'].between(0, 120)

In [None]:
# Rows whose age is NOT within [0, 120]; missing ages also count as not in range.
age_out_of_range = ~df['age'].between(0, 120)
df.loc[age_out_of_range]

## Consistency checks

In [None]:
# Boolean mask: True where the city is one of the recognised values.
recognised_cities = ['Delhi', 'Mumbai', 'Chennai']
df['city'].isin(recognised_cities)

In [None]:
# Rows whose city is not in the recognised list.
city_is_recognised = df['city'].isin(['Delhi', 'Mumbai', 'Chennai'])
df.loc[~city_is_recognised]

## Handling invalid data

In [None]:
# Replace implausible ages (below 0 or above 120) with NaN instead of dropping rows.
# `Series.where` keeps values where the condition holds and inserts NaN elsewhere,
# producing a float64 column in one step. Assigning NaN into the int64 column
# through `.loc` depends on implicit upcasting during setitem, which pandas has
# deprecated. This form is also idempotent: re-running the cell leaves the
# column unchanged.
df['age'] = df['age'].where(df['age'].between(0, 120))
df

In [None]:
# Impute missing salaries with the median of the observed salaries.
salary_median = df['salary'].median()
df['salary'] = df['salary'].fillna(salary_median)
df

## Assertions

In [None]:
# Every non-missing age must fall within [0, 120].
known_ages = df['age'].dropna()
assert known_ages.between(0, 120).all()

In [None]:
# Primary-key check: no id value may appear more than once.
assert not df['id'].duplicated().any()

In [None]:
# Completeness check for the columns that must be fully populated.
# 'age' is excluded: the cleaning step in this notebook replaces out-of-range
# ages with NaN and never imputes them, so a frame-wide no-null assertion
# would always fail here.
assert not df.drop(columns='age').isnull().any().any()