# Data Quality Analysis on Adult Dataset
#### Using Great Expectations

##### **Importing necessary lib and dataset**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import gc
from IPython.core.display import display, HTML
warnings.filterwarnings( "ignore",    message=".*should_run_async.*",category=DeprecationWarning)
warnings.filterwarnings("ignore")
for dirname, _, filenames in os.walk(f'E:\Sem 4\DPDQ\HW1\adult'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

  for dirname, _, filenames in os.walk(f'E:\Sem 4\DPDQ\HW1\adult'):
  from IPython.core.display import display, HTML


In [2]:
# !pip install great_expectations

## Data loading and basic data cleaning like data type changing 

In [3]:
column_names = [
    'age', 'workclass', "fnlwgt",'education', 'education-num', 'marital-status', 
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 
    'capital-loss', 'hours-per-week', 'native-country', 'income'
]
input
adult_data = pd.read_csv(r"E:\Sem 4\DPDQ\HW1\adult\adult.data", names=column_names)
adult_test = pd.read_csv(r"E:\Sem 4\DPDQ\HW1\adult\adult.test", names=column_names)
adult_test.drop(index=0,inplace=True)

adult_data_full = pd.concat([adult_data, adult_test], axis=0)

adult_data_full.index = adult_data_full.index.astype(str)

int_columns = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
for col in int_columns:
    adult_data_full[col] = adult_data_full[col].astype(int)

In [4]:
adult_data_full.sample(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
9332,38,Private,91039,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,5178,0,48,United-States,>50K.
25151,40,Self-emp-not-inc,406811,Some-college,10,Married-civ-spouse,Prof-specialty,Wife,White,Female,0,0,40,United-States,>50K
3854,60,Private,83861,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,United-States,<=50K
1189,27,Local-gov,247507,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,35,United-States,<=50K
16125,37,Private,210830,Assoc-voc,11,Divorced,Prof-specialty,Unmarried,White,Female,0,0,38,United-States,<=50K.


In [5]:
adult_data_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48842 entries, 0 to 16281
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int32 
 1   workclass       48842 non-null  object
 2   fnlwgt          48842 non-null  int32 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int32 
 5   marital-status  48842 non-null  object
 6   occupation      48842 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int32 
 11  capital-loss    48842 non-null  int32 
 12  hours-per-week  48842 non-null  int32 
 13  native-country  48842 non-null  object
 14  income          48842 non-null  object
dtypes: int32(6), object(9)
memory usage: 4.8+ MB


In [15]:
string_columns = adult_data_full.select_dtypes(include=['object']).columns

for col in string_columns:
    adult_data_full[col] = adult_data_full[col].str.strip()

In [16]:
string_columns

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country', 'income'],
      dtype='object')

In [17]:
adult_data_full.shape

(48842, 15)

In [18]:
adult_data_full.sample(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
12460,32,Private,180799,Some-college,10,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,60,United-States,>50K
6868,32,Private,128016,HS-grad,9,Married-spouse-absent,Other-service,Unmarried,White,Female,0,0,20,United-States,<=50K
167,46,State-gov,102628,Masters,14,Widowed,Protective-serv,Unmarried,White,Male,0,0,40,United-States,<=50K
366,56,Self-emp-not-inc,183081,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,45,United-States,<=50K
21292,62,Self-emp-inc,134768,Masters,14,Married-civ-spouse,Sales,Husband,White,Male,0,0,35,United-States,<=50K


# **Great Expectation**

In [19]:
import great_expectations as gx

In [20]:
from great_expectations import get_context  

# Initialize the DataContext
context = get_context()

# Now you can create your dataset with the active context
adult_data_gx = gx.dataset.PandasDataset(adult_data_full)

In [21]:
adult_data_gx.expect_column_values_to_be_between("age",min_value=1, max_value=100)

{
  "success": true,
  "result": {
    "element_count": 48842,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [22]:
adult_data_gx.expect_column_values_to_be_in_set("workclass", ('Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov', 'Local-gov', 'State-gov', 'Without-pay', 'Never-worked',"?"))

{
  "success": true,
  "result": {
    "element_count": 48842,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [23]:
adult_data_gx.expect_column_values_to_be_in_set("education", ('Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school', 'Assoc-acdm', 'Assoc-voc', '9th', '7th-8th', '12th', 'Masters', '1st-4th', '10th', 'Doctorate', '5th-6th', 'Preschool'))


{
  "success": true,
  "result": {
    "element_count": 48842,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [24]:
adult_data_gx.expect_column_values_to_be_in_set("marital-status", ('Married-civ-spouse', 'Divorced', 'Never-married', 'Separated', 'Widowed', 'Married-spouse-absent', 'Married-AF-spouse'))

{
  "success": true,
  "result": {
    "element_count": 48842,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [25]:
adult_data_gx.expect_column_values_to_be_in_set("occupation", ("?",'Tech-support', 'Craft-repair', 'Other-service', 'Sales', 'Exec-managerial', 'Prof-specialty', 'Handlers-cleaners', 'Machine-op-inspct', 'Adm-clerical', 'Farming-fishing', 'Transport-moving', 'Priv-house-serv', 'Protective-serv', 'Armed-Forces'))

{
  "success": true,
  "result": {
    "element_count": 48842,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [26]:
adult_data_gx.expect_column_values_to_be_in_set("relationship", ('Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative', 'Unmarried'))

{
  "success": true,
  "result": {
    "element_count": 48842,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [27]:
adult_data_gx.expect_column_values_to_be_in_set("race", ('White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black'))

{
  "success": true,
  "result": {
    "element_count": 48842,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [28]:
adult_data_gx.expect_column_values_to_be_in_set("sex", ('Female', 'Male'))

{
  "success": true,
  "result": {
    "element_count": 48842,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [29]:
adult_data_gx.expect_column_values_to_be_in_set("native-country", ("?",'United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada', 'Germany', 'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', 'Greece', 'South', 'China', 'Cuba', 'Iran', 'Honduras', 'Philippines', 'Italy', 'Poland', 'Jamaica', 'Vietnam', 'Mexico', 'Portugal', 'Ireland', 'France', 'Dominican-Republic', 'Laos', 'Ecuador', 'Taiwan', 'Haiti', 'Columbia', 'Hungary', 'Guatemala', 'Nicaragua', 'Scotland', 'Thailand', 'Yugoslavia', 'El-Salvador', 'Trinadad&Tobago', 'Peru', 'Hong', 'Holand-Netherlands'))


{
  "success": true,
  "result": {
    "element_count": 48842,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [31]:
adult_data_gx.expect_column_values_to_be_in_set("income", ('>50K', '<=50K','>50K,', '<=50K.'))

{
  "success": false,
  "result": {
    "element_count": 48842,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 3846,
    "unexpected_percent": 7.874370418901765,
    "unexpected_percent_total": 7.874370418901765,
    "unexpected_percent_nonmissing": 7.874370418901765,
    "partial_unexpected_list": [
      ">50K.",
      ">50K.",
      ">50K.",
      ">50K.",
      ">50K.",
      ">50K.",
      ">50K.",
      ">50K.",
      ">50K.",
      ">50K.",
      ">50K.",
      ">50K.",
      ">50K.",
      ">50K.",
      ">50K.",
      ">50K.",
      ">50K.",
      ">50K.",
      ">50K.",
      ">50K."
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}