In [1]:
!pip install -q pyspark==3.2.0
from pyspark.sql import SparkSession, Row
!pyspark --version
%env SPARK_VERSION=3.3

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.2.0
      /_/
                        
Using Scala version 2.12.15, OpenJDK 64-Bit Server VM, 11.0.26
Branch HEAD
Compiled by user ubuntu on 2021-10-06T12:46:30Z
Revision 5d45a415f3a29898d92380380cfd82bfc7f579ea
Url https://github.com/apache/spark
Type --help for more information.
env: SPARK_VERSION=3.3


In [2]:
!pip install pydeequ



In [3]:
import pydeequ
from pydeequ.profiles import *
import os
import kagglehub
import pandas as pd
spark = (SparkSession
    .builder
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate())

In [9]:
import pandas as pd

In [8]:
# Download latest version from Kaggle
dataset_path = kagglehub.dataset_download("siddheshmaheshwari/adult")
print("Path to dataset files:", dataset_path)

# Load data (adjust file names based on what's available in the dataset)
train_file = os.path.join(dataset_path, "adult.data")
test_file = os.path.join(dataset_path, "adult.test")

Downloading from https://www.kaggle.com/api/v1/datasets/download/siddheshmaheshwari/adult?dataset_version_number=2...


100%|██████████| 708k/708k [00:00<00:00, 1.14MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/siddheshmaheshwari/adult/versions/2





In [10]:
column_names = [
    'age', 'workclass', "fnlwgt",'education', 'education-num', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'capital-gain',
    'capital-loss', 'hours-per-week', 'native-country', 'income'
]

adult_data = pd.read_csv(train_file, names=column_names)
adult_test = pd.read_csv(test_file, names=column_names)
adult_test.drop(index=0,inplace=True)

adult_data_full = pd.concat([adult_data, adult_test], axis=0)

adult_data_full.index = adult_data_full.index.astype(str)

int_columns = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
for col in int_columns:
    adult_data_full[col] = adult_data_full[col].astype(int)

In [11]:

# Save the DataFrame to a CSV file in Google Drive
from google.colab import drive
drive.mount('/content/drive')

csv_file_path = '/content/drive/MyDrive/adult_data_full.csv'  # Specify the desired path
adult_data_full.to_csv(csv_file_path, index=False) # Set index=False to avoid saving the index

print(f"Data saved to: {csv_file_path}")


Mounted at /content/drive
Data saved to: /content/drive/MyDrive/adult_data_full.csv


In [14]:
# Cell 2: Import libraries
import json
from pyspark.sql import SparkSession
import pydeequ
from pydeequ.profiles import ColumnProfilerRunner
from pydeequ.suggestions import ConstraintSuggestionRunner, DEFAULT
from pydeequ.checks import Check, CheckLevel
from pydeequ.verification import VerificationSuite, VerificationResult
from pydeequ.suggestions import *
from pydeequ.analyzers import (
    AnalysisRunner,
    AnalyzerContext,
    Size,
    Completeness,
    Mean,
    ApproxCountDistinct,
    CountDistinct
)

In [4]:
# Cell 3: Create Spark session
spark = (SparkSession
    .builder
    .appName("PyDeequ Analysis")
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate())

In [5]:
# Cell 4: Load dataset
# Replace the path with the actual location of your adult_data file
adult_data = spark.read.option("header", "true").csv("/content/drive/MyDrive/adult_data_full.csv")
adult_data.show(5)

+---+-----------------+------+----------+-------------+-------------------+------------------+--------------+------+-------+------------+------------+--------------+--------------+------+
|age|        workclass|fnlwgt| education|education-num|     marital-status|        occupation|  relationship|  race|    sex|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+-----------------+------+----------+-------------+-------------------+------------------+--------------+------+-------+------------+------------+--------------+--------------+------+
| 39|        State-gov| 77516| Bachelors|           13|      Never-married|      Adm-clerical| Not-in-family| White|   Male|        2174|           0|            40| United-States| <=50K|
| 50| Self-emp-not-inc| 83311| Bachelors|           13| Married-civ-spouse|   Exec-managerial|       Husband| White|   Male|           0|           0|            13| United-States| <=50K|
| 38|          Private|215646|   HS-grad|            9|     

In [7]:
# Cell 5: Column Profiling
print("=== Running Column Profiling ===")
profile_result = ColumnProfilerRunner(spark).onData(adult_data).run()

# Print profiling results
for col, profile in profile_result.profiles.items():
    print(f"Column: {col}")
    print(f"  Completeness: {profile.completeness}")
    print(f"  Distinct values: {profile.approximateNumDistinctValues}")
    print(f"  Data type: {profile.dataType}")
    print()

=== Running Column Profiling ===
Column: hours-per-week
  Completeness: 1.0
  Distinct values: 99
  Data type: Integral

Column: capital-gain
  Completeness: 1.0
  Distinct values: 130
  Data type: Integral

Column: education-num
  Completeness: 1.0
  Distinct values: 16
  Data type: Integral

Column: marital-status
  Completeness: 1.0
  Distinct values: 7
  Data type: String

Column: age
  Completeness: 1.0
  Distinct values: 76
  Data type: Integral

Column: sex
  Completeness: 1.0
  Distinct values: 2
  Data type: String

Column: relationship
  Completeness: 1.0
  Distinct values: 6
  Data type: String

Column: education
  Completeness: 1.0
  Distinct values: 16
  Data type: String

Column: income
  Completeness: 1.0
  Distinct values: 4
  Data type: String

Column: race
  Completeness: 1.0
  Distinct values: 5
  Data type: String

Column: native-country
  Completeness: 1.0
  Distinct values: 39
  Data type: String

Column: fnlwgt
  Completeness: 1.0
  Distinct values: 27906
  Data 

In [12]:
# Cell 6: Generate Constraint Suggestions
print("=== Generating Constraint Suggestions ===")
suggestion_result = (ConstraintSuggestionRunner(spark)
    .onData(adult_data)
    .addConstraintRule(DEFAULT())
    .run())

# Print constraint suggestions in JSON format
print(json.dumps(suggestion_result, indent=2))

=== Generating Constraint Suggestions ===
{
  "constraint_suggestions": [
    {
      "constraint_name": "ComplianceConstraint(Compliance('hours-per-week' has value range '40', '50', '45', '60', '35', '20', '30', '55', '25', '48', '38', '15', '70', '10', '32', '65', '24', '42', '36', '44', '16', '12', '37', '43', '8', '80', '52', '56', '28', '99', '18', '46', '72', '75', '5', '6', '4', '47', '84', '39', '54', '22', '33', '3', '41', '14', '2', '34', '21', '7', '27', '17', '90', '26', '23', '53', '49', '58', '13', '1', '9', '62', '66', '64', '11', '51', '57', '19', '85', '68', '63', '29', '98', '78', '31', '96', '77', '59', '67', '86', '76', '61', '88', '73', '91', '74', '92', '81', '89', '97', '95', '69', '82', '87', '79', '94',`hours-per-week` IN ('40', '50', '45', '60', '35', '20', '30', '55', '25', '48', '38', '15', '70', '10', '32', '65', '24', '42', '36', '44', '16', '12', '37', '43', '8', '80', '52', '56', '28', '99', '18', '46', '72', '75', '5', '6', '4', '47', '84', '39', '54', 

In [18]:
# Cell 7: Verify Data Quality
print("=== Verifying Data Quality ===")
check = (Check(spark, CheckLevel.Warning, "Adult Data Quality Check")
    .hasSize(lambda x: x >= 30000)  # Adjust threshold based on your dataset size
    .isComplete("age")
    .isComplete("income")
    .isNonNegative("hours_per_week")
    .isContainedIn("income", [">50K", "<=50K"]))

verification_result = (VerificationSuite(spark)
    .onData(adult_data)
    .addCheck(check)
    .run())
verification_result_df = VerificationResult.checkResultsAsDataFrame(spark, verification_result)
verification_result_df.show(truncate=False)

=== Verifying Data Quality ===
+------------------------+-----------+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+----------------------------------------------------+
|check                   |check_level|check_status|constraint                                                                                                                                                |constraint_status|constraint_message                                  |
+------------------------+-----------+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+----------------------------------------------------+
+------------------------+-----------+------------+------------------------------------------------------------------------------------

In [20]:
# Cell 8: Analyze Data Metrics
print("=== Analyzing Data Metrics ===")
analysis_result = (AnalysisRunner(spark)
    .onData(adult_data)
    .addAnalyzer(Size())
    .addAnalyzer(Completeness("income"))
    .addAnalyzer(Completeness("workclass"))
    .addAnalyzer(Mean("age"))
    .addAnalyzer(ApproxCountDistinct("education"))
    .addAnalyzer(CountDistinct("income"))
    .run())

# Display analysis results
metrics_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysis_result)
metrics_df.show(truncate=False)

=== Analyzing Data Metrics ===
+-------+---------+-------------------+-------+
|entity |instance |name               |value  |
+-------+---------+-------------------+-------+
|Column |education|ApproxCountDistinct|16.0   |
|Column |income   |CountDistinct      |4.0    |
|Dataset|*        |Size               |48842.0|
|Column |workclass|Completeness       |1.0    |
|Column |income   |Completeness       |1.0    |
+-------+---------+-------------------+-------+



In [23]:
# Cell 9: Custom Analysis - Distribution of Age Groups
print("=== Age Distribution Analysis ===")
adult_data.groupBy("age").count().orderBy("age").show(20)

# Cell 10: Custom Analysis - Income Distribution by Education
print("=== Income Distribution by Education ===")
adult_data.groupBy("education", "income").count().orderBy("education", "income").show(20)


=== Age Distribution Analysis ===
+---+-----+
|age|count|
+---+-----+
| 17|  595|
| 18|  862|
| 19| 1053|
| 20| 1113|
| 21| 1096|
| 22| 1178|
| 23| 1329|
| 24| 1206|
| 25| 1195|
| 26| 1153|
| 27| 1232|
| 28| 1280|
| 29| 1223|
| 30| 1278|
| 31| 1325|
| 32| 1253|
| 33| 1335|
| 34| 1303|
| 35| 1337|
| 36| 1348|
+---+-----+
only showing top 20 rows

=== Income Distribution by Education ===
+---------+-------+-----+
|education| income|count|
+---------+-------+-----+
|     10th|  <=50K|  871|
|     10th| <=50K.|  431|
|     10th|   >50K|   62|
|     10th|  >50K.|   25|
|     11th|  <=50K| 1115|
|     11th| <=50K.|  605|
|     11th|   >50K|   60|
|     11th|  >50K.|   32|
|     12th|  <=50K|  400|
|     12th| <=50K.|  209|
|     12th|   >50K|   33|
|     12th|  >50K.|   15|
|  1st-4th|  <=50K|  162|
|  1st-4th| <=50K.|   77|
|  1st-4th|   >50K|    6|
|  1st-4th|  >50K.|    2|
|  5th-6th|  <=50K|  317|
|  5th-6th| <=50K.|  165|
|  5th-6th|   >50K|   16|
|  5th-6th|  >50K.|   11|
+---------+--