In [1]:
import json
import os
from urllib.parse import quote_plus

import great_expectations as ge
import pandas as pd
from great_expectations.core.expectation_configuration import ExpectationConfiguration
from sqlalchemy import create_engine

In [2]:
# Load the PostgreSQL connection settings from a local JSON file.
file_path = '../Credentials/keys.json'
if not os.path.exists(file_path):
    # Stop here: carrying on without the settings would only fail a few lines
    # later with a confusing NameError when the connection URL is built.
    raise FileNotFoundError(f"File '{file_path}' not found.")

with open(file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

user = data["user"]
password = data["password"]
port = data["port"]
server = data["server"]
db = data["db"]

# Percent-encode the credentials: characters such as '@', ':' or '/' in a raw
# password would otherwise be parsed as URL delimiters.
# NOTE(review): assumes keys.json stores the raw (not already URL-encoded)
# password -- confirm.
db_connection = (
    f"postgresql://{quote_plus(str(user))}:{quote_plus(str(password))}"
    f"@{server}:{port}/{db}"
)
engine = create_engine(db_connection)

# create_engine() is lazy and does not contact the server, so open (and
# immediately release) a connection before claiming success.
with engine.connect():
    pass
print("connected!")

connected!


In [3]:
# Read the whole `jobs_api` table into a DataFrame.
# The engine created in the previous cell is reused, and the connection is
# opened in a `with` block so it is released as soon as the read finishes
# (previously it was left open for the lifetime of the kernel).
table_name = 'jobs_api'
with engine.connect() as connection:
    df = pd.read_sql_table(table_name, connection)

# `API` is the name the rest of the notebook uses; it refers to the same
# DataFrame object as `df` (no copy is made).
API = df

In [4]:
# Create a Great Expectations dataset from the DataFrame so that it can be
# validated against an expectation suite.
API_gx=ge.from_pandas(API)

In [5]:
# Preview the first five rows of the loaded table.
API.head(5)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [6]:
# Summarize the columns: dtype, non-null count and memory usage.
API.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64 
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64 
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   int64 
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(4), object(7)
memory usage: 322.8+ KB


In [7]:
# List the column labels of the loaded table.
API.columns

Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')

In [10]:
# Define the expectations.

# Columns that must not contain null values.
# NOTE(review): `salary` and `salary_currency` appear in neither list, so they
# are not null-checked -- confirm whether that omission is intentional.
numerical_columns = ['work_year', 'salary_in_usd', 'remote_ratio']
text_columns = ['experience_level', 'employment_type', 'job_title', 'employee_residence', 'company_location', 'company_size']

# Columns whose values must be strictly greater than 0.
salary_columns = ['salary', 'salary_in_usd']


def _not_null_expectation(column):
    """Return an expectation that `column` contains no null values."""
    return ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={"column": column},
    )


def _positive_expectation(column):
    """Return an expectation that every value in `column` is greater than 0.

    Great Expectations has no `expect_column_values_to_be_greater_than`
    expectation, so a configuration using that name cannot be evaluated
    against the data. The supported equivalent is
    `expect_column_values_to_be_between` with an exclusive lower bound.
    """
    return ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={"column": column, "min_value": 0, "strict_min": True},
    )


# Same order as before (numeric not-null, salary > 0, text not-null) so the
# numbering in the report is stable.
expectations = (
    [_not_null_expectation(column) for column in numerical_columns]
    + [_positive_expectation(column) for column in salary_columns]
    + [_not_null_expectation(column) for column in text_columns]
)

# Create a suite of expectations.
suite_name = "API_suite"
suite = ge.core.ExpectationSuite(
    expectation_suite_name=suite_name,
    expectations=expectations,
)

# Validate the dataset against the suite of expectations.
result = API_gx.validate(expectation_suite=suite)

# Expectation kwargs that are echoed in the report, with their labels.
_REPORTED_KWARGS = (
    ("value", "Value"),
    ("min_value", "Min Value"),
    ("strict_min", "Strict Min"),
    ("max_value", "Max Value"),
    ("strict_max", "Strict Max"),
    ("regex", "Regex"),
    ("strftime_format", "Strftime Format"),
)

# Save the results in a text file.
result_output_path = "validation_results_API.txt"
with open(result_output_path, 'w', encoding='utf-8') as f:
    for idx, res in enumerate(result["results"]):
        expectation_config = res["expectation_config"]
        expectation_kwargs = expectation_config["kwargs"]
        success = res["success"]

        f.write(f"Expectation {idx + 1}: {expectation_config['expectation_type']}\n")
        f.write(f"Column: {expectation_kwargs.get('column', 'N/A')}\n")
        for key, label in _REPORTED_KWARGS:
            if key in expectation_kwargs:
                f.write(f"{label}: {expectation_kwargs[key]}\n")
        f.write(f"Success: {success}\n")

        if not success:
            # An expectation can fail because the data violates it or because
            # evaluating it raised; report the latter explicitly so it is not
            # mistaken for a data problem.
            exception_info = res["exception_info"] or {}
            if exception_info.get("raised_exception"):
                f.write(f"Exception: {exception_info.get('exception_message')}\n")
            f.write("Result details:\n")
            # default=str keeps the report from crashing on values that the
            # json module cannot serialize natively.
            f.write(json.dumps(res["result"], indent=2, default=str))
            f.write("\n")

        f.write("-" * 40)
        f.write("\n")

print("Validation results saved to:", result_output_path)

Validation results saved to: validation_results_API.txt
