<a href="https://colab.research.google.com/github/maheshboj/agenticai_basics/blob/Langchain_components/ETL_Pipeline_using_GenAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install -qU langchain langchain-openai langchain_community

# DDL Generation

In [None]:
#from langchain.chains.llm import LLMChain
#from langchain.chains.combine_documents import create_stuff_documents_chain

In [64]:
import os
import pandas as pd
import getpass
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI
from IPython.display import Markdown
os.environ["LANGCHAIN_TRACING_V2"] = "false"

# Read Files

In [98]:
# Read the CSV files
dim_customer = pd.read_csv('DIM_Customer.csv')
dim_product = pd.read_csv('DIM_Product.csv')
fact_internet_sales = pd.read_csv('FACT_InternetSales.csv')

# Display basic information about each dataframe
print("DIM_Customer Shape:", dim_customer.shape)
print("\nDIM_Product Shape:", dim_product.shape)
print("\nFACT_InternetSales Shape:", fact_internet_sales.shape)

# Display first few rows of each dataframe
print("\nDIM_Customer Preview:")
display(dim_customer.head())

print("\nDIM_Product Preview:")
display(dim_product.head())

print("\nFACT_InternetSales Preview:")
display(fact_internet_sales.head())

DIM_Customer Shape: (18484, 7)

DIM_Product Shape: (606, 11)

FACT_InternetSales Shape: (58168, 7)

DIM_Customer Preview:


Unnamed: 0,CustomerKey,First Name,Last Name,Full Name,Gender,DateFirstPurchase,Customer City
0,11000.0,Jon,Yang,Jon Yang,Male,2018-01-19,Rockhampton
1,11001.0,Eugene,Huang,Eugene Huang,Male,2018-01-15,Seaford
2,11002.0,Ruben,Torres,Ruben Torres,Male,18-01-07,
3,11003.0,Christy,Zhu,Christy Zhu,Female,2017-12-29,North Ryde
4,11004.0,Elizabeth,Johnson,Elizabeth Johnson,Female,2018-01-23,Wollongong



DIM_Product Preview:


Unnamed: 0,ProductKey,ProductItemCode,Product Name,Sub Category,Product Category,Product Color,Product Size,Product Line,Product Model Name,Product Description,Product Status
0,1.0,AR-5381,Adjustable Race,,,,,,,,Current
1,2.0,BA-8327,Bearing Ball,,,,,,,,Current
2,3.0,BE-2349,BB Ball Bearing,,,,,,,,Current
3,4.0,BE-2908,Headset Ball Bearings,,,,,,,,Current
4,5.0,BL-2036,Blade,,,,,,,,Current



FACT_InternetSales Preview:


Unnamed: 0,ProductKey,OrderDateKey,DueDateKey,ShipDateKey,CustomerKey,SalesOrderNumber,SalesAmount
0,381,20190101,20190113,20190108,16942,SO46700,1000.4375
1,375,20190101,20190113,20190108,15114,SO46701,2181.5625
2,369,20190101,20190113,20190108,15116,SO46702,2443.35
3,337,20190101,20190113,20190108,20576,SO46703,782.99
4,370,20190101,20190113,20190108,13059,SO46704,2443.35


In [99]:
from google.colab import userdata
os.environ['OPENAI_API_KEY']=userdata.get('OPENAI_APIKEY')

In [100]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

In [101]:
from langchain_core.prompts import ChatPromptTemplate

snowflake_ddl_prompt = ChatPromptTemplate.from_template(
"""
You are a data engineering assistant that generates Snowflake DDL statements from CSV schemas.

Given the CSV file named {file_name} and the following column metadata:
{columns}

Generate a Snowflake-compatible CREATE TABLE DDL that:
- Uses uppercase table and column names.
- Infers appropriate Snowflake data types based on column content.
- Includes reasonable VARCHAR length limits.
- Uses DATE for ISO date-like fields.
- Outputs only valid SQL ‚Äî no explanations or comments.

Example output format:
CREATE OR REPLACE TABLE TABLE_NAME (
    COLUMN1 NUMBER(38,0),
    COLUMN2 VARCHAR(200),
    COLUMN3 DATE
);

Now generate the DDL for the provided input.
"""
)


In [102]:
file_name="DIM_CUSTOMER.csv"

In [103]:
columns_description = """
CustomerKey: integer values (e.g., 11000)
First Name: text (e.g., Jon)
Last Name: text (e.g., Yang)
Full Name: text (e.g., Jon Yang)
Gender: text (Male/Female)
DateFirstPurchase: date (e.g., 2018-01-19)
Customer City: text (e.g., Rockhampton)
"""

In [104]:
chain = snowflake_ddl_prompt | llm | StrOutputParser()


In [105]:
result = chain.invoke({
    "file_name": file_name,
    "columns": columns_description
})

In [106]:
print(result)

CREATE OR REPLACE TABLE DIM_CUSTOMER (
    CUSTOMERKEY INTEGER,
    FIRST_NAME VARCHAR(50),
    LAST_NAME VARCHAR(50),
    FULL_NAME VARCHAR(100),
    GENDER VARCHAR(10),
    DATEFIRSTPURCHASE DATE,
    CUSTOMER_CITY VARCHAR(100)
);


# Automating the Schema Generation

In [107]:
def infer_schema_from_csv(file_path: str, sample_size: int = 5) -> str:
    df = pd.read_csv(file_path)
    schema_desc = []
    for col in df.columns:
        dtype = df[col].dtype
        sample_value = df[col].dropna().iloc[0] if df[col].notna().any() else "NULL"
        # Simplify dtype mapping
        if pd.api.types.is_integer_dtype(df[col]):
            inferred_type = "integer"
        elif pd.api.types.is_float_dtype(df[col]):
            inferred_type = "float"
        elif pd.api.types.is_datetime64_any_dtype(df[col]):
            inferred_type = "date/time"
        else:
            inferred_type = "text"
        schema_desc.append(f"{col}: {inferred_type} values (e.g., {sample_value})")
    return "\n".join(schema_desc)


In [108]:
snowflake_ddl_prompt = ChatPromptTemplate.from_template(
"""
You are a data engineering assistant that generates Snowflake DDL statements from CSV schemas.

Given the CSV file named {file_name} and the following column metadata:
{columns}

Generate a Snowflake-compatible CREATE TABLE DDL that:
- Uses uppercase table and column names.
- Infers appropriate Snowflake data types based on column content.
- Uses DATE for ISO-formatted date fields.
- Uses VARCHAR with reasonable length limits for text fields.
- Outputs only valid SQL (no markdown formatting, no explanations).

Output:
CREATE OR REPLACE TABLE {{<TABLE_NAME>}} (
    ...
);
"""
)


In [109]:
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
parser = StrOutputParser()
chain = snowflake_ddl_prompt | llm | parser

In [110]:
file_path = "DIM_Product.csv"
columns_description = infer_schema_from_csv(file_path)

In [111]:
result = chain.invoke({
    "file_name": file_path,
    "columns": columns_description
})

print("\nüßä Generated Snowflake DDL:\n")
print(result)


üßä Generated Snowflake DDL:

CREATE OR REPLACE TABLE DIM_PRODUCT (
    PRODUCTKEY FLOAT,
    PRODUCTITEMCODE VARCHAR(20),
    PRODUCTNAME VARCHAR(100),
    SUBCATEGORY VARCHAR(50),
    PRODUCTCATEGORY VARCHAR(50),
    PRODUCTCOLOR VARCHAR(20),
    PRODUCTSIZE VARCHAR(10),
    PRODUCTLINE VARCHAR(10),
    PRODUCTMODELNAME VARCHAR(100),
    PRODUCTDESCRIPTION VARCHAR(500),
    PRODUCTSTATUS VARCHAR(20)
);


# Data Profiler


# Basic data profiling


In [115]:
def profile_csv(file_path: str) -> str:
    df = pd.read_csv(file_path)
    profile = []

    # Summarize columns
    for col in df.columns:
        dtype = df[col].dtype
        nulls = df[col].isnull().sum()
        unique = df[col].nunique()
        sample_values = df[col].dropna().astype(str).head(3).tolist()

        # Simple heuristic for dates or numeric errors
        possible_issue = ""
        if df[col].isnull().mean() > 0.2:
            possible_issue += "‚ö†Ô∏è High null percentage. "
        if pd.api.types.is_numeric_dtype(df[col]) and (df[col] < 0).any():
            possible_issue += "‚ö†Ô∏è Negative numeric values found. "
        if "date" in col.lower() and not pd.to_datetime(df[col], errors='coerce').notna().all():
            possible_issue += "‚ö†Ô∏è Invalid date formats detected. "

        profile.append(f"""
Column: {col}
  - Dtype: {dtype}
  - Nulls: {nulls}
  - Unique: {unique}
  - Sample: {sample_values}
  - Notes: {possible_issue if possible_issue else "No obvious issues"}
""")
    return "\n".join(profile)

In [116]:
data_validation_prompt = ChatPromptTemplate.from_template(
"""
You are a senior data engineer tasked with validating a CSV file before it is loaded into Snowflake.

File: {file_name}

Here is a data profile summary of the columns:
{profile}

Analyze this data profile and produce a validation report that includes:
1. A summary of potential data quality issues (missing values, invalid data, type mismatches, duplicates, etc.)
2. Recommended solutions or preprocessing steps before loading into Snowflake.
3. Flag any column that might require schema adjustment (e.g., change VARCHAR length, cast to DATE, handle NULLs).

Output the result as a clear, structured text report:
Validation Summary:
- ...

Recommended Fixes:
- ...
"""
)

chain = data_validation_prompt | llm | parser # Create a new chain using data_validation_prompt

file_path = "/content/DIM_Customer.csv"
profile_summary = profile_csv(file_path)

validation_report = chain.invoke({
    "file_name": file_path,
    "profile": profile_summary
})


print("\nüßæ Data Validation Report:\n")
display(Markdown(validation_report))



üßæ Data Validation Report:



### Validation Report for DIM_Customer.csv

#### Validation Summary:
1. **CustomerKey**
   - **Nulls**: 1 missing value detected.
   - **Unique**: 18483 unique values, which is acceptable.
   - **Data Type**: float64, but represented as strings in the sample.
   - **Issue**: Presence of a null value.

2. **First Name**
   - **Nulls**: 0 missing values.
   - **Unique**: 671 unique values.
   - **Data Type**: object.
   - **Issue**: None detected.

3. **Last Name**
   - **Nulls**: 0 missing values.
   - **Unique**: 375 unique values.
   - **Data Type**: object.
   - **Issue**: None detected.

4. **Full Name**
   - **Nulls**: 0 missing values.
   - **Unique**: 18400 unique values.
   - **Data Type**: object.
   - **Issue**: None detected.

5. **Gender**
   - **Nulls**: 0 missing values.
   - **Unique**: 2 unique values (likely 'Male' and 'Female').
   - **Data Type**: object.
   - **Issue**: None detected.

6. **DateFirstPurchase**
   - **Nulls**: 0 missing values.
   - **Unique**: 1124 unique values.
   - **Data Type**: object.
   - **Issue**: Invalid date formats detected (e.g., '18-01-07' is not a standard format).

7. **Customer City**
   - **Nulls**: 1 missing value detected.
   - **Unique**: 269 unique values.
   - **Data Type**: object.
   - **Issue**: Presence of a null value.

#### Recommended Fixes:
1. **CustomerKey**
   - **Action**: Impute or remove the row with the missing value. Consider converting the column to an integer type if appropriate, as customer keys are typically whole numbers.

2. **First Name, Last Name, Full Name, Gender**
   - **Action**: No action required. Data is clean.

3. **DateFirstPurchase**
   - **Action**: Standardize the date format. Convert all entries to a consistent format (e.g., 'YYYY-MM-DD'). Rows with invalid formats should be corrected or removed.

4. **Customer City**
   - **Action**: Impute or remove the row with the missing value. Consider using a placeholder value (e.g., 'Unknown') if appropriate.

#### Schema Adjustments:
- **CustomerKey**: Consider changing the data type to INTEGER if it is confirmed that all values are whole numbers.
- **DateFirstPurchase**: Change the data type to DATE after standardizing the format.
- **Customer City**: Ensure that the VARCHAR length is sufficient to accommodate the longest city name in the dataset.

### Conclusion
Before loading the DIM_Customer.csv file into Snowflake, it is crucial to address the identified data quality issues and implement the recommended preprocessing steps to ensure data integrity and consistency.

# End-to-End Automation
## let‚Äôs now take this all the way to automation.

We‚Äôll build a LangChain-powered, end-to-end data quality pipeline that:

Reads the CSV

Profiles the data

Uses an LLM to identify issues and propose fixes

Parses those suggestions

Applies the suggested cleaning steps automatically in pandas

Returns a cleaned DataFrame, validation report, and ready-to-load Snowflake DDL

In [117]:
def profile_csv(file_path: str) -> pd.DataFrame:
    df = pd.read_csv(file_path)
    return df

def generate_profile_summary(df: pd.DataFrame) -> str:
    profile = []
    for col in df.columns:
        dtype = df[col].dtype
        nulls = df[col].isnull().sum()
        unique = df[col].nunique()
        sample_values = df[col].dropna().astype(str).head(10).tolist()

        issue_hint = ""
        if df[col].isnull().mean() > 0.2:
            issue_hint += "High null ratio. "
        if "date" in col.lower():
            try:
                pd.to_datetime(df[col], errors="raise")
            except Exception:
                issue_hint += "Possible invalid date formats. "
        if pd.api.types.is_numeric_dtype(df[col]) and (df[col] < 0).any():
            issue_hint += "Negative values detected. "

        profile.append(f"""
Column: {col}
  - Dtype: {dtype}
  - Nulls: {nulls}
  - Unique: {unique}
  - Sample: {sample_values}
  - Notes: {issue_hint if issue_hint else "No obvious issues"}
""")
    return "\n".join(profile)


validation_prompt = ChatPromptTemplate.from_template(
"""
You are an expert data engineer responsible for preparing CSV data for loading into Snowflake.

File: {file_name}

Here is a data profile summary:
{profile}

Analyze the profile and output a JSON object with two keys:
1. "issues": list of identified data issues (as human-readable descriptions)
2. "fix_instructions": list of cleaning actions in plain English, each one referencing the column name and a specific fix.

Examples of fixes:
- "Trim whitespace from all string columns"
- "Replace nulls in 'Customer City' with 'UNKNOWN'"
- "Convert 'DateFirstPurchase' to YYYY-MM-DD date format"

Output **only JSON**, no explanations.
"""
)

chain = validation_prompt | llm | parser

# -------------------------------
# 4Ô∏è‚É£ Run validation
# -------------------------------

file_path = "DIM_Customer.csv"
df = profile_csv(file_path)
profile_summary = generate_profile_summary(df)


In [118]:
print(profile_summary)


Column: CustomerKey
  - Dtype: float64
  - Nulls: 1
  - Unique: 18483
  - Sample: ['11000.0', '11001.0', '11002.0', '11003.0', '11004.0', '11005.0', '11006.0', '11007.0', '11008.0', '11010.0']
  - Notes: No obvious issues


Column: First Name
  - Dtype: object
  - Nulls: 0
  - Unique: 671
  - Sample: ['Jon', 'Eugene', 'Ruben', 'Christy', '     Elizabeth     ', 'Julio', 'Janet', 'Marco', 'Rob', 'Shannon']
  - Notes: No obvious issues


Column: Last Name
  - Dtype: object
  - Nulls: 0
  - Unique: 375
  - Sample: ['Yang', 'Huang', 'Torres', 'Zhu', 'Johnson', 'Ruiz', 'Alvarez', 'Mehta', 'Verhoff', 'Carlson']
  - Notes: No obvious issues


Column: Full Name
  - Dtype: object
  - Nulls: 0
  - Unique: 18400
  - Sample: ['Jon Yang', 'Eugene Huang', 'Ruben Torres', 'Christy Zhu', 'Elizabeth Johnson', 'Julio Ruiz', 'Janet Alvarez', 'Marco Mehta', 'Rob Verhoff', 'Shannon Carlson']
  - Notes: No obvious issues


Column: Gender
  - Dtype: object
  - Nulls: 0
  - Unique: 2
  - Sample: ['Male', 'Mal

In [119]:

result_text = chain.invoke({
    "file_name": file_path,
    "profile": profile_summary
})


In [120]:
print(result_text)

```json
{
  "issues": [
    "The 'CustomerKey' column has 1 null value.",
    "The 'DateFirstPurchase' column contains possible invalid date formats.",
    "The 'Customer City' column has 1 null value.",
    "The 'First Name' column contains leading and trailing whitespace in some entries."
  ],
  "fix_instructions": [
    "Replace nulls in 'CustomerKey' with a default value or remove the row.",
    "Convert 'DateFirstPurchase' to a consistent YYYY-MM-DD date format.",
    "Replace nulls in 'Customer City' with 'UNKNOWN'.",
    "Trim whitespace from all entries in the 'First Name' column."
  ]
}
```


In [121]:
import re
import json

# --- Replace the parsing section with this block ---

raw_text = result_text.strip()

# Remove markdown code fences if present
clean_text = re.sub(r"^```(?:json)?|```$", "", raw_text, flags=re.MULTILINE).strip()

try:
    validation_result = json.loads(clean_text)
except json.JSONDecodeError as e:
    print("‚ö†Ô∏è Model output not valid JSON even after cleanup. Showing raw output:\n")
    print(raw_text)
    validation_result = {"issues": [], "fix_instructions": []}

print("\nüßæ Data Validation Report:\n")
for issue in validation_result.get("issues", []):
    print(f"- {issue}")


üßæ Data Validation Report:

- The 'CustomerKey' column has 1 null value.
- The 'DateFirstPurchase' column contains possible invalid date formats.
- The 'Customer City' column has 1 null value.
- The 'First Name' column contains leading and trailing whitespace in some entries.


In [122]:
validation_result

{'issues': ["The 'CustomerKey' column has 1 null value.",
  "The 'DateFirstPurchase' column contains possible invalid date formats.",
  "The 'Customer City' column has 1 null value.",
  "The 'First Name' column contains leading and trailing whitespace in some entries."],
 'fix_instructions': ["Replace nulls in 'CustomerKey' with a default value or remove the row.",
  "Convert 'DateFirstPurchase' to a consistent YYYY-MM-DD date format.",
  "Replace nulls in 'Customer City' with 'UNKNOWN'.",
  "Trim whitespace from all entries in the 'First Name' column."]}

In [123]:

# -------------------------------
# 6Ô∏è‚É£ Auto-apply fixes (simplified)
# -------------------------------


In [124]:
def apply_fixes(df: pd.DataFrame, fix_instructions: list) -> pd.DataFrame:
    for fix in fix_instructions:
        fix_lower = fix.lower()
        if "trim whitespace" in fix_lower:
            df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
        if "replace nulls" in fix_lower and "unknown" in fix_lower:
            col = fix.split("'")[1] if "'" in fix else None
            if col in df.columns:
                df[col] = df[col].fillna("UNKNOWN")
        if "convert" in fix_lower and "date" in fix_lower:
            for col in df.columns:
                if "date" in col.lower():
                    df[col] = pd.to_datetime(df[col], errors="coerce").dt.strftime("%Y-%m-%d")
    return df

clean_df = apply_fixes(df, validation_result.get("fix_instructions", []))

print("\n‚úÖ Data cleaned successfully based on suggested fixes.\n")

# -------------------------------
# 7Ô∏è‚É£ Optional: Generate DDL from cleaned data
# -------------------------------

def infer_snowflake_types(df: pd.DataFrame) -> str:
    ddl_lines = []
    for col in df.columns:
        if pd.api.types.is_integer_dtype(df[col]):
            dtype = "NUMBER(38,0)"
        elif pd.api.types.is_float_dtype(df[col]):
            dtype = "FLOAT"
        elif "date" in col.lower():
            dtype = "DATE"
        else:
            max_len = int(df[col].astype(str).map(len).max())
            dtype = f"VARCHAR({min(max_len + 20, 500)})"
        ddl_lines.append(f"    {col.upper().replace(' ', '_')} {dtype}")
    return "CREATE OR REPLACE TABLE DIM_CUSTOMER (\n" + ",\n".join(ddl_lines) + "\n);"

ddl_sql = infer_snowflake_types(clean_df)

print("üßä Generated Snowflake DDL:\n")
print(ddl_sql)



‚úÖ Data cleaned successfully based on suggested fixes.

üßä Generated Snowflake DDL:

CREATE OR REPLACE TABLE DIM_CUSTOMER (
    CUSTOMERKEY FLOAT,
    FIRST_NAME VARCHAR(31),
    LAST_NAME VARCHAR(36),
    FULL_NAME VARCHAR(46),
    GENDER VARCHAR(26),
    DATEFIRSTPURCHASE DATE,
    CUSTOMER_CITY VARCHAR(41)
);


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


# Compliance Report Pipeline

In [125]:
def generate_compliance_profile(df: pd.DataFrame) -> str:
    profile = []
    for col in df.columns:
        dtype = df[col].dtype
        null_pct = round(df[col].isnull().mean() * 100, 2)
        unique_pct = round(df[col].nunique() / len(df) * 100, 2)
        sample_values = df[col].dropna().astype(str).head(3).tolist()

        # Detect potential PII patterns (emails, phone numbers)
        pii_flag = ""
        sample_text = " ".join(sample_values).lower()
        if re.search(r"\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}\b", sample_text):
            pii_flag = "Contains email-like values."
        elif re.search(r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b", sample_text):
            pii_flag = "Contains phone number-like values."
        elif any(x in col.lower() for x in ["name", "email", "phone"]):
            pii_flag = "Possible PII column."

        profile.append(f"""
Column: {col}
  - Type: {dtype}
  - Null %: {null_pct}
  - Unique %: {unique_pct}
  - Sample: {sample_values}
  - PII Risk: {pii_flag or "None detected"}
""")
    return "\n".join(profile)

# -------------------------------
# 2Ô∏è‚É£ Compliance prompt
# -------------------------------

compliance_prompt = ChatPromptTemplate.from_template(
"""
You are a data governance officer performing a compliance review of a CSV dataset before it is loaded into Snowflake.

Dataset: {file_name}

Below is the dataset's profiling summary:
{profile}

Assess the dataset across the following dimensions:
1. **Completeness** ‚Äì any missing data concerns?
2. **Consistency** ‚Äì are data types and formats consistent?
3. **Validity** ‚Äì do columns have reasonable values?
4. **Uniqueness** ‚Äì is the key column unique?
5. **Privacy / PII** ‚Äì does the dataset expose sensitive information?

Output a structured compliance report in this format:

Compliance Report:
- Overall Compliance Score (0‚Äì100):
- Summary:
- Key Findings:
  - Completeness:
  - Consistency:
  - Validity:
  - Uniqueness:
  - Privacy/PII:
- Recommended Actions:
"""
)

chain = compliance_prompt | llm | parser


file_path = "/content/DIM_Customer.csv"
df = pd.read_csv(file_path)
profile_text = generate_compliance_profile(df)

report = chain.invoke({
    "file_name": file_path,
    "profile": profile_text
})

print("\nüìã Data Compliance Report:\n")
display(Markdown(report))



üìã Data Compliance Report:



**Compliance Report:**

- **Overall Compliance Score (0‚Äì100):** 85

- **Summary:** The dataset is largely complete and consistent, with a unique key column. However, there are potential privacy concerns due to the presence of personally identifiable information (PII) in several columns.

- **Key Findings:**
  - **Completeness:** 
    - The dataset shows a very low percentage of null values across all columns, with the highest being 0.01% in the CustomerKey and Customer City columns. This indicates that the dataset is mostly complete with minimal missing data concerns.
  
  - **Consistency:** 
    - The data types are mostly consistent, with the CustomerKey as float64 and other columns as object types. However, the DateFirstPurchase column has a sample value that appears to be incorrectly formatted ('18-01-07' instead of '2018-01-07'), indicating a potential inconsistency in date formats.

  - **Validity:** 
    - The values in the dataset appear reasonable, but the DateFirstPurchase column needs to be validated for correct date formats. The presence of a non-standard date format raises concerns about the validity of the data.

  - **Uniqueness:** 
    - The CustomerKey column has a uniqueness percentage of 99.99%, indicating that it is effectively unique and serves its purpose as a primary key.

  - **Privacy/PII:** 
    - The First Name, Last Name, and Full Name columns are flagged as possible PII, which raises concerns about the exposure of sensitive information. While the dataset does not contain explicit PII risks, the presence of names could lead to identification of individuals when combined with other data.

- **Recommended Actions:**
  1. **Data Cleaning:** Standardize the date format in the DateFirstPurchase column to ensure consistency and validity.
  2. **PII Mitigation:** Consider anonymizing or pseudonymizing the First Name, Last Name, and Full Name columns to reduce the risk of exposing PII.
  3. **Review Data Usage Policies:** Ensure that data handling and storage policies are in place to protect any PII present in the dataset.
  4. **Further Validation:** Conduct a deeper validation of the dataset to ensure that all values conform to expected formats and ranges, particularly for the DateFirstPurchase column.

In [126]:
from typing import List, Optional
from pydantic import BaseModel, Field

class KeyFindings(BaseModel):
    completeness: Optional[str] = Field(None, description="Completeness findings")
    consistency: Optional[str] = Field(None, description="Consistency findings")
    validity: Optional[str] = Field(None, description="Validity findings")
    uniqueness: Optional[str] = Field(None, description="Uniqueness findings")
    privacy_pii: Optional[str] = Field(None, description="Privacy or PII findings")


class ComplianceReport(BaseModel):
    overall_compliance_score: Optional[int] = Field(
        None, ge=0, le=100, description="Overall score from 0‚Äì100"
    )
    summary: Optional[str] = Field(None, description="High-level summary of compliance")
    key_findings: KeyFindings = Field(default_factory=KeyFindings)
    recommended_actions: Optional[str] = Field(
        None, description="Actionable recommendations"
    )


In [127]:
llm_with_parser = llm.with_structured_output(ComplianceReport)
#

In [128]:
compliance_report_chain = compliance_prompt | llm_with_parser
#

In [129]:
final_result=compliance_report_chain.invoke({
    "file_name": file_path,
    "profile": profile_text
})

In [130]:
final_result

ComplianceReport(overall_compliance_score=85, summary='The dataset is largely complete and unique, but there are concerns regarding the potential exposure of PII in several columns.', key_findings=KeyFindings(completeness='The dataset has minimal missing data, with only 0.01% null values in CustomerKey and Customer City.', consistency='Data types are consistent across columns, but the DateFirstPurchase column has an inconsistent date format (YYYY-MM-DD vs. YY-MM-DD).', validity='Most columns contain reasonable values, but the DateFirstPurchase column needs standardization for date formats.', uniqueness='The CustomerKey column is unique with a uniqueness percentage of 99.99%.', privacy_pii='Columns such as First Name, Last Name, and Full Name are flagged as possible PII, which requires careful handling.'), recommended_actions='1. Standardize the date format in the DateFirstPurchase column to ensure consistency. 2. Review and implement data masking or encryption for PII columns (First Na

# Faker-Based Synthetic Data Generator

In [131]:
# Install faker library
%pip install -qU faker

In [132]:
import pandas as pd
import random
from faker import Faker

# Initialize faker
fake = Faker()
Faker.seed(42)
random.seed(42)

# -------------------------------
# 1Ô∏è‚É£ Define your target schema
# -------------------------------

schema = {
    "CustomerKey": "int",
    "First Name": "str",
    "Last Name": "str",
    "Full Name": "str",
    "Gender": "str",
    "DateFirstPurchase": "date",
    "Customer City": "str"
}

# -------------------------------
# 2Ô∏è‚É£ Function to generate one row
# -------------------------------

def generate_customer_record(i):
    gender = random.choice(["Male", "Female"])
    first_name = fake.first_name_male() if gender == "Male" else fake.first_name_female()
    last_name = fake.last_name()
    full_name = f"{first_name} {last_name}"

    # Introduce some data quality issues intentionally
    date = fake.date_between(start_date="-5y", end_date="today")
    if random.random() < 0.05:  # 5% invalid date
        date = "202-13-99"
    city = fake.city() if random.random() > 0.05 else None  # 5% nulls

    # Random inconsistent casing
    if random.random() < 0.05:
        gender = gender.lower()

    return {
        "CustomerKey": 11000 + i,
        "First Name": first_name.strip(),
        "Last Name": last_name.strip(),
        "Full Name": full_name.strip(),
        "Gender": gender,
        "DateFirstPurchase": date,
        "Customer City": city
    }

# -------------------------------
# 3Ô∏è‚É£ Generate synthetic dataset
# -------------------------------

n_rows = 500
synthetic_data = [generate_customer_record(i) for i in range(n_rows)]
df_synthetic = pd.DataFrame(synthetic_data)

# -------------------------------
# 4Ô∏è‚É£ Save synthetic dataset
# -------------------------------

output_file = "DIM_CUSTOMER_SYNTHETIC.csv"
df_synthetic.to_csv(output_file, index=False)

print(f"‚úÖ Generated {n_rows} synthetic records in '{output_file}'")
print("\nüß© Preview:\n")
print(df_synthetic.head(10))


‚úÖ Generated 500 synthetic records in 'DIM_CUSTOMER_SYNTHETIC.csv'

üß© Preview:

   CustomerKey First Name Last Name        Full Name  Gender  \
0        11000       Mark   Johnson     Mark Johnson    Male   
1        11001    Michael   Mcclain  Michael Mcclain    Male   
2        11002    Colleen    Wagner   Colleen Wagner  Female   
3        11003     Joshua  Robinson  Joshua Robinson    Male   
4        11004       Gina     Moore       Gina Moore  Female   
5        11005      Brent    Abbott     Brent Abbott    Male   
6        11006    Valerie      Gray     Valerie Gray  Female   
7        11007      Angel     Perez      Angel Perez    Male   
8        11008      Jamie    Chavez     Jamie Chavez  Female   
9        11009     Meagan     Miles     Meagan Miles  Female   

  DateFirstPurchase      Customer City  
0         202-13-99          East Jill  
1        2021-05-07        Johnsonland  
2         202-13-99         Lake Debra  
3        2023-11-11  Port Lindachester  
4     