In [1]:
import pandas as pd

In [10]:
# list of fail files:
fail_files = ["data/reverse_engineered_ST02_MLGCS.json", "data/reverse_engineered_Commercial_NRT.json"]

# list of pass files:
# finish  "data/pass_reasoning_Commercial_NRT.json" later
pass_files = ["data/pass_reasoning_ST02_MLGCS.json"]

In [20]:
# read in all fail files, and concatenate them into a single dataframe

fail_df = pd.DataFrame()
for file in fail_files:
    df = pd.read_json(file)
    df['fail'] = True
    fail_df = pd.concat([fail_df, df])

# read in all pass files, and concatenate them into a single dataframe

pass_df = pd.DataFrame()
for file in pass_files:
    df = pd.read_json(file)
    df['fail'] = False
    pass_df = pd.concat([pass_df, df])

In [21]:
#rename reverse_engineered_description column to description
fail_df = fail_df.rename(columns={"reverse_engineered_description": "description"})
# drop final_description column
fail_df = fail_df.drop(columns=['final_description'])
fail_df.head()

Unnamed: 0,description,comment,fail
0,This table contains the details for case resol...,Add the source and source table names and grai...,True
1,This table contains Customer data. It is a SCD...,Add the source and source table names and grai...,True
2,This attribute is derived using decode logic o...,It is not a decode logic. change accordingly.\...,True
3,This is a reference key to DimDate table to ge...,If it is a direct mapping to Commerical attrib...,True
4,This is a reference key to DimTime table to ge...,If it is a direct mapping to Commerical attrib...,True


In [22]:
# rename final_description column to description
pass_df = pass_df.rename(columns={"final_description": "description"})
pass_df.head()

Unnamed: 0,description,comment,fail
0,This table contains details for issue and reso...,The description contains source information by...,False
1,Surrogate key generated on SupportTopicFullP...,The description includes source information by...,False
2,Timestamp in UTC at which record is inserted b...,This description indicates the source as UDP a...,False
3,Timestamp in UTC at which record is updated by...,"The description provides the source, UDP, poin...",False
4,Direct mapping to column SupportTopicFullPath ...,This description provides essential source inf...,False


In [23]:
# concatenate the fail and pass dataframes into a single dataframe
df = pd.concat([fail_df, pass_df])
df.head()

Unnamed: 0,description,comment,fail
0,This table contains the details for case resol...,Add the source and source table names and grai...,True
1,This table contains Customer data. It is a SCD...,Add the source and source table names and grai...,True
2,This attribute is derived using decode logic o...,It is not a decode logic. change accordingly.\...,True
3,This is a reference key to DimDate table to ge...,If it is a direct mapping to Commerical attrib...,True
4,This is a reference key to DimTime table to ge...,If it is a direct mapping to Commerical attrib...,True


In [25]:
# hold out 10% of the data for testing
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.1)

train_df.value_counts("fail")



fail
False    349
True     202
Name: count, dtype: int64

In [29]:
test_df.value_counts("fail")

fail
False    34
True     28
Name: count, dtype: int64

In [30]:
system_msg = """You are a helpful AI assistant that reviews technical descriptions for data entities and attributes.
Your task is to determine whether the description meets quality standards. If it does, respond with **Pass**. If not, respond with **Fail**.
- A **Pass** means the description is clear, complete, and accurately describes the attribute and its source.
- A **Fail** means the description lacks clarity, completeness, or essential details.
Please provide a short explanation for your decision.
Your output format should be:
Reason: <your explanation>  
Decision: Pass or Fail"""

# output to jsonl files with format:
# {text: “<|system|>{system_msg}<|end|><|user|>{user_msg}<|end|><|assistant|>{assistant_msg}”}

# train data
with open("data/train_ALL.jsonl", "w") as f:
    for i, row in train_df.iterrows():
        user_msg = f"Description: {row['description']}"
        assistant_msg = f"Reason: {row['comment']}\nDecision: {'Pass' if row['fail'] == False else 'Fail'}"
        f.write(f'{{"text": "<|system|>{system_msg}<|end|><|user|>{user_msg}<|end|><|assistant|>{assistant_msg}"}}\n')

# test data
with open("data/test_ALL.jsonl", "w") as f:
    for i, row in test_df.iterrows():
        user_msg = f"Description: {row['description']}"
        assistant_msg = f"Reason: {row['comment']}\nDecision: {'Pass' if row['fail'] == False else 'Fail'}"
        f.write(f'{{"text": "<|system|>{system_msg}<|end|><|user|>{user_msg}<|end|><|assistant|>{assistant_msg}"}}\n')