In [1]:
import pandas as pd
import json
import os

In [2]:
# Load your dataset
df = pd.read_csv('data/EDA_outputs/kcc_dataset_QA_cleaned_60K.csv')

In [3]:
df.head()

Unnamed: 0,QueryText,KccAns
0,GROUNDNUT VERITIES,GROUNDNUT VERITIES -TAG 24 TG 26
1,ASKED ABOUT SOWING TIME OF GROUND NUT,SOWING TIME OF GROUND NUT-IT GENERALLY CULTIVA...
2,TELL ME CONTROL LAT IN GROUNDNUT,SPRAY OF PROFENOFOS 15 ML PER LITER WATER
3,weed management in groundnut,hand weeding
4,Farmer wants to know about fungal disease mana...,-- -


In [9]:
# Configuration for splitting into smaller batches
batch_size = 6000  # Adjust the batch size based on your limit and data size
batch_number = 1
batch_dir = "batches"

# Create a directory to store the batch files
if not os.path.exists(batch_dir):
    os.makedirs(batch_dir)

for start in range(0, len(df), batch_size):
    end = start + batch_size
    batch_df = df[start:end]
    
    # Create the JSONL batch file
    batch_tasks = []

    for index, row in batch_df.iterrows():
        question = row['QueryText']
        answer = row['KccAns']
        task = {
            "custom_id": f"task-{start + index}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o-mini",
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant that corrects grammar while preserving the original meaning."},
                    {"role": "user", "content": f"Correct the grammar of the following answer based on the question '{question}', maintaining its original meaning. Use lowercase letters. In case the answer is empty or has symbols and numbers, provide a suitable answer to the farmer's question: '{answer}'"}
                ]
            }
        }
        batch_tasks.append(task)

    # Write the tasks to a .jsonl file
    batch_file_name = os.path.join(batch_dir, f"batch_tasks_queries_{batch_number}.jsonl")

    with open(batch_file_name, 'w') as file:
        for task in batch_tasks:
            file.write(json.dumps(task) + '\n')

    print(f"Batch file '{batch_file_name}' created successfully.")
    batch_number += 1


Batch file 'batches/batch_tasks_queries_1.jsonl' created successfully.
Batch file 'batches/batch_tasks_queries_2.jsonl' created successfully.
Batch file 'batches/batch_tasks_queries_3.jsonl' created successfully.
Batch file 'batches/batch_tasks_queries_4.jsonl' created successfully.
Batch file 'batches/batch_tasks_queries_5.jsonl' created successfully.
Batch file 'batches/batch_tasks_queries_6.jsonl' created successfully.
Batch file 'batches/batch_tasks_queries_7.jsonl' created successfully.
Batch file 'batches/batch_tasks_queries_8.jsonl' created successfully.
Batch file 'batches/batch_tasks_queries_9.jsonl' created successfully.
Batch file 'batches/batch_tasks_queries_10.jsonl' created successfully.
Batch file 'batches/batch_tasks_queries_11.jsonl' created successfully.
Batch file 'batches/batch_tasks_queries_12.jsonl' created successfully.
Batch file 'batches/batch_tasks_queries_13.jsonl' created successfully.
Batch file 'batches/batch_tasks_queries_14.jsonl' created successfully.
B

In [3]:
# Create the JSONL batch file
batch_tasks = []

for index, row in df.iterrows():
    question = row['QueryText']
    answer = row['KccAns']
    task = {
        "custom_id": f"task-{index}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "messages": [
                {"role": "system", "content": "You are a helpful assistant that corrects grammar while preserving the original meaning."},
                {"role": "user", "content": f"Correct the grammar of the following answer based on the question '{question}', maintaining its original meaning. Use lowercase letters. In case the answer is empty or has symbols and numbers, provide a suitable answer to the farmer's question: '{answer}'"}
            ]
        }
    }
    batch_tasks.append(task)

# Write the tasks to a .jsonl file
file_name = "batch_tasks_queries.jsonl"

with open(file_name, 'w') as file:
    for task in batch_tasks:
        file.write(json.dumps(task) + '\n')

print(f"Batch file '{file_name}' created successfully.")


Batch file 'batch_tasks_queries.jsonl' created successfully.


In [None]:
{"custom_id": "task-0", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-4o-mini", "messages": [{"role": "system", "content": "You are a helpful assistant that corrects grammar while preserving the original meaning."}, {"role": "user", "content": "Correct the grammar of the following answer based on the question 'GROUNDNUT VERITIES', maintaining its original meaning. Use lowercase letters. In case the answer is empty or has symbols and numbers, provide a suitable answer to the farmer's question: 'GROUNDNUT VERITIES -TAG 24 TG 26'"}]}}

In [11]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

In [34]:
# Directory paths for input and output files
batches_dir = "batches/"
outputs_dir = "batches/outputs/"
data_dir = "data/"
output_file = os.path.join(data_dir, "kcc_dataset_QA_cleaned_60K_grammar_improved.csv")

In [25]:
# Define a function to read both input and output JSONL files using pandas
def read_files(file_names):
    data = []
    for file_name in file_names:
        try:
            # Read each file as a DataFrame and convert it to a list of dictionaries
            df = pd.read_json(file_name, lines=True)
            data.append(df)
            print(f"Successfully loaded {file_name}")
        except ValueError as e:
            print(f"Skipping file {file_name} due to error: {e}")
    # Concatenate all data into a single DataFrame if there is data
    if data:
        return pd.concat(data, ignore_index=True)
    else:
        return pd.DataFrame()

# List of input file names
input_file_names = [
    "batches/batch_tasks_queries_1.jsonl",
    "batches/batch_tasks_queries_2.jsonl",
    "batches/batch_tasks_queries_3.jsonl",
    "batches/batch_tasks_queries_4.jsonl",
    "batches/batch_tasks_queries_5.jsonl",
    "batches/batch_tasks_queries_6.jsonl",
    "batches/batch_tasks_queries_7.jsonl",
    "batches/batch_tasks_queries_8.jsonl",
    "batches/batch_tasks_queries_9.jsonl",
    "batches/batch_tasks_queries_10.jsonl",
    "batches/batch_tasks_queries_11.jsonl",
    "batches/batch_tasks_queries_12.jsonl",
    "batches/batch_tasks_queries_13.jsonl",
    "batches/batch_tasks_queries_14.jsonl",
    "batches/batch_tasks_queries_15.jsonl"
]

# List of output file names
output_file_names = [
    "batches/outputs/1_batch_PoC7eDs9NQBh7fjeU24oPLbj_output.jsonl",
    "batches/outputs/2_batch_TtqZkmGuJkvBiDMFlnfYrz26_output.jsonl",
    "batches/outputs/3_batch_CD1D0C8yA7pD8B3Lii8hS80E_output.jsonl",
    "batches/outputs/4_batch_zxAevbs0cwwEArQ6ehIhpxo2_output.jsonl",
    "batches/outputs/5_batch_fmBtKML1oXWpEgt1Wwwe9tEf_output.jsonl",
    "batches/outputs/6_batch_FWgAv510sVRhDmbwYquOtpnF_output.jsonl",
    "batches/outputs/7_batch_Pj2nb7ICOeH3tDX044ZvyXeS_output.jsonl",
    "batches/outputs/8_batch_KWDRxZD0l9HNeFIXeAShabFQ_output.jsonl",
    "batches/outputs/9_batch_0MTrNU0cmxgodLvYKsXK8VON_output.jsonl",
    "batches/outputs/10_batch_7g0csgkSanQPM2sE3EOTo9yc_output.jsonl",
    "batches/outputs/11_batch_KOp7vbxMAYa6rXyYOizUfVel_output.jsonl",
    "batches/outputs/12_batch_yMPPBChPtn5y3QcrhshpMH5O_output.jsonl",
    "batches/outputs/13_batch_bpZzVjOPDXAxL8ISaKK2Bn1N_output.jsonl",
    "batches/outputs/14_batch_oZREwstzByDley0gFHgPtIVp_output.jsonl",
    "batches/outputs/15_batch_BFHNGbckBNadpPEaFtVgmqfj_output.jsonl"
]

In [27]:
# Read the input files one by one and concatenate the results
input_data = read_files(input_file_names)
print("Input Data Sample:")
input_data.head()

Successfully loaded batches/batch_tasks_queries_1.jsonl
Successfully loaded batches/batch_tasks_queries_2.jsonl
Successfully loaded batches/batch_tasks_queries_3.jsonl
Successfully loaded batches/batch_tasks_queries_4.jsonl
Successfully loaded batches/batch_tasks_queries_5.jsonl
Successfully loaded batches/batch_tasks_queries_6.jsonl
Successfully loaded batches/batch_tasks_queries_7.jsonl
Successfully loaded batches/batch_tasks_queries_8.jsonl
Successfully loaded batches/batch_tasks_queries_9.jsonl
Successfully loaded batches/batch_tasks_queries_10.jsonl
Successfully loaded batches/batch_tasks_queries_11.jsonl
Successfully loaded batches/batch_tasks_queries_12.jsonl
Successfully loaded batches/batch_tasks_queries_13.jsonl
Successfully loaded batches/batch_tasks_queries_14.jsonl
Successfully loaded batches/batch_tasks_queries_15.jsonl
Input Data Sample:


Unnamed: 0,custom_id,method,url,body
0,task-0,POST,/v1/chat/completions,"{'model': 'gpt-4o-mini', 'messages': [{'role':..."
1,task-1,POST,/v1/chat/completions,"{'model': 'gpt-4o-mini', 'messages': [{'role':..."
2,task-2,POST,/v1/chat/completions,"{'model': 'gpt-4o-mini', 'messages': [{'role':..."
3,task-3,POST,/v1/chat/completions,"{'model': 'gpt-4o-mini', 'messages': [{'role':..."
4,task-4,POST,/v1/chat/completions,"{'model': 'gpt-4o-mini', 'messages': [{'role':..."


In [28]:
# Read the output files one by one and concatenate the results
output_data = read_files(output_file_names)
print("Output Data Sample:")
output_data.head()

Successfully loaded batches/outputs/1_batch_PoC7eDs9NQBh7fjeU24oPLbj_output.jsonl
Successfully loaded batches/outputs/2_batch_TtqZkmGuJkvBiDMFlnfYrz26_output.jsonl
Successfully loaded batches/outputs/3_batch_CD1D0C8yA7pD8B3Lii8hS80E_output.jsonl
Successfully loaded batches/outputs/4_batch_zxAevbs0cwwEArQ6ehIhpxo2_output.jsonl
Successfully loaded batches/outputs/5_batch_fmBtKML1oXWpEgt1Wwwe9tEf_output.jsonl
Successfully loaded batches/outputs/6_batch_FWgAv510sVRhDmbwYquOtpnF_output.jsonl
Successfully loaded batches/outputs/7_batch_Pj2nb7ICOeH3tDX044ZvyXeS_output.jsonl
Successfully loaded batches/outputs/8_batch_KWDRxZD0l9HNeFIXeAShabFQ_output.jsonl
Successfully loaded batches/outputs/9_batch_0MTrNU0cmxgodLvYKsXK8VON_output.jsonl
Successfully loaded batches/outputs/10_batch_7g0csgkSanQPM2sE3EOTo9yc_output.jsonl
Successfully loaded batches/outputs/11_batch_KOp7vbxMAYa6rXyYOizUfVel_output.jsonl
Successfully loaded batches/outputs/12_batch_yMPPBChPtn5y3QcrhshpMH5O_output.jsonl
Successfully 

Unnamed: 0,id,custom_id,response,error
0,batch_req_8b6CbNmyCH9AO6gQbZUXmLsv,task-0,"{'status_code': 200, 'request_id': '78de2c2333...",
1,batch_req_ifp4wcKyhaI7KRS2QR3nA0SD,task-1,"{'status_code': 200, 'request_id': '75a7f0dfad...",
2,batch_req_nzw0TlP3fAM60IF91dn4uGGH,task-2,"{'status_code': 200, 'request_id': '1bf8565724...",
3,batch_req_45hEMH0HrF4L6ViY1W6LcW2t,task-3,"{'status_code': 200, 'request_id': '0ee60a5eef...",
4,batch_req_YWrN2JqQalx0St8PJdKO349T,task-4,"{'status_code': 200, 'request_id': 'e2c598d0c4...",


In [29]:
input_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87083 entries, 0 to 87082
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   custom_id  87083 non-null  object
 1   method     87083 non-null  object
 2   url        87083 non-null  object
 3   body       87083 non-null  object
dtypes: object(4)
memory usage: 2.7+ MB


In [30]:
output_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87083 entries, 0 to 87082
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         87083 non-null  object 
 1   custom_id  87083 non-null  object 
 2   response   87083 non-null  object 
 3   error      0 non-null      float64
dtypes: float64(1), object(3)
memory usage: 2.7+ MB


In [31]:
# Function to pair improved grammar answers back to their respective questions using custom_id
def pair_questions_with_improved_answers(input_data, output_data):
    qa_pairs = []
    
    # Create a dictionary for output data based on custom_id for faster lookups
    output_dict = {item["custom_id"]: item for item in output_data.to_dict(orient='records')}

    # Iterate through the input data to match each question with its improved answer
    for _, row in input_data.iterrows():
        custom_id = row["custom_id"]
        
        # Extract the question and original answer from the input data
        question = row["body"]["messages"][1]["content"].split("question '")[1].split("',")[0]
        original_answer = row["body"]["messages"][1]["content"].split("question: '")[1].rstrip("'")
        
        # Look up the corresponding improved answer in the output data
        response_item = output_dict.get(custom_id, {})
        
        # If a valid response exists, extract the improved answer
        if response_item and response_item.get("response", {}).get("status_code") == 200:
            improved_answer = response_item["response"]["body"]["choices"][0]["message"]["content"]
            qa_pairs.append({
                "question": question,
                "original_answer": original_answer,
                "improved_answer": improved_answer
            })
    
    # Convert the paired data into a DataFrame
    return pd.DataFrame(qa_pairs)

# Perform the pairing of questions with improved grammar answers
paired_qa_df = pair_questions_with_improved_answers(input_data, output_data)

# Display the first few rows of the paired questions and answers
paired_qa_df.head()

Unnamed: 0,question,original_answer,improved_answer
0,GROUNDNUT VERITIES,GROUNDNUT VERITIES -TAG 24 TG 26,"groundnut varieties include tag 24 and tg 26, ..."
1,ASKED ABOUT SOWING TIME OF GROUND NUT,SOWING TIME OF GROUND NUT-IT GENERALLY CULTIVA...,the sowing time for groundnut is generally fro...
2,TELL ME CONTROL LAT IN GROUNDNUT,SPRAY OF PROFENOFOS 15 ML PER LITER WATER,the control of lat in groundnut involves the s...
3,weed management in groundnut,hand weeding,hand weeding is one of the most effective meth...
4,Farmer wants to know about fungal disease mana...,-- -,fungal disease management in groundnut involve...


In [35]:
print(output_file)

data/kcc_dataset_QA_cleaned_60K_grammar_improved.csv


In [36]:
# Save the final dataset to a CSV file
os.makedirs(data_dir, exist_ok=True)
paired_qa_df.to_csv(output_file, index=False)

In [38]:
from datasets import Dataset, DatasetDict

# Step 1: Assume `paired_qa_df` is already available
# If it isn't, you need to ensure it has columns: 'question', 'original_answer', 'improved_answer'

# Step 2: Split the dataset into train, validation, and test sets
train_df, temp_df = train_test_split(paired_qa_df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Step 3: Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Combine into a DatasetDict for easier management
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

In [39]:
# Save the dataset dict with the name 'kcc_dataset_grammar_improved'
dataset_dict.save_to_disk("data/kcc_dataset_grammar_improved")

Saving the dataset (0/1 shards):   0%|          | 0/69666 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8708 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8709 [00:00<?, ? examples/s]

In [40]:
# Step 4: Push the dataset to the Hugging Face Hub
# Replace 'your_dataset_name' with your desired dataset name
# Replace 'your_username' with your Hugging Face username
dataset_dict.push_to_hub("matovu-ronald/kisan_call_centre_groundnut_crop_QA_dataset_improved_grammar", token="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")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/70 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/matovu-ronald/kisan_call_centre_groundnut_crop_QA_dataset_improved_grammar/commit/487017ad479845527479e56c9a06b31c92dfb7b8', commit_message='Upload dataset', commit_description='', oid='487017ad479845527479e56c9a06b31c92dfb7b8', pr_url=None, pr_revision=None, pr_num=None)