In [24]:
import json
import pandas as pd
from datasets import load_dataset

In [9]:
def load_csv(file_path):
    """Read the CSV at ``file_path`` and return its contents as a DataFrame.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts as its first argument
            (a path string, path object or file-like object).

    Returns:
        The parsed ``pandas.DataFrame``, using ``read_csv`` defaults.
    """
    return pd.read_csv(file_path)

In [10]:
def convert_csv_disk(df, path_out):
    """Write a DataFrame to disk as JSON Lines (one JSON object per row).

    Args:
        df: DataFrame to serialise; each row becomes a dict keyed by column name.
        path_out: Destination file path. Missing parent directories are
            created so the write does not fail on a fresh checkout.

    Returns:
        A list containing only the first record (empty if ``df`` has no rows),
        handy as a quick visual sanity check of the written format.
    """
    import os  # local import: the notebook's import cell does not bring in os

    records = df.to_dict(orient='records')

    # open(..., 'w') raises FileNotFoundError if the target directory is absent.
    parent_dir = os.path.dirname(path_out)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding keeps the output independent of the platform default.
    with open(path_out, 'w', encoding='utf-8') as outfile:
        for record in records:
            json.dump(record, outfile)
            outfile.write('\n')
    return records[:1]

In [11]:
def create_finetune_prompt(start_prompt, df):
    """Build one fine-tuning string per row and return it as the ``text`` column.

    Each output string is ``start_prompt``, the row's ``text`` value, a
    newline, and the row's ``b_class`` value, concatenated in that order.

    Args:
        start_prompt: Instruction text prepended to every requirement.
        df: DataFrame with ``text``, ``class`` and ``b_class`` columns.

    Returns:
        A new DataFrame in which the ``text``, ``class`` and ``b_class``
        columns are replaced by a single ``text`` column (placed last) holding
        the assembled strings. Other columns and the index are preserved.
        The input frame is left unmodified, so re-running a calling cell is
        safe.

    Raises:
        KeyError: if one of the three expected columns is missing.
        TypeError: if a ``text`` or ``b_class`` value is not a string.
    """
    # Iterate by position rather than by label: df["text"][i] would fail or
    # pick the wrong row whenever the index is not 0..n-1 (e.g. after filtering).
    full_text = [
        start_prompt + requirement + "\n" + label
        for requirement, label in zip(df["text"], df["b_class"])
    ]

    # drop() returns a new frame, so the caller's DataFrame is not mutated.
    result = df.drop(columns=['text', 'class', 'b_class'])
    result["text"] = full_text
    return result

In [12]:
# Test load_dataset and see what the downloaded dataset looks like
# dataset = load_dataset('stanfordnlp/imdb')  
# dataset

# PROMISE dataset

In [13]:
# Build the PROMISE fine-tuning file: the train and validation CSVs are merged
# into a single frame, turned into prompt+label strings, and written as JSONL.
df_train = load_csv('processed_promise_train.csv' ) 
df_val = load_csv('processed_promise_val.csv' ) 
# ignore_index=True renumbers the rows 0..n-1 after concatenation.
df = pd.concat([df_train, df_val], ignore_index=True)  
# Instruction prefix placed in front of every requirement text.
start_prompt = "As a senior software engineer who is experienced in software requirement classification, please classify the given requirement into 'functional requirement' or 'non-functional requirement'. \nThe given requirement: "
df_finetune = create_finetune_prompt(start_prompt, df)
path_out = 'llama_training_data/llama_processed_promise_train_val.jsonl'
# Last expression of the cell: the returned first record is displayed as the cell output.
convert_csv_disk(df_finetune, path_out)

[{'text': "As a senior software engineer who is experienced in software requirement classification, please classify the given requirement into 'functional requirement' or 'non-functional requirement'. \nThe given requirement: 'Product shall allow entering storing and modifying product menus used by POS terminals.'\nFunctional"}]

In [14]:
# Build the PROMISE test file: same prompt construction as the training data,
# applied to the test CSV and written to its own JSONL file.
df_test = load_csv('processed_promise_test.csv' ) 
# Instruction prefix placed in front of every requirement text.
start_prompt = "As a senior software engineer who is experienced in software requirement classification, please classify the given requirement into 'functional requirement' or 'non-functional requirement'. \nThe given requirement: "
df_finetune_test = create_finetune_prompt(start_prompt, df_test)
path_out = 'llama_training_data/llama_processed_promise_test.jsonl'
# Last expression of the cell: the returned first record is displayed as the cell output.
convert_csv_disk(df_finetune_test, path_out)

[{'text': "As a senior software engineer who is experienced in software requirement classification, please classify the given requirement into 'functional requirement' or 'non-functional requirement'. \nThe given requirement: 'The system shall be able to display a printable summary for individual cohorts which will include the students enlisted the Program of study sequence of classes cohort progress through the program and timeline of completion.'\nFunctional"}]

In [15]:
# Sanity check: read the two PROMISE JSONL files back through
# datasets.load_dataset (the "json" builder) as "train" and "test" splits
# and print the resulting DatasetDict.
train_file = 'llama_training_data/llama_processed_promise_train_val.jsonl'
test_file = 'llama_training_data/llama_processed_promise_test.jsonl'
raw_dataset = load_dataset(
    "json",
    data_files={
        "train": train_file,
        "test": test_file
    },
)
print(raw_dataset)
#print(loaded_data)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 489
    })
    test: Dataset({
        features: ['text'],
        num_rows: 123
    })
})


# PURE dataset

In [16]:
# Build the PURE fine-tuning file: the train and validation CSVs are merged
# into a single frame, turned into prompt+label strings, and written as JSONL.
# NOTE: this reassigns df_train, df_val, df, df_finetune and path_out from the PROMISE cells.
df_train = load_csv('processed_pure_train.csv' ) 
df_val = load_csv('processed_pure_val.csv' ) 
# ignore_index=True renumbers the rows 0..n-1 after concatenation.
df = pd.concat([df_train, df_val], ignore_index=True)  
# Instruction prefix placed in front of every requirement text.
start_prompt = "As a senior software engineer who is experienced in software requirement classification, please classify the given requirement into 'functional requirement' or 'non-functional requirement'. \nThe given requirement: "
df_finetune = create_finetune_prompt(start_prompt, df)
path_out = 'llama_training_data/llama_processed_pure_train_val.jsonl'
# Last expression of the cell: the returned first record is displayed as the cell output.
convert_csv_disk(df_finetune, path_out)

[{'text': "As a senior software engineer who is experienced in software requirement classification, please classify the given requirement into 'functional requirement' or 'non-functional requirement'. \nThe given requirement: User shall have confidence in the system's performance as it supports multiple users simultaneously.\nNon-functional"}]

In [17]:
# Build the PURE test file: same prompt construction as the training data,
# applied to the test CSV and written to its own JSONL file.
df_test = load_csv('processed_pure_test.csv' ) 
# Instruction prefix placed in front of every requirement text.
start_prompt = "As a senior software engineer who is experienced in software requirement classification, please classify the given requirement into 'functional requirement' or 'non-functional requirement'. \nThe given requirement: "
df_finetune_test = create_finetune_prompt(start_prompt, df_test)
path_out = 'llama_training_data/llama_processed_pure_test.jsonl'
# Last expression of the cell: the returned first record is displayed as the cell output.
convert_csv_disk(df_finetune_test, path_out)

[{'text': "As a senior software engineer who is experienced in software requirement classification, please classify the given requirement into 'functional requirement' or 'non-functional requirement'. \nThe given requirement: System shall utilize SSL (Secured Socket Layer) certificate for protection.\nNon-functional"}]

In [18]:
# Sanity check: read the two PURE JSONL files back through
# datasets.load_dataset (the "json" builder) as "train" and "test" splits
# and print the resulting DatasetDict.
# NOTE: this reassigns raw_dataset, replacing the PROMISE dataset loaded earlier.
train_file = 'llama_training_data/llama_processed_pure_train_val.jsonl'
test_file = 'llama_training_data/llama_processed_pure_test.jsonl'
raw_dataset = load_dataset(
    "json",
    data_files={
        "train": train_file,
        "test": test_file
    },
)
print(raw_dataset)
#print(loaded_data)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 5477
    })
    test: Dataset({
        features: ['text'],
        num_rows: 609
    })
})


In [26]:
# Reload only the PROMISE test JSONL as a single "test" split.
# This reassigns raw_dataset, so from here on it holds just that split.
test_file = 'llama_training_data/llama_processed_promise_test.jsonl'
raw_dataset = load_dataset(
    "json",
    data_files={
        "test": test_file
    },
)

Generating test split: 0 examples [00:00, ? examples/s]

In [37]:
# Recover the bare prompts (without the gold label) from the test samples.
#
# Each sample ends with its label ("Functional" or "Non-functional"). The label
# is removed with an explicit suffix check: str.strip(chars) treats its argument
# as a *set of characters* to trim from both ends, not as a suffix, so it can
# eat into the prompt itself whenever the surrounding text happens to start or
# end with one of those characters.
test_sample_list = raw_dataset["test"]['text']


def _remove_label_suffix(sample, labels=("Non-functional", "Functional")):
    """Return ``sample`` without its trailing label; unchanged if no label matches."""
    for label in labels:
        if sample.endswith(label):
            return sample[:-len(label)]
    return sample


prompt_list = [_remove_label_suffix(text) for text in test_sample_list]
# Spot-check a few prompts; index 20 assumes the split has at least 21 rows.
print(prompt_list[0], prompt_list[1], prompt_list[20])

As a senior software engineer who is experienced in software requirement classification, please classify the given requirement into 'functional requirement' or 'non-functional requirement'. 
The given requirement: 'The system shall be able to display a printable summary for individual cohorts which will include the students enlisted the Program of study sequence of classes cohort progress through the program and timeline of completion.'
 As a senior software engineer who is experienced in software requirement classification, please classify the given requirement into 'functional requirement' or 'non-functional requirement'. 
The given requirement: 'When a ship is sunk the product shall simulate the sound of a sinking ship.'
 As a senior software engineer who is experienced in software requirement classification, please classify the given requirement into 'functional requirement' or 'non-functional requirement'. 
The given requirement: 'The product should be able to be used by 90% of no