# load the datasets

In [1]:
import csv  
import pandas as pd  
import numpy as np
from sklearn.model_selection import train_test_split  
from few_shot_list_creation import *

%reload_ext autoreload
%autoreload 2

  from tqdm.autonotebook import tqdm, trange


In [2]:
# def load_csv(file_path):  
#     data = []  
#     with open(file_path, 'r') as file:  
#         csv_reader = csv.reader(file)  
#         for row in csv_reader:  
#             data.append(row)  
#     return data  

# file_path = 'promise-km-100-f.csv'  
# loaded_data = load_csv(file_path)  

## load the promise

In [90]:
def load_csv(file_path):  
    data = pd.read_csv(file_path)  
    return data  


In [91]:
file_path = 'promise-km-100-f.csv'  
loaded_data = load_csv(file_path) 

In [92]:
loaded_data["Class"].unique()

array(['PE', 'LF', 'US', 'A', 'SE', 'F', 'FT', 'SC', 'PO', 'O', 'L', 'MN'],
      dtype=object)

In [93]:
## create a new column for binary FR or NFR classification
loaded_data_class = loaded_data[["RequirementText", "Class"]]
loaded_data_class["b_class"] = "F"

#selected_rows = loaded_data_class.loc[loaded_data_class['Class'] != 'FR']
loaded_data_class.loc[loaded_data_class['Class'] != 'F', "b_class"] = "NF"

loaded_data_class[:10]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  loaded_data_class["b_class"] = "F"


Unnamed: 0,RequirementText,Class,b_class
0,'The system shall refresh the display every 60...,PE,NF
1,'The application shall match the color of the ...,LF,NF
2,'If projected the data must be readable. On a ...,US,NF
3,'The product shall be available during normal ...,A,NF
4,'If projected the data must be understandable....,US,NF
5,'The product shall ensure that it can only be ...,SE,NF
6,'The product shall be intuitive and self-expla...,US,NF
7,'The product shall respond fast to keep up-to-...,PE,NF
8,'The system shall have a MDI form that allows ...,F,F
9,'The system shall display Events in a vertical...,F,F


In [94]:
# change abbreviations to textual requirement, e.g. F -> Functional, PE -> Performance, 
loaded_data_class.loc[loaded_data_class['b_class'] == 'F', "b_class"] = "Functional"
loaded_data_class.loc[loaded_data_class['b_class'] == 'NF', "b_class"] = "Non-functional"
loaded_data_class.loc[loaded_data_class['Class'] == 'F', "Class"] = "Functional"
loaded_data_class.loc[loaded_data_class['Class'] == 'A', "Class"] = "Availability"
loaded_data_class.loc[loaded_data_class['Class'] == 'FT', "Class"] = "Fault Tolerance"
loaded_data_class.loc[loaded_data_class['Class'] == 'LF', "Class"] = "Look and Feel"
loaded_data_class.loc[loaded_data_class['Class'] == 'MN', "Class"] = "Maintainability"
loaded_data_class.loc[loaded_data_class['Class'] == 'O', "Class"] = "Operational"
loaded_data_class.loc[loaded_data_class['Class'] == 'PE', "Class"] = "Performance"
loaded_data_class.loc[loaded_data_class['Class'] == 'PO', "Class"] = "Portability"
loaded_data_class.loc[loaded_data_class['Class'] == 'SC', "Class"] = "Scalability"
loaded_data_class.loc[loaded_data_class['Class'] == 'SE', "Class"] = "Security"
loaded_data_class.loc[loaded_data_class['Class'] == 'US', "Class"] = "Usability"
loaded_data_class.loc[loaded_data_class['Class'] == 'RE', "Class"] = "Reliability"

loaded_data_class[:10]

Unnamed: 0,RequirementText,Class,b_class
0,'The system shall refresh the display every 60...,Performance,Non-functional
1,'The application shall match the color of the ...,Look and Feel,Non-functional
2,'If projected the data must be readable. On a ...,Usability,Non-functional
3,'The product shall be available during normal ...,Availability,Non-functional
4,'If projected the data must be understandable....,Usability,Non-functional
5,'The product shall ensure that it can only be ...,Security,Non-functional
6,'The product shall be intuitive and self-expla...,Usability,Non-functional
7,'The product shall respond fast to keep up-to-...,Performance,Non-functional
8,'The system shall have a MDI form that allows ...,Functional,Functional
9,'The system shall display Events in a vertical...,Functional,Functional


In [95]:
loaded_data_class["Class"].unique(), loaded_data_class.loc[loaded_data_class['Class'] == 'L'].shape

(array(['Performance', 'Look and Feel', 'Usability', 'Availability',
        'Security', 'Functional', 'Fault Tolerance', 'Scalability',
        'Portability', 'Operational', 'L', 'Maintainability'], dtype=object),
 (13, 3))

In [99]:
# see the text of category "l"
for text in loaded_data_class[(loaded_data_class['Class'] == 'L')]['RequirementText']:
    print(text)

'The Disputes application must conform to the legal requirements as specified by the Merchant Operating Regulations.'
'All business rules specified in the Disputes System shall be in compliance with the Merchant Operating Regulations.'
'The Disputes application must conform to the legal requirements as specified by Regulation E and Regulation Z that govern credit card disputes processing.'
'All business rules specified in the Disputes System shall be in compliance to the guidelines of Regulation E and Regulation Z.'
'The Disputes application must maintain a detailed history of every action that a user takes on a dispute case. This ensures a complete audit trail if questions arise later on with regard to a particular dispute case.'
'All actions that modify an existing dispute case must be recorded in the case history.'
'The product must comply with Sarbanes-Oxley.'
'The product shall comply with the estimatics laws relating to recycled parts usage.'
'The product shall comply with insura

In [9]:
# Drop rows with specific values  
loaded_data_class = loaded_data_class[~(loaded_data_class['Class'] == 'L')]  

In [10]:
loaded_data_class.rename(columns={'RequirementText': 'text','Class': 'class'}, inplace=True)  

In [11]:
loaded_data_class.to_csv('processed_promise.csv', index=False)  

### train test split

In [12]:
file_path = 'processed_promise.csv'  
X = load_csv(file_path)  
X.shape

(612, 3)

In [13]:
# Assuming your dataset is stored in X  
X_train, X_temp = train_test_split(X, test_size=0.4, random_state=42)  

# Further split the temporary data into validation and test data  
X_val, X_test = train_test_split(X_temp, test_size=0.5, random_state=42)  
  

In [14]:
X_train.to_csv('processed_promise_train.csv', index=False)
X_val.to_csv('processed_promise_val.csv', index=False) 
X_test.to_csv('processed_promise_test.csv', index=False)  

### new multimodal dataset based on ISO/IEC 25010

In [155]:
file_path = 'processed_promise.csv'  
df = load_csv(file_path)  
df['class'].value_counts()

class
Functional         255
Usability           67
Security            66
Operational         62
Performance         54
Look and Feel       38
Availability        21
Scalability         21
Maintainability     17
Fault Tolerance     10
Portability          1
Name: count, dtype: int64

In [149]:
# df_promise_new = df[df["class"].isin(["Usability","Security","Operational","Performance"]) ]
# print (df_promise_new['class'].value_counts(), len(df) )
# df_promise_new.to_csv('processed_promise_new.csv', index=False)


# # Assuming your dataset is stored in X  
# X_train, X_temp = train_test_split(df_promise_new, test_size=0.4, random_state=42)  
# # Further split the temporary data into validation and test data  
# X_val, X_test = train_test_split(X_temp, test_size=0.5, random_state=42)  

# X_train.to_csv('processed_promise_new_train.csv', index=False)
# X_val.to_csv('processed_promise_new_val.csv', index=False) 
# X_test.to_csv('processed_promise_new_test.csv', index=False)  

class
Usability      67
Security       66
Operational    62
Performance    54
Name: count, dtype: int64 612


In [137]:
# in iso:    ["Performance", "Usability", "Security", "Functional", "Portability", "Maintainability"]
#not in iso: ["Look and Feel", "Availability", "Fault Tolerance", "Scalability", "Operational" ]


# "Performance" to "Performance Efficiency"
# "Look and Feel" to "Usability", 
# "Availability", "Fault Tolerance" to "Reliability"
# "Scalability" to "Performance Efficiency"

# get overview of "operational" requirement
# operational = df[(df['class'] == 'Operational')]['text']  
# for i, sentence in enumerate(operational):
#     print(sentence)
#     if i % 10 == 9: 
#         print("--------------------------------------")

In [156]:
#### keep the same
df.loc[df['class'] == 'Functional', "class"] = "Functional Suitability"
df.loc[df['class'] == 'Availability', "class"] = "Reliability"
df.loc[df['class'] == 'Operational', "class"] = "Compatibility"
df.loc[df['class'] == 'Scalability', "class"] = "Performance Efficiency"

#### delete directly
df.loc[df['class'] == 'Performance', "class"] = "Performance Efficiency"
df.loc[df['class'] == 'Look and Feel', "class"] = "Usability"
#
df.loc[df['class'] == 'Fault Tolerance', "class"] = "Reliability"
df.to_csv('processed_promise_new.csv', index=False)


In [157]:
df['class'].unique()

array(['Performance Efficiency', 'Usability', 'Reliability', 'Security',
       'Functional Suitability', 'Portability', 'Compatibility',
       'Maintainability'], dtype=object)

In [158]:
len(df), df[(df['class'] == 'Portability')]

(612,
                                                  text        class  \
 53  'The product is expected to run on Windows CE ...  Portability   
 
            b_class  
 53  Non-functional  )

In [159]:
df = df.drop(df[df['class'] == 'Portability'].index)  
df['class'].value_counts(), len(df)

(class
 Functional Suitability    255
 Usability                 105
 Performance Efficiency     75
 Security                   66
 Compatibility              62
 Reliability                31
 Maintainability            17
 Name: count, dtype: int64,
 611)

In [160]:
# Assuming your dataset is stored in X  
X_train, X_temp = train_test_split(df, test_size=0.4, random_state=42)  
# Further split the temporary data into validation and test data  
X_val, X_test = train_test_split(X_temp, test_size=0.5, random_state=42)  

X_train.to_csv('processed_promise_new_train.csv', index=False)
X_val.to_csv('processed_promise_new_val.csv', index=False) 
X_test.to_csv('processed_promise_new_test.csv', index=False)  

# load PURE

In [15]:
# Specify the path of the Excel file  
excel_file = 'FR_NFR_Dataset.xlsx'    
loaded_data = pd.read_excel(excel_file)  

print(loaded_data)  
print(loaded_data["Type"].unique() )

                                       Requirement Text Type
0     The app shall run on a smart phone with Androi...  NFR
1     All layout shall be according to the TU/e corp...  NFR
2     The icons shall be according to the Android Ic...  NFR
3     The user interfaces shall be according to the ...  NFR
4     The Twitter "tweet" interface shall be accordi...  NFR
...                                                 ...  ...
6112  The system shall implement stringent access co...  NFR
6113  The system shall enforce robust user authentic...  NFR
6114  The system shall ensure continuous availabilit...  NFR
6115  The system shall maximize security measures to...  NFR
6116  The system shall restrict access to maintainer...  NFR

[6117 rows x 2 columns]
['NFR' 'FR' nan]


In [16]:
loaded_data_class = loaded_data
loaded_data_class.rename(columns={'Requirement Text': 'text','Type': 'b_class'}, inplace=True)  

In [17]:
loaded_data_class.loc[loaded_data_class['b_class'] == 'FR', "b_class"] = "Functional"
loaded_data_class.loc[loaded_data_class['b_class'] == 'NFR', "b_class"] = "Non-functional"


In [18]:
loaded_data_class['b_class'].value_counts(), loaded_data_class['b_class'].unique()

(b_class
 Functional        3964
 Non-functional    2122
 Name: count, dtype: int64,
 array(['Non-functional', 'Functional', nan], dtype=object))

In [19]:
print( loaded_data_class[(loaded_data_class['b_class'].isna())]  )
loaded_data_class = loaded_data_class.drop(loaded_data_class[loaded_data_class['b_class'].isna()].index)  

                                                   text b_class
1175  Program Administrators/Nursing Staff Members s...     NaN
1176  The Disputes application shall comply with the...     NaN
1177  The system shall ensure that all screens creat...     NaN
1178  The list of dispute cases after a search shall...     NaN
1179  100% of the card member and merchant services ...     NaN
1180  100% of card member services representatives s...     NaN
1181  100% of merchant services representatives shal...     NaN
1182  The maximum wait time for a user navigating fr...     NaN
1183  The Disputes application shall support 350 con...     NaN
1184  The Disputes application shall be available 24...     NaN
1185  To resolve disputes, the Disputes application ...     NaN
1186  The Disputes application shall retrieve all ca...     NaN
1187  Disputes applications shall retrieve all merch...     NaN
1188  The Disputes application shall send all letter...     NaN
1189  The Disputes application shall int

In [20]:
print( loaded_data_class[(loaded_data_class['b_class'].isna())] )
print( len(loaded_data_class), loaded_data_class['b_class'].unique() )

Empty DataFrame
Columns: [text, b_class]
Index: []
6086 ['Non-functional' 'Functional']


In [21]:
# Define a function to strip quotes
def strip_quotes(series):
    return series.str.strip('"')

# Apply this function to all columns
loaded_data_class =loaded_data_class.apply(strip_quotes)

In [22]:
#add a ['class'] column to make sure that the prompt construction code feasible, e.g. few_shot_list[requirement_index][i][2]
loaded_data_class["tem_class"]= loaded_data_class ["b_class"]
loaded_data_class.rename(columns={'b_class': 'class','tem_class': 'b_class'}, inplace=True)  

In [23]:
loaded_data_class.to_csv('processed_pure.csv', index=False)  

## train test split

In [24]:
file_path = 'processed_pure.csv'  
X = pd.read_csv(file_path , sep=",")  
X.shape, X


((6086, 3),
                                                    text           class  \
 0     The app shall run on a smart phone with Androi...  Non-functional   
 1     All layout shall be according to the TU/e corp...  Non-functional   
 2     The icons shall be according to the Android Ic...  Non-functional   
 3     The user interfaces shall be according to the ...  Non-functional   
 4     The Twitter "tweet" interface shall be accordi...  Non-functional   
 ...                                                 ...             ...   
 6081  The system shall implement stringent access co...  Non-functional   
 6082  The system shall enforce robust user authentic...  Non-functional   
 6083  The system shall ensure continuous availabilit...  Non-functional   
 6084  The system shall maximize security measures to...  Non-functional   
 6085  The system shall restrict access to maintainer...  Non-functional   
 
              b_class  
 0     Non-functional  
 1     Non-functional  
 2

In [25]:
# Assuming your dataset is stored in X  
X_train, X_temp = train_test_split(X, test_size=0.2, random_state=42)  

# Further split the temporary data into validation and test data  
X_val, X_test = train_test_split(X_temp, test_size=0.5, random_state=42)  

In [26]:
X_train.to_csv('processed_pure_train.csv', index=False)
X_val.to_csv('processed_pure_val.csv', index=False) 
X_test.to_csv('processed_pure_test.csv', index=False)  

In [27]:
pd.read_csv('processed_pure_test.csv' , sep=",")[580:] 


Unnamed: 0,text,class,b_class
580,The system shall allow users to create boards ...,Functional,Functional
581,User shall/will be able to perform online bala...,Functional,Functional
582,FABS user shall ensure historical data include...,Non-functional,Non-functional
583,The system shall allow a member of the network...,Functional,Functional
584,System shall automatically move some emails to...,Functional,Functional
585,System shall provide electronic course study m...,Functional,Functional
586,The system shall provide a Linked Data interfa...,Functional,Functional
587,The application will require well-defined test...,Non-functional,Non-functional
588,User shall/will have sensitive data encrypted ...,Non-functional,Non-functional
589,System shall allow dataset developers to assig...,Non-functional,Non-functional


## train test split

In [33]:
file_path = 'processed_nfr_so.csv'  
X = load_csv(file_path)  
X.shape

(17434, 2)

In [34]:
# Assuming your dataset is stored in X  
X_train, X_temp = train_test_split(X, test_size=0.4, random_state=42)  

# Further split the temporary data into validation and test data  
X_val, X_test = train_test_split(X_temp, test_size=0.5, random_state=42)  
  

In [35]:
#X_train.size, X_val.size, X_test.size

In [36]:
X_train.to_csv('processed_nfr_so_train.csv', index=False)
X_val.to_csv('processed_nfr_so_val.csv', index=False) 
X_test.to_csv('processed_nfr_so_test.csv', index=False)  

## get the few_shot_list

In [37]:
get_random_few_shot_list("promise")
array = np.load('./few_shot_list/promise_random.npy')  
array.shape

(123, 160, 3)

In [38]:
get_embedding_few_shot_list("promise")
array = np.load('./few_shot_list/promise_embedding.npy')  
array.shape



(123, 160, 3)

In [39]:
get_tfidf_few_shot_list("promise")
array = np.load('./few_shot_list/promise_tfidf.npy')  
array.shape

(123, 160, 3)

In [40]:
get_random_few_shot_list("pure")
array = np.load('./few_shot_list/pure_random.npy')  
array.shape

(609, 160, 3)

In [41]:
get_embedding_few_shot_list("pure")
array = np.load('./few_shot_list/pure_embedding.npy')  
array.shape

(609, 160, 3)

In [42]:
get_tfidf_few_shot_list("pure")
array = np.load('./few_shot_list/pure_tfidf.npy')  
array.shape

(609, 160, 3)