In [None]:
import re
import time
import torch
import string
import numpy as np
import pandas as pd
import seaborn as sn
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
# Read the PROMISE requirements file (decoded as latin1) and preview its first two rows.
df = pd.read_csv(
    "PROMISE.csv",
    encoding="latin1",
)
df.head(2)

In [None]:
# Keep only the requirement text and its class label, working on an independent copy.
columns = ['RequirementText', '_class_']
df = df.loc[:, columns].copy()

# Report whether any requirement text is missing, then discard every row that
# has a missing value in either kept column and show the resulting shape.
print(df["RequirementText"].isnull().values.any())
df = df.dropna(axis=0)
print(df.shape)

# Bar chart of how many requirements belong to each class, most frequent first.
class_counts = df["_class_"].value_counts()
class_counts_sorted = class_counts.sort_values(ascending=False)
colors = ['skyblue', 'orange', 'green', 'red', 'purple', 'yellow', 'pink', 'cyan', 'magenta', 'lime']
plt.figure(figsize=(10,6))
bars = class_counts_sorted.plot.bar(color=colors)

# Write each count just above its bar, horizontally centred on the bar.
for patch, count in zip(bars.patches, class_counts_sorted.values):
    x_center = patch.get_x() + patch.get_width() / 2
    y_above = patch.get_height() + 0.1
    bars.text(x_center, y_above, count, ha='center', va='bottom')

# `bars` is the Axes the plot was drawn on, so label it directly.
bars.set_title('The number of classes in the data')
bars.set_xlabel('class name')
bars.set_ylabel('Number')
plt.xticks(rotation=45)
plt.show()

import contractions

# One case-insensitive alternation over every key of `contractions.contractions_dict`,
# wrapped in word boundaries. It is built a single time here instead of once per row,
# and each key is escaped so it is matched literally rather than as regex syntax.
_CONTRACTIONS_PATTERN = re.compile(
    r"\b(?:{})\b".format(
        "|".join(re.escape(key) for key in contractions.contractions_dict.keys())
    ),
    flags=re.IGNORECASE,
)

def count_contractions(text):
    """Count the contractions found in `text`.

    Parameters
    ----------
    text : str
        The text to scan.

    Returns
    -------
    int
        Number of non-overlapping matches of any key of
        `contractions.contractions_dict`, ignoring case.
    """
    return len(_CONTRACTIONS_PATTERN.findall(text))

# Per-row contraction count, then the total over the whole dataset.
df['contraction_count'] = df['RequirementText'].apply(count_contractions)
column_sum = df['contraction_count'].sum()
print("Number Of Contractions " + str(column_sum))

def expand_contractions(text):
    """Return `text` after passing it through `contractions.fix`."""
    expanded = contractions.fix(text)
    return expanded

# The cleaned text lives in a new 'text' column; 'RequirementText' is left untouched.
df['text'] = df['RequirementText'].map(expand_contractions)

# Normalise case first.
df['text'] = df['text'].str.lower()

# Then delete every character listed in `string.punctuation` (nothing is put in its place).
df['text'] = df['text'].apply(
    lambda x: x.translate(str.maketrans('', '', string.punctuation))
)


from nltk.corpus import stopwords

# NLTK's English stop-word list, extended with 'subject' and 'http'.
stop_words = set(stopwords.words('english')) | {'subject', 'http'}

def remove_stopwords(text):
    """Return `text` without the whitespace-separated words that appear in `stop_words`.

    The input is converted with `str()` first; the surviving words are re-joined
    with single spaces.
    """
    kept_words = (word for word in str(text).split() if word not in stop_words)
    return " ".join(kept_words)

df['text'] = df['text'].apply(remove_stopwords)

# Squeeze any run of spaces down to a single space, then preview the result.
df['text'] = df['text'].apply(lambda x: re.sub(' +', ' ', x))
df.head(2)

In [None]:
from transformers import BertModel, BertTokenizer

# Load the pretrained 'bert-base-uncased' model and its tokenizer, and report
# the tokenizer's vocabulary size.
model = BertModel.from_pretrained('bert-base-uncased')
bertTokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
vocab_size = bertTokenizer.vocab_size
print("Vocabulary Size:", vocab_size)


def tokenize_text(text):
    """Split `text` into tokens with the module-level `bertTokenizer`."""
    return bertTokenizer.tokenize(text)


df['Tokenized_Text'] = df['text'].apply(tokenize_text)
df.head()


def Convert_Tokens_To_Ids(tokens):
    """Translate a list of tokens into vocabulary ids with `bertTokenizer`."""
    return bertTokenizer.convert_tokens_to_ids(tokens)


df['Tokens_Ids'] = df['Tokenized_Text'].apply(Convert_Tokens_To_Ids)
df.head()

# Save the frame (without the index) so later cells can start from this file.
df.to_csv('Ids_BERT_nfr.csv', index=False)

In [None]:
# Reload the intermediate file written by `df.to_csv('Ids_BERT_nfr.csv', index=False)`.
# `to_csv` was called without an `encoding` argument, so the file is UTF-8; reading it
# back as latin1 would mis-decode any non-ASCII characters in the text columns.
df = pd.read_csv("Ids_BERT_nfr.csv", encoding='utf-8')
df.head(2)

In [None]:
import ast
def Convert_StrIds_To_ListIds(string_list):
    """Parse the text of a Python literal (e.g. ``"[101, 2023]"``) into the object it denotes.

    Uses `ast.literal_eval`, so only literal syntax is accepted.
    """
    return ast.literal_eval(string_list)

# Register tqdm's `progress_apply` on pandas objects so the conversion below shows a progress bar.
tqdm.pandas()

# 'Tokens_Ids' holds strings after the CSV round trip; parse each cell back into a Python object.
df['Tokens_Ids'] = (
    df['Tokens_Ids']
    .progress_apply(Convert_StrIds_To_ListIds)
)
df.head(2)

In [None]:
import torch
from tqdm import tqdm
from transformers import BertModel, BertTokenizer

# Pretrained 'bert-base-uncased' model and tokenizer used by the embedding step below.
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def Convert_IDs_To_Vector(input_ids):
    """Run a sequence of token ids through the module-level `model` and return its hidden states.

    Parameters
    ----------
    input_ids : list of int
        Token ids for one text.

    Returns
    -------
    list or None
        `last_hidden_state[0]` of the model output converted to nested Python
        lists (presumably one vector per token id -- confirm against the model's
        documentation), or None when `input_ids` is empty.

    Notes
    -----
    Reads the global name `model`. That name is rebound to a Keras classifier
    further down this notebook, so this function must run while `model` still
    refers to the BERT model.
    """
    # Nothing to embed. The original message appended an undefined global
    # `count`, which raised NameError here instead of returning None.
    if not input_ids:
        print("Input_ids is empty.")
        return None

    # Add a leading batch dimension of size 1.
    batch = torch.tensor(input_ids).unsqueeze(0)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(batch)

    # Drop the batch dimension and convert to plain Python lists.
    return outputs.last_hidden_state[0].tolist()

# Make `progress_apply` available on pandas objects (shows a progress bar).
tqdm.pandas()

# Embed each row's token ids and store the result in a new 'Tokens_vectors' column.
df['Tokens_vectors'] = (
    df['Tokens_Ids']
    .progress_apply(Convert_IDs_To_Vector)
)
df.head()


In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_fscore_support

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

from keras.models import Sequential
from keras.layers import Input, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Flatten, Dropout, Bidirectional, LSTM
from keras.models import Model

In [None]:
# Rows without an embedding (None) cannot be measured with len() or padded, so
# leave them out of the split. When every row has an embedding this selects the
# whole frame and the split below is unchanged.
has_vectors = df['Tokens_vectors'].notna()
if not has_vectors.all():
    print("Dropping", int((~has_vectors).sum()), "rows without embeddings")
features = df.loc[has_vectors, 'Tokens_vectors']
labels = df.loc[has_vectors, '_class_']

# Train-test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.20, random_state=0)

# Encode labels. The encoder is fit on every label, not just the training ones:
# LabelEncoder.transform raises ValueError on a label it has not seen, which
# would happen whenever a rare class ends up only in the test split.
label_encoder = LabelEncoder()
label_encoder.fit(labels)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
num_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(y_train_encoded, num_classes=num_classes)
y_test_categorical = to_categorical(y_test_encoded, num_classes=num_classes)

# Pad sequences at the end up to the longest training sequence; the same length
# is applied to the test sequences.
max_sequence_length = max(len(seq) for seq in X_train)
X_train_padded = pad_sequences(X_train, maxlen=max_sequence_length, padding='post', dtype='float32')
X_test_padded = pad_sequences(X_test, maxlen=max_sequence_length, padding='post', dtype='float32')

print(X_train_padded.shape)

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense, GlobalMaxPooling1D

# GRU classifier: the input shape is taken from axes 1 and 2 of the padded
# training array; the output layer has one softmax unit per class.
model = Sequential([
    Input(shape=(X_train_padded.shape[1], X_train_padded.shape[2])),
    GRU(256, return_sequences=True),
    GlobalMaxPooling1D(),
    Dense(512, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

# Compile for one-hot targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Show the layer summary.
model.summary()

# Train; the padded test split is passed as the validation data.
history = model.fit(
    X_train_padded,
    y_train_categorical,
    epochs=50,
    batch_size=64,
    validation_data=(X_test_padded, y_test_categorical),
)

# Score the trained model on the test split.
loss, accuracy = model.evaluate(X_test_padded, y_test_categorical)
print("Test Accuracy:", accuracy)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# Run inference once per split and reuse the predictions for every metric below
# (previously the training set was predicted four times and the test set twice).
y_pred_probs = model.predict(X_test_padded)
y_pred = np.argmax(y_pred_probs, axis=-1)
y_train_pred = np.argmax(model.predict(X_train_padded), axis=-1)

# Fixed list of class ids and their original names. Passing these explicitly
# keeps the confusion matrix and the report at num_classes entries even when a
# class is missing from the test split.
class_ids = np.arange(num_classes)
class_names = [str(name) for name in label_encoder.classes_]

# Per-epoch training accuracy recorded by fit()
train_accuracy = history.history['accuracy']

# Macro-averaged precision / recall / F1 for the train split
train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(
    y_train_encoded, y_train_pred, average='macro')
# Macro-averaged precision / recall / F1 for the test split
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
    y_test_encoded, y_pred, average='macro')

print("Train Accuracy:", train_accuracy)
print("Train Precision:", train_precision)
print("Train Recall:", train_recall)
print("Train F1 Score:", train_f1)
print("Test Precision:", test_precision)
print("Test Recall:", test_recall)
print("Test F1 Score:", test_f1)

# Evaluate model
loss, accuracy = model.evaluate(X_test_padded, y_test_categorical)
print("Test Accuracy:", accuracy)

# Weighted-average metrics; these replace the macro values above and are the
# ones drawn in the comparison bar charts.
train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(
    y_train_encoded, y_train_pred, average='weighted')
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
    y_test_encoded, y_pred, average='weighted')

# Plotting Accuracy
plt.figure(figsize=(12, 6))
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.savefig('model_accuracy.png')
plt.show()

# Plotting F1 Score, Recall, Precision for Train and Test sets
metrics = {'Train': (train_precision, train_recall, train_f1), 'Test': (test_precision, test_recall, test_f1)}

# Each tuple in `metrics` is ordered (precision, recall, f1), matching this list.
for position, metric in enumerate(['Precision', 'Recall', 'F1 Score']):
    plt.figure(figsize=(12, 6))
    plt.bar(list(metrics.keys()),
            [metrics['Train'][position], metrics['Test'][position]],
            color=['blue', 'orange'])
    plt.title(f'{metric} Comparison')
    plt.xlabel('Dataset')
    plt.ylabel(metric)
    plt.savefig(f'{metric.lower()}_comparison.png')
    plt.show()

# Plotting Confusion Matrix (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(y_test_encoded, y_pred, labels=class_ids)
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig('confusion_matrix.png')
plt.show()

# Classification Report for each class. `labels` is required here: with
# `target_names` alone, the report raises ValueError when the number of classes
# present in the data differs from the number of names.
print("Classification Report for each class:")
print(classification_report(y_test_encoded, y_pred, labels=class_ids,
                            target_names=class_names, zero_division=0))