In [None]:
import re
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import contractions

In [None]:
# Load data
# Read the CSV, keep only the requirement text and its class label, and drop
# every row in which either of those two fields is missing.
columns = ['RequirementText', '_class_']
df = (
    pd.read_csv("PROMISE.csv", encoding='latin1')[columns]
    .dropna(axis=0)
    .copy()
)

In [None]:
# Data cleaning
# Build the contraction-matching pattern once instead of re-joining and
# re-formatting it for every row. Keys are escaped so that any regex
# metacharacter they contain is matched literally.
contraction_pattern = re.compile(
    r"\b(?:{})\b".format("|".join(map(re.escape, contractions.contractions_dict.keys())))
)
df['contraction_count'] = df['RequirementText'].apply(lambda x: len(contraction_pattern.findall(x)))

# Expand contractions on the raw text, then lower-case the result.
df['text'] = df['RequirementText'].apply(contractions.fix)
df['text'] = df['text'].str.lower()

# Remove every character in string.punctuation (characters are deleted, not
# replaced by a space).
punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))
df['text'] = df['text'].apply(lambda x: punctuation_pattern.sub('', x))

# Drop NLTK's English stop words plus the extra tokens 'subject' and 'http'.
stop_words = set(stopwords.words('english'))
stop_words.update(('subject', 'http'))
# split() followed by " ".join() already collapses any run of whitespace into a
# single space, so no separate space-squeezing pass is needed afterwards.
df['text'] = df['text'].apply(lambda x: " ".join([word for word in x.split() if word not in stop_words]))


In [None]:
# Tokenize text
# Each cleaned requirement string becomes a list of tokens.
cleaned_text = df['text']
df['Tokenized_Text'] = cleaned_text.map(word_tokenize)

In [None]:
# Load GloVe model
glove_file = 'glove.42B.300d.txt'
# The embeddings get their own name. A later cell binds the Keras network to
# `model`; sharing that name would drop the only reference to the embeddings
# once the network is built, and re-running this cell would overwrite the network.
glove_vectors = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

# Out-of-vocabulary tokens map to an all-zero vector. Using an ndarray with the
# same dtype as the embedding matrix keeps every element of a sequence the same
# type, instead of mixing ndarrays with Python lists of ints.
oov_vector = np.zeros(glove_vectors.vector_size, dtype=glove_vectors.vectors.dtype)

# One list of embedding vectors per requirement, in token order.
vectorized_text = df['Tokenized_Text'].apply(
    lambda tokens: [glove_vectors[token] if token in glove_vectors else oov_vector for token in tokens]
)
df['Vectorized_Text'] = vectorized_text


In [None]:
# Train-test split
# Hold out 15% of the rows for testing; random_state pins the shuffle so the
# same rows are selected on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['Vectorized_Text'],
    df['_class_'],
    test_size=0.15,
    random_state=0,
)


In [None]:
# Encode labels
# Fit the encoder on the full label column rather than on y_train alone.
# LabelEncoder.transform raises ValueError on labels it has not seen, so a class
# that ended up only in the test split would otherwise crash the transform of
# y_test and would also be missing from num_classes.
label_encoder = LabelEncoder()
label_encoder.fit(df['_class_'])
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# One output unit per known class, whether or not it occurs in the training split.
num_classes = len(label_encoder.classes_)

# One-hot targets for categorical cross-entropy.
y_train_categorical = to_categorical(y_train_encoded, num_classes=num_classes)
y_test_categorical = to_categorical(y_test_encoded, num_classes=num_classes)


In [None]:
# Pad sequences
# The target length is the longest sequence in the training split; both splits
# are padded at the end ('post') to that length and stored as float32.
max_sequence_length = max(map(len, X_train))
pad_options = dict(maxlen=max_sequence_length, padding='post', dtype='float32')
X_train_padded = pad_sequences(X_train, **pad_options)
X_test_padded = pad_sequences(X_test, **pad_options)

print(X_train_padded.shape)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten

# Build DNN model
# Each padded sample is flattened into a single vector and passed through three
# ReLU dense layers (512, 256, 128 units), each followed by 50% dropout, ending
# in a softmax layer with one unit per class.
sequence_length = X_train_padded.shape[1]
embedding_dim = X_train_padded.shape[2]

model = Sequential([
    Flatten(input_shape=(sequence_length, embedding_dim)),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Train the model
# The held-out test split is also passed as validation data, so the per-epoch
# val_* entries recorded in `history` are measured on the test set.
history = model.fit(
    x=X_train_padded,
    y=y_train_categorical,
    batch_size=64,
    epochs=15,
    validation_data=(X_test_padded, y_test_categorical),
)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluate the model
# evaluate() yields the loss followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(X_test_padded, y_test_categorical)
print("Test Accuracy:", accuracy)

# Predictions
# The predicted class is the index of the largest softmax output per sample.
test_probabilities = model.predict(X_test_padded)
y_pred = test_probabilities.argmax(axis=-1)

# Metrics
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_fscore_support, confusion_matrix, classification_report

# Macro-averaged precision / recall / F1 for both splits. The fourth value
# returned by precision_recall_fscore_support (support) is not needed here.
y_train_pred = np.argmax(model.predict(X_train_padded), axis=-1)
train_precision, train_recall, train_f1 = precision_recall_fscore_support(
    y_train_encoded, y_train_pred, average='macro'
)[:3]
test_precision, test_recall, test_f1 = precision_recall_fscore_support(
    y_test_encoded, y_pred, average='macro'
)[:3]

print("Train Precision:", train_precision)
print("Train Recall:", train_recall)
print("Train F1 Score:", train_f1)
print("Test Precision:", test_precision)
print("Test Recall:", test_recall)
print("Test F1 Score:", test_f1)

# Plot Accuracy
# Per-epoch training and validation accuracy, drawn on one set of axes and
# written to disk before being displayed.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(history.history['accuracy'], label='Train Accuracy')
ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
ax.set_title('Model Accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend(loc='lower right')
fig.savefig('model_accuracy_dnn.png')
plt.show()

# Plot F1 Score, Recall, Precision for Train and Test sets
# Each tuple is ordered (precision, recall, f1), matching the order of the
# metric names iterated below, so the loop position selects the right value.
metrics = {'Train': (train_precision, train_recall, train_f1), 'Test': (test_precision, test_recall, test_f1)}
for position, metric in enumerate(['Precision', 'Recall', 'F1 Score']):
    train_value = metrics['Train'][position]
    test_value = metrics['Test'][position]

    # One two-bar figure per metric: train vs. test.
    plt.figure(figsize=(12, 6))
    plt.bar(metrics.keys(), [train_value, test_value], color=['blue', 'orange'])
    plt.title(f'{metric} Comparison')
    plt.xlabel('Dataset')
    plt.ylabel(metric)
    plt.savefig(f'{metric.lower()}_comparison_dnn.png')
    plt.show()

# Confusion Matrix
# Pass the full set of encoded class indices explicitly. Without `labels`,
# confusion_matrix only includes classes that occur in y_test_encoded or y_pred,
# so the matrix could have fewer rows/columns than label_encoder.classes_ and the
# tick labels below would no longer line up with the cells.
class_indices = np.arange(len(label_encoder.classes_))
conf_matrix = confusion_matrix(y_test_encoded, y_pred, labels=class_indices)

plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig('confusion_matrix_dnn.png')
plt.show()

# Classification Report
print("Classification Report for each class:")
# `labels` is given explicitly so the rows always correspond one-to-one with
# target_names. Without it, classification_report raises when the number of
# classes observed in y_test_encoded / y_pred differs from len(target_names).
# zero_division=0 reports 0 (without a warning) for classes that have no
# true or no predicted samples in the test split.
class_indices = np.arange(len(label_encoder.classes_))
print(classification_report(
    y_test_encoded,
    y_pred,
    labels=class_indices,
    target_names=label_encoder.classes_,
    zero_division=0,
))
