In [None]:
import re
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import contractions

In [None]:
# Read the PROMISE requirements CSV, keep only the requirement text and its
# class label, and discard every row where either of the two is missing.
columns = ['RequirementText', '_class_']
df = (
    pd.read_csv("PROMISE.csv", encoding='latin1')
    .loc[:, columns]
    .dropna(axis=0)
    .copy()
)

In [None]:
# Data cleaning
# Both patterns are loop-invariant, so they are built and compiled once here
# rather than once per row. Dictionary keys are escaped so that any regex
# metacharacter inside a contraction is matched literally.
contraction_pattern = re.compile(
    r"\b(?:{})\b".format("|".join(map(re.escape, contractions.contractions_dict.keys())))
)
punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))

# English stop words plus two corpus-specific tokens to drop.
stop_words = set(stopwords.words('english'))
stop_words.update({'subject', 'http'})

# Number of known contractions in the raw requirement text (case-sensitive match).
df['contraction_count'] = df['RequirementText'].apply(lambda x: len(contraction_pattern.findall(x)))

# Expand contractions first, while the apostrophes are still present, then
# lowercase and strip punctuation.
df['text'] = df['RequirementText'].apply(contractions.fix)
df['text'] = df['text'].str.lower()
df['text'] = df['text'].apply(lambda x: punctuation_pattern.sub('', x))

# Drop stop words. split() + " ".join() also collapses every run of whitespace
# to a single space, so no separate space-squeezing pass is needed afterwards.
df['text'] = df['text'].apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Learn the vocabulary and IDF weights from the cleaned text and encode every
# requirement as one row of a sparse document-term matrix.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['text'])

# Densify into a DataFrame with one named column per vocabulary term so the
# weights can be inspected by eye.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Show the resulting feature table
print(tfidf_df)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from keras.layers import Input, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Flatten, Dropout, Bidirectional, LSTM
import pandas as pd

# Requires `df` (with the cleaned 'text' column and the '_class_' column) and
# `tfidf_df` from the cells above.

# Encode class names as integer ids 0..num_classes-1
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['_class_'])

# Number of classes
num_classes = df['label'].nunique()

# Full-corpus TF-IDF matrix. Kept under its original name for inspection, but
# deliberately NOT used for training: its vocabulary and IDF weights were
# learned from every row, including the rows that end up in the test split.
X = tfidf_df

# One-hot encode the labels
y = df['label']
y = to_categorical(y, num_classes)

# Split the cleaned text (not pre-computed features) so the vectorizer can be
# fitted on the training rows only. The split depends only on the row count and
# random_state, so the same rows are held out as when splitting tfidf_df.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=0
)

# Fit TF-IDF on the training text only, then apply that fixed vocabulary and
# IDF weighting to the held-out text. This keeps test-set statistics out of the
# training features (no data leakage). Use `train_vectorizer` to encode any new
# text that is fed to the model.
train_vectorizer = TfidfVectorizer()
X_train = train_vectorizer.fit_transform(text_train).toarray()
X_test = train_vectorizer.transform(text_test).toarray()

# Add a singleton middle axis -> (samples, 1, features). The dense model below
# flattens it again; the 3-D layout is kept so the arrays have the same rank as
# before for any code that consumes them.
X_train = np.expand_dims(X_train, axis=1)
X_test = np.expand_dims(X_test, axis=1)

print("Shape of X_train:", X_train.shape)

# Seed the Python, NumPy and TensorFlow RNGs before any layer is created so
# that weight initialisation, dropout masks and batch shuffling repeat from run
# to run (same seed value as the train/test split).
from tensorflow.keras.utils import set_random_seed
set_random_seed(0)

# Build DNN model: flatten each sample, three ReLU layers with dropout, then a
# softmax over the classes.
model = Sequential()
# Declare the per-sample input shape with an explicit Input layer (the
# `input_shape=` layer argument is deprecated in current Keras).
model.add(Input(shape=X_train.shape[1:]))
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model (labels are one-hot, hence categorical cross-entropy)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model.
# NOTE(review): the test split doubles as validation data here, so the val_*
# curves in `history` are not independent of the final test score.
history = model.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

def _weighted_prf(true_labels, predicted_labels):
    """Return (precision, recall, f1), each averaged over classes weighted by support.

    zero_division=0 scores a class with no predicted (or no true) samples as 0,
    which is the value scikit-learn would use anyway, without emitting an
    UndefinedMetricWarning for every such class.
    """
    shared = dict(average='weighted', zero_division=0)
    return (
        precision_score(true_labels, predicted_labels, **shared),
        recall_score(true_labels, predicted_labels, **shared),
        f1_score(true_labels, predicted_labels, **shared),
    )


# Predict on test data
y_pred_test = model.predict(X_test)

# Convert softmax outputs and one-hot targets to integer class ids
y_pred_test_labels = np.argmax(y_pred_test, axis=1)
y_test_labels = np.argmax(y_test, axis=1)

# Precision, recall, and F-measure for test data
precision_test, recall_test, f1_test = _weighted_prf(y_test_labels, y_pred_test_labels)

print("Test Precision:", precision_test)
print("Test Recall:", recall_test)
print("Test F-Measure:", f1_test)

# Predict on train data
y_pred_train = model.predict(X_train)

# Convert softmax outputs and one-hot targets to integer class ids
y_pred_train_labels = np.argmax(y_pred_train, axis=1)
y_train_labels = np.argmax(y_train, axis=1)

# Precision, recall, and F-measure for train data
precision_train, recall_train, f1_train = _weighted_prf(y_train_labels, y_pred_train_labels)

print("Train Precision:", precision_train)
print("Train Recall:", recall_train)
print("Train F-Measure:", f1_train)

def _save_train_test_bar(train_value, test_value, title, y_label, filename):
    """Draw a two-bar Train/Test comparison, save it to `filename`, then show it."""
    plt.figure(figsize=(12, 6))
    plt.bar(['Train', 'Test'], [train_value, test_value], color=['blue', 'orange'])
    plt.title(title)
    plt.xlabel('Dataset')
    plt.ylabel(y_label)
    plt.savefig(filename)
    plt.show()


# F1 score: train vs. test
_save_train_test_bar(f1_train, f1_test, 'F1 Score Comparison', 'F1 Score', 'f1_score_comparison.png')

# Per-epoch accuracy curves recorded during training
plt.figure(figsize=(12, 6))
for history_key, curve_label in (
    ('accuracy', 'Train Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    plt.plot(history.history[history_key], label=curve_label)
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.savefig('model_accuracy.png')
plt.show()

# Recall: train vs. test
_save_train_test_bar(recall_train, recall_test, 'Recall Comparison', 'Recall', 'recall_comparison.png')

# Precision: train vs. test
_save_train_test_bar(precision_train, precision_test, 'Precision Comparison', 'Precision', 'precision_comparison.png')

# Evaluate over the full, ordered list of encoded class ids. Without an
# explicit `labels` argument scikit-learn only considers classes that actually
# occur in the test split / predictions; if a class is absent there, the
# confusion matrix shrinks (so the tick labels no longer line up) and
# classification_report raises because target_names has the wrong length.
class_ids = np.arange(len(label_encoder.classes_))
class_names = [str(name) for name in label_encoder.classes_]

# Plotting Confusion Matrix (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(y_test_labels, y_pred_test_labels, labels=class_ids)
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig('confusion_matrix.png')
plt.show()

# Classification Report for each class
print("Classification Report for each class:")
report = classification_report(
    y_test_labels,
    y_pred_test_labels,
    labels=class_ids,
    target_names=class_names,
    zero_division=0,  # classes with no predictions/support score 0 without a warning
)
print(report)

# Save classification report to a text file
with open('classification_report.txt', 'w') as f:
    f.write(report)
