In [None]:
import re
import time
import torch
import string
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout

In [None]:
# Read the PROMISE requirements CSV, keep only the requirement text and its
# class column, and discard every row where either of the two is missing.
columns = ['RequirementText', '_class_']
df = (
    pd.read_csv("PROMISE.csv", encoding='latin1')[columns]
    .dropna(axis=0)
    .copy()  # independent frame, so adding columns later never touches a view
)

# Feature / label preparation.
#
# Produces the names later cells rely on: label_encoder, num_classes, y,
# tfidf_vectorizer, X_tfidf, X_train, X_test, y_train, y_test, input_shape.

# Label encoding: map each class string to an integer id.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['_class_'])

# Number of classes
num_classes = df['label'].nunique()

# Convert labels to one-hot (categorical) vectors
y = to_categorical(df['label'], num_classes=num_classes)

# Split the RAW TEXT before vectorizing. Fitting TF-IDF on the full corpus
# would let test-set vocabulary and IDF statistics leak into the training
# features. With the same test_size and random_state the row partition is
# identical to splitting an already-vectorized matrix of the same length.
texts_train, texts_test, y_train, y_test = train_test_split(
    df['RequirementText'], y, test_size=0.2, random_state=0
)

# TF-IDF vectorization: learn vocabulary / IDF weights from the training text
# only, then apply that fitted transform to the test text.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = tfidf_vectorizer.fit_transform(texts_train)
X_test = tfidf_vectorizer.transform(texts_test)

# Whole-corpus matrix under the train-fitted vocabulary, kept so that any code
# still referring to X_tfidf continues to work.
X_tfidf = tfidf_vectorizer.transform(df['RequirementText'])

# Convert sparse matrices to dense arrays (required for CNN)
X_train = X_train.toarray()
X_test = X_test.toarray()

# Reshape for Conv1D input: one "time step" per TF-IDF feature, one channel.
input_shape = (X_train.shape[1], 1)
X_train = X_train.reshape(X_train.shape[0], *input_shape)
X_test = X_test.reshape(X_test.shape[0], *input_shape)


In [None]:
# Assemble the 1-D CNN classifier one layer at a time.
model = Sequential()

# Convolution + pooling stage 1 (input is one channel per TF-IDF feature).
model.add(Conv1D(filters=300, kernel_size=3, activation='relu', input_shape=input_shape))
model.add(MaxPooling1D(pool_size=50, padding='same'))

# Convolution + pooling stage 2.
model.add(Conv1D(filters=300, kernel_size=2, activation='relu'))
model.add(MaxPooling1D(pool_size=10, padding='same'))

# Dense classification head: flatten, hidden layer, dropout, softmax over classes.
model.add(Flatten())
model.add(Dense(units=300, activation='relu'))
model.add(Dropout(rate=0.2))
model.add(Dense(units=num_classes, activation='softmax'))

# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Fit the network for 50 epochs in mini-batches of 64 samples.
# NOTE(review): the test split is passed as validation_data, so the per-epoch
# val_* curves are computed on the same rows as the final test metrics.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=50,
    validation_data=(X_test, y_test),
)


In [None]:
def _weighted_prf(true_labels, predicted_labels):
    """Return (precision, recall, F1), each weighted-averaged over the classes."""
    return tuple(
        scorer(true_labels, predicted_labels, average='weighted')
        for scorer in (precision_score, recall_score, f1_score)
    )


# Loss / accuracy on the test split as reported by Keras.
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

# Test split: turn softmax outputs and one-hot targets into class ids.
y_pred_test = model.predict(X_test)
y_pred_test_labels = y_pred_test.argmax(axis=1)
y_test_labels = y_test.argmax(axis=1)

precision_test, recall_test, f1_test = _weighted_prf(y_test_labels, y_pred_test_labels)

print("Test Precision:", precision_test)
print("Test Recall:", recall_test)
print("Test F-Measure:", f1_test)

# Train split: same procedure, used to compare against the test scores.
y_pred_train = model.predict(X_train)
y_pred_train_labels = y_pred_train.argmax(axis=1)
y_train_labels = y_train.argmax(axis=1)

precision_train, recall_train, f1_train = _weighted_prf(y_train_labels, y_pred_train_labels)

print("Train Precision:", precision_train)
print("Train Recall:", recall_train)
print("Train F-Measure:", f1_train)

def _save_train_test_bar(metric_name, train_value, test_value, filename):
    """Draw a two-bar train-vs-test chart for one metric, save it, then show it."""
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(['Train', 'Test'], [train_value, test_value], color=['blue', 'orange'])
    ax.set_title(f'{metric_name} Comparison')
    ax.set_xlabel('Dataset')
    ax.set_ylabel(metric_name)
    fig.savefig(filename)
    plt.show()


# F1 score: train vs. test
_save_train_test_bar('F1 Score', f1_train, f1_test, 'f1_score_comparison.png')

# Per-epoch accuracy curves recorded by model.fit
fig, ax = plt.subplots(figsize=(12, 6))
for history_key, curve_label in (
    ('accuracy', 'Train Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    ax.plot(history.history[history_key], label=curve_label)
ax.set_title('Model Accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend(loc='lower right')
fig.savefig('model_accuracy.png')
plt.show()

# Recall: train vs. test
_save_train_test_bar('Recall', recall_train, recall_test, 'recall_comparison.png')

# Precision: train vs. test
_save_train_test_bar('Precision', precision_train, precision_test, 'precision_comparison.png')

# Plotting Confusion Matrix
#
# Pass the full list of encoded class ids explicitly. Without `labels`,
# confusion_matrix only covers classes that occur in the true or predicted test
# labels; if a class is missing from both, the matrix has fewer rows/columns
# than label_encoder.classes_ and the tick labels no longer line up with it.
cm_class_names = label_encoder.classes_
cm_class_ids = np.arange(len(cm_class_names))
conf_matrix = confusion_matrix(y_test_labels, y_pred_test_labels, labels=cm_class_ids)

plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',  # cells are integer counts
    cmap='Blues',
    xticklabels=cm_class_names,
    yticklabels=cm_class_names,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig('confusion_matrix.png')
plt.show()

# Classification Report for each class
#
# `labels` is given explicitly so the report always has one row per encoded
# class. With only `target_names`, classification_report raises ValueError when
# a class is absent from the test labels and predictions, because the number of
# observed classes no longer matches the number of names.
# zero_division=0 reports 0 for classes with no samples/predictions (the same
# value as the default) without emitting a warning for each of them.
print("Classification Report for each class:")
report_class_names = label_encoder.classes_
report = classification_report(
    y_test_labels,
    y_pred_test_labels,
    labels=np.arange(len(report_class_names)),
    target_names=report_class_names,
    zero_division=0,
)
print(report)

# Save classification report to a text file (explicit encoding so the result
# does not depend on the platform default).
with open('classification_report.txt', 'w', encoding='utf-8') as f:
    f.write(report)