In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os

Finding Best parameters

In [2]:
# Load sentences and labels from text files
corpus_path = os.path.join('..', 'Dataset', 'corpus.txt')
with open(corpus_path, 'r', encoding='utf-8') as f:
    sentences = f.readlines()

labels_path = os.path.join('..', 'Dataset', 'labels.txt')
with open(labels_path, 'r', encoding='utf-8') as f:
    labels = f.readlines()

In [3]:
# Shuffle the dataset
seed = 42  # Set a seed for reproducibility
np.random.seed(seed)
indices = np.arange(len(sentences))
np.random.shuffle(indices)

sentences = np.array(sentences)[indices]
labels = np.array(labels)[indices]

In [99]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.2, random_state=seed)

In [100]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [101]:
svm_model = SVC()

# Grid Search to find the best parameters
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [100, 10, 1, 0.1, 0.01, 0.001], 'kernel': ['linear', 'rbf']}
grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_tfidf, y_train)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Print the best parameters
print("Best Parameters:", best_params)

Best Parameters: {'C': 1, 'gamma': 100, 'kernel': 'linear'}


In [102]:
# Use the best model for predictions
y_pred = best_model.predict(X_test_tfidf)

# Evaluate the accuracy of the best model
accuracy = np.mean(y_pred == y_test)
print(f"Accuracy of the Best Model: {accuracy * 100:.2f}%")


Accuracy of the Best Model: 75.42%


Using best parameters for Testing Generated Outputs

Coeff_1_no

In [103]:
#Changing variable names
X_train = sentences
y_train = [value.strip() for value in labels]

In [104]:
test_folder_path = os.path.join('Test Samples', 'coeff_1_no')
test_files = os.listdir(test_folder_path)

X_test = []
y_test = []

for test_file in test_files:
    file_path = os.path.join(test_folder_path, test_file)
    
    with open(file_path, 'r', encoding='utf-8') as f:
        lines =f.readlines()
        X_test.extend(lines)
    
    # Extract label from the file name
    label = test_file.split('_')[1]
    for _ in range(50):
        y_test.append(label)

y_test = [value.replace('.txt', '') for value in y_test]

In [105]:
# Shuffle the dataset
seed = 42  # Set a seed for reproducibility
np.random.seed(seed)
indices = np.arange(len(X_test))
np.random.shuffle(indices)

X_test = np.array(X_test)[indices]
y_test = np.array(y_test)[indices]

In [106]:
# Create TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [107]:
# Create and train SVC
svc = SVC(C=1, gamma=100, kernel='linear')
svc.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = svc.predict(X_test_tfidf)

In [109]:
# Evaluate the model
accuracy = np.mean(y_pred == y_test)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Output the results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.63
Precision: 0.644774884783142
Recall: 0.6299999999999999
F1 Score: 0.6271424275823653


Coeff_1_laplace

In [110]:
#Changing variable names
X_train = sentences
y_train = [value.strip() for value in labels]

In [111]:
test_folder_path = os.path.join('Test Samples', 'coeff_1_laplace')
test_files = os.listdir(test_folder_path)

X_test = []
y_test = []

for test_file in test_files:
    file_path = os.path.join(test_folder_path, test_file)
    
    with open(file_path, 'r', encoding='utf-8') as f:
        lines =f.readlines()
        X_test.extend(lines)
    
    # Extract label from the file name
    label = test_file.split('_')[1]
    for _ in range(50):
        y_test.append(label)

y_test = [value.replace('.txt', '') for value in y_test]

In [112]:
# Shuffle the dataset
seed = 42  # Set a seed for reproducibility
np.random.seed(seed)
indices = np.arange(len(X_test))
np.random.shuffle(indices)

X_test = np.array(X_test)[indices]
y_test = np.array(y_test)[indices]

In [113]:
# Create TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [114]:
# Create and train SVC
svc = SVC(C=1, gamma=100, kernel='linear')
svc.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = svc.predict(X_test_tfidf)

In [115]:
# Evaluate the model
accuracy = np.mean(y_pred == y_test)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Output the results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.5766666666666667
Precision: 0.590503237045141
Recall: 0.5766666666666667
F1 Score: 0.5771001292913216


No beta

In [4]:
#Changing variable names
X_train = sentences
y_train = [value.strip() for value in labels]

In [10]:
test_folder_path = os.path.join('Test Samples', 'sample_level')
test_files = os.listdir(test_folder_path)

X_test = []
y_test = []

for test_file in test_files:
    file_path = os.path.join(test_folder_path, test_file)
    
    with open(file_path, 'r', encoding='utf-8') as f:
        lines =f.readlines()
        X_test.extend(lines)
    
    # Extract label from the file name
    label = test_file.split('_')[1]
    for _ in range(50):
        y_test.append(label)

y_test = [value.replace('.txt', '') for value in y_test]

In [11]:
# Shuffle the dataset
seed = 42  # Set a seed for reproducibility
np.random.seed(seed)
indices = np.arange(len(X_test))
np.random.shuffle(indices)

X_test = np.array(X_test)[indices]
y_test = np.array(y_test)[indices]

In [12]:
# Create TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [13]:
# Create and train SVC
svc = SVC(C=1, gamma=100, kernel='linear')
svc.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = svc.predict(X_test_tfidf)

In [14]:
# Evaluate the model
accuracy = np.mean(y_pred == y_test)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Output the results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.15
Precision: 0.10647684476039608
Recall: 0.15000000000000002
F1 Score: 0.10821577523936937
