In [15]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [16]:
# Function to load data
def load_data():
    with open('rt-polarity.pos', 'r', encoding='latin-1') as pos_file:
        positive_reviews = pos_file.readlines()

    with open('rt-polarity.neg', 'r', encoding='latin-1') as neg_file:
        negative_reviews = neg_file.readlines()

    return positive_reviews, negative_reviews

# Load the data
positive_reviews, negative_reviews = load_data()

In [10]:
# Split data into training, validation, and test sets
def split_data(positive_reviews, negative_reviews):
    # Training: First 4000 positive and negative
    X_train = positive_reviews[:4000] + negative_reviews[:4000]
    y_train = [1]*4000 + [0]*4000

    # Validation: Next 500 positive and negative
    X_val = positive_reviews[4000:4500] + negative_reviews[4000:4500]
    y_val = [1]*500 + [0]*500

    # Test: Remaining 831 positive and negative
    X_test = positive_reviews[4500:] + negative_reviews[4500:]
    y_test = [1]*831 + [0]*831

    return X_train, y_train, X_val, y_val, X_test, y_test

# Split the data
X_train, y_train, X_val, y_val, X_test, y_test = split_data(positive_reviews, negative_reviews)
print("Length of training data:", len(X_train))


# print("Training set size:", len(X_train))
# # Display a few examples from each dataset split
# print("Training Data (first 5 examples):")
# for i in range(5):
#     print(f"Review: {X_train[i]}")
#     print(f"Label: {'Positive' if y_train[i] == 1 else 'Negative'}")
#     print()

# print("Validation Data (first 5 examples):")
# for i in range(5):
#     print(f"Review: {X_val[i]}")
#     print(f"Label: {'Positive' if y_val[i] == 1 else 'Negative'}")
#     print()

# print("Test Data (first 5 examples):")
# for i in range(5):
#     print(f"Review: {X_test[i]}")
#     print(f"Label: {'Positive' if y_test[i] == 1 else 'Negative'}")
#     print()

Length of training data: 8000


In [19]:
# Step 3: Text preprocessing and feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Transform the training, validation, and test sets
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)  # Use X_train
X_val_tfidf = tfidf_vectorizer.transform(X_val)          # Use X_val
X_test_tfidf = tfidf_vectorizer.transform(X_test)        # Use X_test

# Convert the full sparse matrix to a dense matrix
dense_train_matrix = X_train_tfidf.todense()

# Get feature names
feature_names = tfidf_vectorizer.get_feature_names_out()

# Convert to DataFrame for better readability
df = pd.DataFrame(dense_train_matrix, columns=feature_names)

# Display the TF-IDF matrix for all training examples
print("TF-IDF Matrix (full training examples):")
print(df)

# Display the vocabulary size and sample vocabulary
print("Vocabulary Size:", len(feature_names))
print("Sample Vocabulary (first 20 words):", feature_names[:20])

TF-IDF Matrix (full training examples):
       10  100  101   11  110   12   13   15  170   19  ...  youthful   yu  \
0     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...       0.0  0.0   
1     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...       0.0  0.0   
2     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...       0.0  0.0   
3     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...       0.0  0.0   
4     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...       0.0  0.0   
...   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...       ...  ...   
7995  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...       0.0  0.0   
7996  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...       0.0  0.0   
7997  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...       0.0  0.0   
7998  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...       0.0  0.0   
7999  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...       0.0  0.0   

      yvan  zany  zeal 

In [20]:
print("Shape of X_train:", X_train_tfidf.shape)        # Should be (n_samples, n_features)
print("Length of train_labels:", len(y_train))          # Should match n_samples

Shape of X_train: (8000, 5000)
Length of train_labels: 8000


In [24]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV

# Step 4: Train the SVM model
svm_model = SVC()

# Set up the parameter grid for RandomizedSearchCV
param_dist = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

# Use RandomizedSearchCV to find the best parameters
random_search = RandomizedSearchCV(svm_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', verbose=1, random_state=42)
random_search.fit(X_train_tfidf, y_train)

# Best model evaluation
best_svm_model = random_search.best_estimator_

# Calculate and display training accuracy
train_score = best_svm_model.score(X_train_tfidf, y_train)
print("Training Accuracy:", train_score * 100, "%")

# Validate the model
val_accuracy = best_svm_model.score(X_val_tfidf, y_val)
print("Validation Accuracy:", val_accuracy * 100, "%")

# Step 6: Test the model on the test set
test_predictions = best_svm_model.predict(X_test_tfidf)
test_accuracy = best_svm_model.score(X_test_tfidf, y_test)
print("Test Accuracy:", test_accuracy * 100,"%")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Training Accuracy: 99.91250000000001 %
Validation Accuracy: 77.0 %
Test Accuracy: 74.60890493381469 %


In [33]:
# Step 6: Test the model on the test set
test_predictions = best_svm_model.predict(X_test_tfidf)  # Use the transformed test data

# Step 7: Report results (confusion matrix, precision, recall, F1-score)
conf_matrix = confusion_matrix(y_test, test_predictions)  # Use y_test for labels
print("Confusion Matrix:\n", conf_matrix)

# Precision, Recall, F1-Score
report = classification_report(y_test, test_predictions, target_names=['Negative', 'Positive'])
print("Classification Report:\n", report)

# Step 8: Display metrics in a readable format
TP = conf_matrix[1, 1]  # True Positives
TN = conf_matrix[0, 0]  # True Negatives
FP = conf_matrix[0, 1]  # False Positives
FN = conf_matrix[1, 0]  # False Negatives

# Calculate precision, recall, and F1 score with checks for division by zero
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")
print(f"Precision: {precision*100:.4f}","%")
print(f"Recall: {recall:.4f*}*","%")
print(f"F1-Score: {f1_score:.4f}","%")

Confusion Matrix:
 [[623 208]
 [214 617]]
Classification Report:
               precision    recall  f1-score   support

    Negative       0.74      0.75      0.75       831
    Positive       0.75      0.74      0.75       831

    accuracy                           0.75      1662
   macro avg       0.75      0.75      0.75      1662
weighted avg       0.75      0.75      0.75      1662

True Positives (TP): 617
True Negatives (TN): 623
False Positives (FP): 208
False Negatives (FN): 214
Precision: 0.7479 %


ValueError: Invalid format specifier '.4f*100' for object of type 'float'