In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the movie.csv file from its fixed local path into a DataFrame.
csv_path = r'C:\Users\annsu\OneDrive\Desktop\AIDI\AIDI Semester2\AI in Enterprise Systems\movie.csv'
df = pd.read_csv(csv_path)

# Preview the leading rows before any cleaning is applied.
print(df.head())

# Discard every row that has a missing value in any column.
df = df.dropna()

# Report how many rows remain after the missing-value filter.
print("Number of samples in the dataset:", df.shape[0])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
texts, labels = df['text'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Build TF-IDF features with English stop words removed and max_df=0.7.
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer.
tfidf_options = {'stop_words': 'english', 'max_df': 0.7}
vectorizer = TfidfVectorizer(**tfidf_options)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a support vector classifier (default hyper-parameters, fixed seed) on
# the training TF-IDF matrix. `fit` returns the estimator itself, so the
# construction and training can be chained.
# NOTE(review): scikit-learn documents that SVC fit time grows at least
# quadratically with the number of samples -- confirm the runtime is
# acceptable for this dataset size, or consider a linear SVM.
svm_model = SVC(random_state=42).fit(X_train_tfidf, y_train)

# Predict a label for every held-out test document.
y_pred = svm_model.predict(X_test_tfidf)

# Score the test-set predictions. Accuracy takes no averaging option; the
# other three metrics are computed with average='binary'.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='binary')
    for scorer in (precision_score, recall_score, f1_score)
)

# Print each metric on its own line, rounded to four decimal places.
print("\nSupport Vector Machine Results:")
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")

# Show a handful of test documents next to their true and predicted labels.
#
# Differences from a plain `np.random.choice` draw:
#   * the draw uses a locally seeded generator (same seed, 42, as the split
#     and the model) so the printed samples are the same on every run;
#   * the sample size is capped at the number of test rows, because drawing
#     without replacement fails when more samples are requested than exist;
#   * rows are selected by position, so texts and labels stay paired even if
#     the index labels are not unique.
sample_size = min(5, len(X_test))
sample_rng = np.random.default_rng(42)
sample_positions = sample_rng.choice(len(X_test), size=sample_size, replace=False)
sample_texts = X_test.iloc[sample_positions]
sample_labels = y_test.iloc[sample_positions]
sample_indices = sample_texts.index  # index labels of the chosen rows

print("\nSample Predictions:")
sample_tfidf = vectorizer.transform(sample_texts)
sample_predictions = svm_model.predict(sample_tfidf)

preview_length = 50
for text, true_label, pred_label in zip(sample_texts, sample_labels, sample_predictions):
    # Only add the ellipsis when the text was actually cut short.
    if len(text) > preview_length:
        preview = f"{text[:preview_length]}..."
    else:
        preview = text
    print(f"Text: {preview}")
    print(f"True Label: {true_label}, Predicted Label: {pred_label}")
    print()

                                                text  label
0  I grew up (b. 1965) watching and loving the Th...      0
1  When I put this movie in my DVD player, and sa...      0
2  Why do people who do not know what a particula...      0
3  Even though I have great interest in Biblical ...      0
4  Im a die hard Dads Army fan and nothing will e...      1
Number of samples in the dataset: 40000
