In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Read the dataset, then keep only rows whose 'text' is present and is not
# empty once surrounding whitespace is stripped.
df = (
    pd.read_csv('stemmed_Dataset1_21k.csv')
    .dropna(subset=['text'])
    .loc[lambda frame: frame['text'].astype(str).str.strip().str.len() > 0]
)

In [2]:
# Data splitting: hold out 20% of the rows, then cut that hold-out in half to
# obtain the validation and test sets (80/10/10 overall). Both calls use a
# fixed random_state so the partition is the same on every run.
train_data, temp_data, train_labels, temp_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)
val_data, test_data, val_labels, test_labels = train_test_split(
    temp_data,
    temp_labels,
    test_size=0.5,
    random_state=42,
)

In [3]:
# The model needs numerical input, so the text is converted to TF-IDF
# (Term Frequency-Inverse Document Frequency) features.
#
# The vectorizer is fitted on the training split only; the validation and test
# splits are transformed with that already-fitted vocabulary and IDF weights.
#
# The matrices are kept in the sparse format the vectorizer returns instead of
# being densified with .toarray(): with up to 5000 columns almost every entry
# is zero, so a dense copy wastes memory, and SVC accepts sparse input directly.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7, min_df=2, max_features=5000)
train_features = tfidf_vectorizer.fit_transform(train_data.astype(str))
val_features = tfidf_vectorizer.transform(val_data.astype(str))
test_features = tfidf_vectorizer.transform(test_data.astype(str))

In [4]:
# Fit a linear-kernel SVM on the training features (fit() returns the estimator).
svm_model = SVC(kernel='linear').fit(train_features, train_labels)

# Predict on both held-out splits.
val_predictions = svm_model.predict(val_features)
test_predictions = svm_model.predict(test_features)

# Accuracy as a fraction of correctly classified samples.
val_accuracy = accuracy_score(val_labels, val_predictions)
test_accuracy = accuracy_score(test_labels, test_predictions)

# Same figures expressed as percentages for reporting.
val_accuracy_percentage = 100 * val_accuracy
test_accuracy_percentage = 100 * test_accuracy

In [5]:
# Final step: show the accuracy on both held-out splits, followed by the
# per-class precision/recall/F1 breakdown for the test split.
for template, value in (
    ("Validation Accuracy: {:.2f}%", val_accuracy_percentage),
    ("Test Accuracy: {:.2f}%", test_accuracy_percentage),
):
    print(template.format(value))

print("Test Classification Report:")
print(classification_report(test_labels, test_predictions))

Validation Accuracy: 98.58%
Test Accuracy: 98.96%
Test Classification Report:
              precision    recall  f1-score   support

        FAKE       0.99      0.99      0.99      1023
        REAL       0.99      0.99      0.99      1091

    accuracy                           0.99      2114
   macro avg       0.99      0.99      0.99      2114
weighted avg       0.99      0.99      0.99      2114

