In [58]:
import re
import nltk
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\itsme\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Data

In [61]:
df = pd.read_csv("train.csv")

In [62]:
df.drop(columns=['author', 'id'], inplace=True)
df.dropna(inplace=True)

In [63]:
X = df['title']+' '+df['text']
Y = df['label']

In [64]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=1/3,random_state=42)

Stemming

In [65]:
port = PorterStemmer()

In [66]:
stop_words = set(stopwords.words('english'))
def stemming(content):
    pattern = re.compile('[^a-zA-Z]')
    stemmed_content = pattern.sub(' ', content) 
    stemmed_content = stemmed_content.lower() 
    stemmed_content = stemmed_content.split() 
    stemmed_content = [port.stem(word) for word in stemmed_content if word not in stop_words] 
    stemmed_content = ' '.join(stemmed_content) 
    return stemmed_content


In [67]:
X_train = X_train.apply(stemming)
X_test = X_test.apply(stemming)

In [69]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [73]:
svm = SVC(probability=True)
svm.fit(X_train, Y_train)

In [74]:
Y_probabilities = svm.predict_proba(X_test)
Y_predict = np.argmax(Y_probabilities, axis=1)

In [75]:
for i, prob in enumerate(Y_probabilities):
    print(f"Instance {i+1}: Probability of 0 = {prob[0]:.4f}, Probability of 1 = {prob[1]:.4f}")

Instance 1: Probability of 0 = 0.1473, Probability of 1 = 0.8527
Instance 2: Probability of 0 = 0.0000, Probability of 1 = 1.0000
Instance 3: Probability of 0 = 0.8476, Probability of 1 = 0.1524
Instance 4: Probability of 0 = 0.0000, Probability of 1 = 1.0000
Instance 5: Probability of 0 = 0.8077, Probability of 1 = 0.1923
Instance 6: Probability of 0 = 0.9988, Probability of 1 = 0.0012
Instance 7: Probability of 0 = 0.0641, Probability of 1 = 0.9359
Instance 8: Probability of 0 = 0.3157, Probability of 1 = 0.6843
Instance 9: Probability of 0 = 0.0000, Probability of 1 = 1.0000
Instance 10: Probability of 0 = 0.9746, Probability of 1 = 0.0254
Instance 11: Probability of 0 = 0.0049, Probability of 1 = 0.9951
Instance 12: Probability of 0 = 0.9999, Probability of 1 = 0.0001
Instance 13: Probability of 0 = 0.7208, Probability of 1 = 0.2792
Instance 14: Probability of 0 = 0.0050, Probability of 1 = 0.9950
Instance 15: Probability of 0 = 1.0000, Probability of 1 = 0.0000
Instance 16: Probab

In [76]:
Y_predict = svm.predict(X_test)

In [77]:
accuracy = accuracy_score(Y_test, Y_predict)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.96
