In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import re
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import joblib
import numpy as np

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Fetch the 20 Newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all', categories=['comp.graphics','soc.religion.christian','talk.politics.guns','misc.forsale','rec.sport.baseball'])

In [48]:
graphics_target = newsgroups.target_names.index('comp.graphics')
religion_target = newsgroups.target_names.index('soc.religion.christian')
politics_target = newsgroups.target_names.index('talk.politics.guns')
forsale_target = newsgroups.target_names.index('misc.forsale')
baseball_target = newsgroups.target_names.index('rec.sport.baseball')

graphics_target, religion_target, politics_target, forsale_target, baseball_target

(0, 3, 4, 1, 2)

In [None]:
# Preprocessing
def preprocess_text(text):
    # Remove headers and emails
    header_end = text.find('\n\n')
    if header_end != -1:
        text = text[header_end:]
    text = re.sub(r'\S+@\S+', '', text)
    
    # Transform to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove stopwords and tokenize
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]
    
    return tokens

# Apply preprocessing to the dataset
preprocessed_data = [preprocess_text(text) for text in newsgroups.data]

In [None]:
# Convert preprocessed data to string format
preprocessed_data_str = [' '.join(tokens) for tokens in preprocessed_data]

# Create TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Perform feature extraction
X = vectorizer.fit_transform(preprocessed_data_str)
y = newsgroups.target

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train SVC model
svc_model = SVC(kernel='rbf')
svc_model.fit(X_train, y_train)

In [None]:
# Save the model to a file
joblib.dump(svc_model, '/content/drive/MyDrive/NLP-MiniProjet/svm_model_1.pkl')

['/content/drive/MyDrive/NLP-MiniProjet/svm_model_1.pkl']

In [None]:
print(f'Train accuracy : {svc_model.score(X_train, y_train)}')
print(f'Test accuracy : {svc_model.score(X_test, y_test)}')

Train accuracy : 0.9984532095901005
Test accuracy : 0.9494845360824742


In [None]:
# Make predictions on the test set
y_pred = svc_model.predict(X_test)

# Evaluate model performance
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.93      0.93       199
           1       0.91      0.93      0.92       191
           2       0.97      0.99      0.98       197
           3       0.96      0.94      0.95       204
           4       0.97      0.95      0.96       179

    accuracy                           0.95       970
   macro avg       0.95      0.95      0.95       970
weighted avg       0.95      0.95      0.95       970

