In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import re
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import joblib
import numpy as np

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
df_bbc = pd.read_csv('/content/drive/MyDrive/NLP-MiniProjet/bbc-text.csv')

In [None]:
le = LabelEncoder()
df_bbc['category_encoded'] = le.fit_transform(df_bbc['category'])

In [None]:
df_bbc.head()

Unnamed: 0,category,text,category_encoded
0,tech,tv future in the hands of viewers with home th...,4
1,business,worldcom boss left books alone former worldc...,0
2,sport,tigers wary of farrell gamble leicester say ...,3
3,sport,yeading face newcastle in fa cup premiership s...,3
4,entertainment,ocean s twelve raids box office ocean s twelve...,1


In [None]:
df_bbc.shape

(2225, 3)

In [None]:
np.unique(df_bbc.category)

array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype=object)

In [None]:
def preprocess_text(text):
    
    # Transform to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove stopwords and tokenize
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]
    
    return tokens

In [None]:
preprocessed_data = [preprocess_text(text) for text in df_bbc.text]

In [None]:
# Convert preprocessed data to string format
preprocessed_data_str = [' '.join(tokens) for tokens in preprocessed_data]

# Create TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Perform feature extraction
X = vectorizer.fit_transform(preprocessed_data_str)
y = df_bbc.category_encoded

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [None]:
# Train SVC model
svc_model = SVC(kernel='rbf')
svc_model.fit(X_train, y_train)

In [None]:
# Save the model to a file
joblib.dump(svc_model, '/content/drive/MyDrive/NLP-MiniProjet/svm_model_1.pkl')

['/content/drive/MyDrive/NLP-MiniProjet/svm_model_1.pkl']

In [None]:
print(f'Train accuracy : {svc_model.score(X_train, y_train)}')
print(f'Test accuracy : {svc_model.score(X_test, y_test)}')

Train accuracy : 0.999438202247191
Test accuracy : 0.9730337078651685


In [18]:
# Make predictions on the test set
y_pred = svc_model.predict(X_test)

# Evaluate model performance
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.94      0.95       101
           1       1.00      0.98      0.99        81
           2       0.94      0.98      0.96        83
           3       0.98      1.00      0.99        98
           4       1.00      0.98      0.99        82

    accuracy                           0.97       445
   macro avg       0.97      0.97      0.97       445
weighted avg       0.97      0.97      0.97       445

