In [3]:
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import pickle
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\munir\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
data = pd.read_json("News_Category_Dataset_v3.json", lines=True)

In [5]:
data = data[['headline', 'category']]

In [6]:
data['headline'] = data['headline'].str.lower()

In [7]:
def temizle(text):
    text = re.sub(f"[{string.punctuation}]", "", text)
    text = re.sub(r"\d+", "", text)
    return text

In [8]:
data['headline'] = data['headline'].apply(temizle)

In [9]:
stop_words = set(stopwords.words("english"))
data['headline'] = data['headline'].apply(lambda x: " ".join([word for word in x.split() if word not in stop_words]))


In [10]:
stemmer = PorterStemmer()
data['headline'] = data['headline'].apply(lambda x: " ".join([stemmer.stem(word) for word in x.split()]))

In [11]:
X = data['headline']
y = data['category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])

In [13]:
pipeline.fit(X_train, y_train)

In [14]:
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Doğruluk Oranı: {accuracy:.2f}")
print("Sınıflandırma Raporu:\n", classification_report(y_test, y_pred))

Doğruluk Oranı: 0.52
Sınıflandırma Raporu:
                 precision    recall  f1-score   support

          ARTS       0.61      0.04      0.07       293
ARTS & CULTURE       0.43      0.01      0.02       275
  BLACK VOICES       0.58      0.20      0.30       889
      BUSINESS       0.53      0.32      0.40      1216
       COLLEGE       0.38      0.03      0.06       202
        COMEDY       0.61      0.29      0.39      1022
         CRIME       0.53      0.51      0.52       713
CULTURE & ARTS       0.91      0.10      0.18       202
       DIVORCE       0.88      0.49      0.63       664
     EDUCATION       0.67      0.01      0.02       209
 ENTERTAINMENT       0.47      0.79      0.59      3419
   ENVIRONMENT       0.83      0.08      0.14       313
         FIFTY       0.00      0.00      0.00       263
  FOOD & DRINK       0.63      0.69      0.66      1270
     GOOD NEWS       0.67      0.05      0.10       270
         GREEN       0.39      0.10      0.16       532
HEA

In [15]:
with open("model.pkl", "wb") as model_file:
    pickle.dump(pipeline, model_file)