In [2]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import spacy

nlp = spacy.load('en_core_web_lg')

In [4]:
import pandas as pd

df = pd.read_json("/content/drive/MyDrive/Datasets/news_dataset.json")

print(df.info(),'\n\n',df.describe())
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 12695 entries, 0 to 12694
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      12695 non-null  object
 1   category  12695 non-null  object
dtypes: object(2)
memory usage: 297.5+ KB
None 

                                                      text  category
count                                               12695     12695
unique                                              12689         4
top     10 Most Hated Companies In America To be truly...  BUSINESS
freq                                                    2      4254


Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [5]:
df['category'].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
BUSINESS,4254
SPORTS,4167
CRIME,2893
SCIENCE,1381


In [7]:
df['target'] = df['category'].map({
    "BUSINESS" : 0,
    "SPORTS" : 1,
    "CRIME" : 2,
    "SCIENCE" : 3,
})

df.head()

Unnamed: 0,text,category,target
0,Watching Schrödinger's Cat Die University of C...,SCIENCE,3
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE,3
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS,0
3,These Roads Could Recharge Your Electric Car A...,BUSINESS,0
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME,2


In [13]:
def preprocess(text):
  doc = nlp(text)
  filtered_tokens = []
  for token in doc:
    if token.is_stop or token.is_punct:
      continue
    filtered_tokens.append(token.lemma_)

  return " ".join(filtered_tokens)

In [14]:
df['preprocessed'] = df['text'].apply(preprocess)
df.head()

Unnamed: 0,text,category,target,preprocessed
0,Watching Schrödinger's Cat Die University of C...,SCIENCE,3,watch Schrödinger Cat Die University Californi...
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE,3,watch freaky Vortex open Flooded Lake
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS,0,entrepreneur today need Big Budget start year ...
3,These Roads Could Recharge Your Electric Car A...,BUSINESS,0,road recharge Electric Car drive high tech hig...
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME,2,civilian Guard Fires Gun protect Recruiting Ce...


In [15]:
df['vector'] = df['preprocessed'].apply(lambda x: nlp(x).vector)
df.head()

Unnamed: 0,text,category,target,preprocessed,vector
0,Watching Schrödinger's Cat Die University of C...,SCIENCE,3,watch Schrödinger Cat Die University Californi...,"[-0.85190785, 1.0438694, -0.9148885, -1.395817..."
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE,3,watch freaky Vortex open Flooded Lake,"[0.60747343, 1.9251899, -0.16949336, -0.573053..."
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS,0,entrepreneur today need Big Budget start year ...,"[0.088981755, 0.5882564, -1.2281352, -0.320762..."
3,These Roads Could Recharge Your Electric Car A...,BUSINESS,0,road recharge Electric Car drive high tech hig...,"[-1.0280653, 4.349204, -1.06896, -1.045683, 1...."
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME,2,civilian Guard Fires Gun protect Recruiting Ce...,"[-1.4220493, 0.9367255, -1.8070079, 3.1870718,..."


In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['vector'].values,df['target'],test_size=0.2,random_state=2025)


In [17]:
import numpy as np

X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

In [18]:
from sklearn.metrics import classification_report

In [19]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()

model.fit(X_train_2d,y_train)
y_pred = model.predict(X_test_2d)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.74      0.73      0.74       867
           1       0.75      0.71      0.73       851
           2       0.64      0.68      0.66       565
           3       0.41      0.46      0.43       256

    accuracy                           0.68      2539
   macro avg       0.64      0.65      0.64      2539
weighted avg       0.69      0.68      0.69      2539



In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler


model = MultinomialNB()
scaler = MinMaxScaler()
scaled_train = scaler.fit_transform(X_train_2d)
model.fit(scaled_train,y_train)
y_pred = model.predict(X_test_2d)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.49      0.98      0.65       867
           1       0.95      0.36      0.53       851
           2       0.90      0.54      0.67       565
           3       0.67      0.34      0.45       256

    accuracy                           0.61      2539
   macro avg       0.75      0.56      0.58      2539
weighted avg       0.75      0.61      0.60      2539



In [21]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()

model.fit(X_train_2d,y_train)
y_pred = model.predict(X_test_2d)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.80      0.90      0.85       867
           1       0.88      0.84      0.86       851
           2       0.81      0.86      0.84       565
           3       0.91      0.56      0.69       256

    accuracy                           0.84      2539
   macro avg       0.85      0.79      0.81      2539
weighted avg       0.84      0.84      0.83      2539



In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler


model = RandomForestClassifier()
scaler = MinMaxScaler()
scaled_train = scaler.fit_transform(X_train_2d)
model.fit(scaled_train,y_train)
y_pred = model.predict(X_test_2d)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.45      0.72      0.55       867
           1       0.76      0.41      0.54       851
           2       0.77      0.38      0.50       565
           3       0.27      0.43      0.33       256

    accuracy                           0.51      2539
   macro avg       0.56      0.48      0.48      2539
weighted avg       0.61      0.51      0.51      2539



In [24]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler


model = GradientBoostingClassifier()
scaler = MinMaxScaler()
scaled_train = scaler.fit_transform(X_train_2d)
model.fit(scaled_train,y_train)
y_pred = model.predict(X_test_2d)

print(classification_report(y_test,y_pred))

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test,y_pred)
print(cm)