In [1]:
import spacy
import pandas as pd
import numpy as np

In [10]:
df = pd.read_json("./news_dataset.json")

In [11]:
nlp = spacy.load("en_core_web_sm")

In [29]:
def preprocess(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

In [13]:
df.head()

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [21]:
df["category"].value_counts()

BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: category, dtype: int64

In [18]:
min_values = df["category"].value_counts().min()

In [22]:
df_science = df[df["category"] == "SCIENCE"].sample(n=min_values,random_state=41)
df_sports = df[df["category"] == "SPORTS"].sample(n=min_values,random_state=41)
df_crime = df[df["category"] == "CRIME"].sample(n=min_values,random_state=41)
df_business = df[df["category"] == "BUSINESS"].sample(n=min_values,random_state=41)

In [23]:
df = pd.concat([df_science,df_sports,df_crime,df_business],axis=0)

In [24]:
df.head()

Unnamed: 0,text,category
2014,Teaching Robots To Be Moral,SCIENCE
10901,'Infant' Alien Planet Discovery Shakes Up Idea...,SCIENCE
5728,Ancient Bones Reveal Bizarre Iron Age Rituals,SCIENCE
7419,11 Science Facts That Seem More Like Science F...,SCIENCE
2415,What's A Scientist To Do? March! It’s hard to ...,SCIENCE


In [25]:
df["category"].value_counts()

SCIENCE     1381
SPORTS      1381
CRIME       1381
BUSINESS    1381
Name: category, dtype: int64

In [26]:
# preprocessing
CATEGORIES = {
    "SCIENCE":0,
    "SPORTS":1,
    "CRIME":2,
    "BUSINESS":3
}
df["category"] = df["category"].apply(lambda x:CATEGORIES[x])

In [27]:
df.head()

Unnamed: 0,text,category
2014,Teaching Robots To Be Moral,0
10901,'Infant' Alien Planet Discovery Shakes Up Idea...,0
5728,Ancient Bones Reveal Bizarre Iron Age Rituals,0
7419,11 Science Facts That Seem More Like Science F...,0
2415,What's A Scientist To Do? March! It’s hard to ...,0


In [30]:
df["tex"] = df["text"].apply(preprocess)

In [31]:
df["text"].head()

2014                          Teaching Robots To Be Moral 
10901    'Infant' Alien Planet Discovery Shakes Up Idea...
5728        Ancient Bones Reveal Bizarre Iron Age Rituals 
7419     11 Science Facts That Seem More Like Science F...
2415     What's A Scientist To Do? March! It’s hard to ...
Name: text, dtype: object

In [32]:
# building the model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

clf = make_pipeline(CountVectorizer(ngram_range=(1,3)),MultinomialNB())

In [33]:
# split the dataset
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df["text"],df["category"],test_size=0.2,shuffle=True)

In [34]:
clf.fit(X_train,y_train)

In [35]:
clf.score(X_test,y_test)

0.8570135746606334