In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
# Fit a bigram-only bag-of-words model on one sentence and show the learned
# term -> column-index vocabulary.
v = CountVectorizer(ngram_range=(2, 2)).fit(
    ["Thor odinson has been banished from asgard"]
)
v.vocabulary_

{'thor odinson': 5,
 'odinson has': 4,
 'has been': 3,
 'been banished': 1,
 'banished from': 0,
 'from asgard': 2}

In [5]:
# Toy corpus of three short sentences used to demonstrate n-gram vectorization.
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki likes pizza"
]

In [6]:
import spacy

In [7]:
# Load the spaCy "en_core_web_sm" pipeline once; preprocess() uses this
# module-level object to tokenize and lemmatize text.
nlp = spacy.load("en_core_web_sm")

In [8]:
def preprocess(text):
    """Return `text` with stop words and punctuation removed and the rest lemmatized.

    The text is parsed with the module-level spaCy pipeline `nlp`. The lemmas
    of the tokens that are kept are joined with single spaces.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [9]:
# Sanity check on one sentence: the verb "ate" is reduced to its lemma "eat".
preprocess("Thor ate pizza")

'thor eat pizza'

In [11]:
# Run every sentence of the toy corpus through preprocess().
corpus_processed = list(map(preprocess, corpus))

In [12]:
# Inspect the preprocessed corpus.
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki like pizza']

In [14]:
# Learn unigrams and bigrams from the preprocessed corpus and show the
# resulting term -> column-index vocabulary.
v = CountVectorizer(ngram_range=(1, 2)).fit(corpus_processed)
v.vocabulary_

{'thor': 9,
 'eat': 0,
 'pizza': 7,
 'thor eat': 10,
 'eat pizza': 1,
 'loki': 4,
 'tall': 8,
 'loki tall': 6,
 'like': 2,
 'loki like': 5,
 'like pizza': 3}

In [16]:
# Vectorize a sentence whose unigrams and bigrams are all in the fitted vocabulary.
v.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [17]:
# "Iron" never appeared in the fitted corpus, so "iron" and "iron eat" are
# ignored; only "eat", "pizza" and "eat pizza" contribute counts.
v.transform(["Iron eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

In [18]:
import pandas as pd

In [21]:
# Load the news dataset from a JSON file in the working directory; the cells
# below rely on its "text" and "category" columns.
df = pd.read_json("news_dataset.json")

In [22]:
# Dataset dimensions, followed by a preview of the first five rows.
print(df.shape)
df.head(5)

(12695, 2)


Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [23]:
# Count articles per category to check how balanced the classes are.
df.category.value_counts()

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

In [27]:
# Undersample every category down to the size of the rarest one so that all
# classes are equally represented. The size is derived from the data rather
# than hard-coded, so this cell keeps working (and keeps balancing correctly)
# if the dataset changes. A fixed random_state keeps the draw reproducible.
min_samples = int(df.category.value_counts().min())
df_business = df[df.category=="BUSINESS"].sample(min_samples, random_state=69)
df_sports = df[df.category=="SPORTS"].sample(min_samples, random_state=69)
df_crime = df[df.category=="CRIME"].sample(min_samples, random_state=69)
df_science = df[df.category=="SCIENCE"].sample(min_samples, random_state=69)

In [28]:
# Stack the four equally sized samples into one balanced frame, then confirm
# the per-class counts.
balanced_parts = [df_business, df_sports, df_crime, df_science]
df_balanced = pd.concat(balanced_parts, axis=0)
df_balanced["category"].value_counts()

category
BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: count, dtype: int64

In [30]:
# Encode each category label as an integer class id for the classifier.
target = {
    'BUSINESS': 0,
    'SPORTS': 1,
    'CRIME': 2,
    'SCIENCE': 3,
}

df_balanced['category_num'] = df_balanced['category'].map(target)

In [31]:
# Preview the balanced frame, now including the numeric label column.
df_balanced.head()

Unnamed: 0,text,category,category_num
157,American Cities Adding The Most Jobs This Year...,BUSINESS,0
9584,"Women in Business: Doris Greif, Regional Vice ...",BUSINESS,0
8045,City Of Charleston Steps Up To Help Victims' F...,BUSINESS,0
8205,Leadership Matters: Decision Making Strategie...,BUSINESS,0
5875,"Intel Now Hiring Way More Women And You Can, T...",BUSINESS,0


In [29]:
from sklearn.model_selection import train_test_split

In [32]:
# Hold out 20% of the balanced data for testing. Stratifying on the label keeps
# the class proportions the same in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["text"],
    df_balanced["category_num"],
    test_size=0.2,
    random_state=69,
    stratify=df_balanced["category_num"],
)

In [34]:
# Size of the training split, followed by a preview of its first rows.
print(X_train.shape)
X_train.head()

(4419,)


8562    Hmmm... The NFL's 'Independent' Investigator T...
5541    Yet Another Toddler Accidentally Shot And Kill...
7760    Machete-Wielding Suspect Shot As He Breaks Thr...
5991    Texans Teammates Argue About Existence Of Dino...
7806    The Secret To Being Happy And Loving Your Job ...
Name: text, dtype: object

In [35]:
# Verify that stratification kept the training labels evenly distributed.
y_train.value_counts()

category_num
1    1105
0    1105
3    1105
2    1104
Name: count, dtype: int64

In [36]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [37]:
# Baseline model: bag-of-words counts (default CountVectorizer settings) fed
# into a multinomial Naive Bayes classifier, trained on the raw text.
bow_step = ('vectorizer_bow', CountVectorizer())
nb_step = ('Multi NB', MultinomialNB())
clf = Pipeline([bow_step, nb_step])

clf.fit(X_train, y_train)

In [39]:
# Predict labels for the held-out test set and print per-class
# precision / recall / F1 for the fitted pipeline.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.92      0.85       276
           1       0.91      0.85      0.88       276
           2       0.91      0.91      0.91       277
           3       0.90      0.82      0.86       276

    accuracy                           0.87      1105
   macro avg       0.88      0.87      0.87      1105
weighted avg       0.88      0.87      0.87      1105



In [41]:
# Same classifier as the baseline, but the vectorizer now also counts bigrams
# and trigrams (ngram_range=(1,3)). Train on the raw text and report test metrics.
ngram_step = ('vectorizer_bow', CountVectorizer(ngram_range=(1, 3)))
nb_step = ('Multi NB', MultinomialNB())
clf = Pipeline([ngram_step, nb_step])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.72      0.95      0.82       276
           1       0.93      0.82      0.87       276
           2       0.91      0.89      0.90       277
           3       0.92      0.76      0.83       276

    accuracy                           0.86      1105
   macro avg       0.87      0.86      0.86      1105
weighted avg       0.87      0.86      0.86      1105



In [42]:
# Spot check: the first five test documents...
X_test[:5]

6905     Donald Trump Agrees Hosting Golf Tournament On...
1013     Pay A-Rod If this were only about Alex Rodrigu...
7168     This Metal Is So Waterproof That Droplets Boun...
569      Taking Stock of BlackRock The core problem wit...
11464    Two Victims Shot On Texas Southern University ...
Name: text, dtype: object

In [43]:
# ...the true labels of the first five test documents...
y_test[:5]

6905     1
1013     1
7168     3
569      0
11464    2
Name: category_num, dtype: int64

In [44]:
# ...and the model's predictions for the first five test documents.
y_pred[:5]

array([1, 1, 3, 0, 2], dtype=int64)

In [45]:
# Add a column holding the preprocess()-ed version of every article
# (stop words and punctuation removed, tokens lemmatized).
df_balanced['processed_text'] = df_balanced['text'].map(preprocess)

In [46]:
# Compare the raw text with its preprocessed counterpart.
df_balanced.head()

Unnamed: 0,text,category,category_num,processed_text
157,American Cities Adding The Most Jobs This Year...,BUSINESS,0,American Cities add job year U.S. economy add ...
9584,"Women in Business: Doris Greif, Regional Vice ...",BUSINESS,0,woman business Doris Greif Regional Vice Presi...
8045,City Of Charleston Steps Up To Help Victims' F...,BUSINESS,0,city Charleston step help victim family big wa...
8205,Leadership Matters: Decision Making Strategie...,BUSINESS,0,leadership matter decision make strategy Suc...
5875,"Intel Now Hiring Way More Women And You Can, T...",BUSINESS,0,Intel hire way woman dear tech company


In [47]:
# Re-split using the preprocessed text as the feature column. The test size,
# seed and stratification match the raw-text split, so the two experiments
# should be evaluated on comparable partitions.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["processed_text"],
    df_balanced["category_num"],
    test_size=0.2,
    random_state=69,
    stratify=df_balanced["category_num"],
)

In [48]:
# Train the 1- to 3-gram Naive Bayes pipeline on the current X_train / y_train
# split and print the classification report for the test split.
ngram_step = ('vectorizer_bow', CountVectorizer(ngram_range=(1, 3)))
nb_step = ('Multi NB', MultinomialNB())
clf = Pipeline([ngram_step, nb_step])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.83      0.92      0.87       276
           1       0.91      0.89      0.90       276
           2       0.89      0.94      0.91       277
           3       0.92      0.80      0.86       276

    accuracy                           0.89      1105
   macro avg       0.89      0.89      0.89      1105
weighted avg       0.89      0.89      0.89      1105

