In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Learn every unigram, bigram and trigram of a single example sentence.
# fit() hands back the vectorizer itself, so building and fitting are chained.
cv = CountVectorizer(ngram_range=(1, 3)).fit(["Thor Hathodawala is looking for a job"])

# Mapping from each learned n-gram to its column index.
cv.vocabulary_

{'thor': 12,
 'hathodawala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hathodawala': 13,
 'hathodawala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hathodawala is': 14,
 'hathodawala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [3]:
# Toy corpus: three short sentences used for the n-gram demonstration.
corpus = ["Thor ate pizza", "Loki is tall", "Loki is eating pizza"]

In [4]:
import spacy

# Load the spaCy pipeline named "en_core_web_sm"; preprocess() uses it to
# tokenize and lemmatize text.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Strip stop words and punctuation from `text` and lemmatize the rest.

    The text is parsed with the module-level spaCy pipeline `nlp`. Each token
    that is neither a stop word nor punctuation is replaced by its lemma
    (base form), and the lemmas are joined into one space-separated string.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(lemmas)

# Quick check: the stop word "is" is dropped and "eating" is reduced to "eat".
preprocess("Loki is eating pizza")

'Loki eat pizza'

In [5]:
# Run every sentence of the toy corpus through preprocess().
corpus_processed = list(map(preprocess, corpus))
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [6]:
# Refit on the cleaned corpus, this time keeping unigrams and bigrams only.
cv = CountVectorizer(ngram_range=(1, 2)).fit(corpus_processed)
cv.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [7]:
# Every unigram and bigram of this sentence was learned during fitting, so
# each of them is counted in the output vector.
cv.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [8]:
# "Hulk" was never seen during fitting, so it and the bigram containing it are
# ignored; only the n-grams already in the vocabulary are counted.
cv.transform(["Hulk eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

In [9]:
import pandas as pd

# Load the news dataset from a JSON file in the working directory.
df = pd.read_json("news_dataset.json")

# (rows, columns) of the loaded frame.
print(df.shape)

# Preview the first rows.
df.head()

(12695, 2)


Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [10]:
# The classes are imbalanced: BUSINESS and SPORTS each have roughly three
# times as many articles as SCIENCE.
df['category'].value_counts()

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

In [11]:
# Balance the dataset by downsampling every category to the size of the rarest
# one. The target size is derived from the data rather than hard-coded, so the
# cell keeps working (and stays balanced) if the dataset changes.
min_samples = int(df['category'].value_counts().min())


def _sample_category(label):
    """Return `min_samples` randomly drawn rows of `df` whose category is `label`.

    A fixed random_state keeps the draw reproducible across runs.
    """
    return df[df['category'] == label].sample(min_samples, random_state=42)


df_business = _sample_category('BUSINESS')
df_sports = _sample_category('SPORTS')
df_crime = _sample_category('CRIME')
df_science = _sample_category('SCIENCE')

In [12]:
# Stack the four equally sized samples row-wise (concat's default axis is 0),
# then confirm that every category now has the same number of rows.
df_balanced = pd.concat([df_business, df_sports, df_crime, df_science])
df_balanced['category'].value_counts()

category
BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: count, dtype: int64

In [19]:
# Encode each category label as an integer class id for the classifier:
# the position of a label in the list below is its id.
df_balanced['category_num'] = df_balanced['category'].map(
    {label: code for code, label in enumerate(['BUSINESS', 'SPORTS', 'CRIME', 'SCIENCE'])}
)

In [20]:
# Preview the balanced frame, including the new numeric label column.
df_balanced.head()

Unnamed: 0,text,category,category_num
594,How to Develop the Next Generation of Innovato...,BUSINESS,0
3093,"Madoff Victims' Payout Nears $7.2 Billion, Tru...",BUSINESS,0
7447,Bay Area Floats 'Sanctuary In Transit Policy' ...,BUSINESS,0
10388,Microsoft Agrees To Acquire LinkedIn For $26.2...,BUSINESS,0
1782,"Inside A Legal, Multibillion Dollar Weed Market",BUSINESS,0


In [21]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced data for testing. Stratifying on the label
# keeps the class proportions the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['text'],
    df_balanced['category_num'],
    test_size=0.2,
    stratify=df_balanced['category_num'],
    random_state=2022,
)

In [22]:
# Size of the training split, followed by a peek at a few raw texts.
print(X_train.shape)
X_train.head()

(4419,)


5230    Fairies, Witches and Astronauts What were all ...
2111    Anticipation: The Psychology of Waiting in Lin...
7443    Jake 'The Snake' Roberts In Intensive Care Aft...
1631    Jeweler Ordered To Pay $34,500 For Trashing Ri...
7066    7 Killed In What May Be Australia's Worst Mass...
Name: text, dtype: object

In [23]:
# Check that the stratified split kept the classes balanced in the training set.
y_train.value_counts()

category_num
3    1105
2    1105
0    1105
1    1104
Name: count, dtype: int64

In [25]:
# Check that the stratified split kept the classes balanced in the test set.
y_test.value_counts()

category_num
1    277
0    276
3    276
2    276
Name: count, dtype: int64

In [26]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

In [28]:
# Bag-of-words counts (CountVectorizer with default settings) feeding a
# multinomial Naive Bayes classifier.
clf = Pipeline(steps=[
    ('vectorizer_bow', CountVectorizer()),
    ('multinomialNB', MultinomialNB()),
])

# Pipeline.fit returns the fitted pipeline, so training and prediction on the
# held-out texts are chained.
y_hat = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.79      0.87      0.83       276
           1       0.92      0.82      0.87       277
           2       0.83      0.91      0.87       276
           3       0.91      0.82      0.86       276

    accuracy                           0.86      1105
   macro avg       0.86      0.86      0.86      1105
weighted avg       0.86      0.86      0.86      1105

