In [53]:
from sklearn.feature_extraction.text import CountVectorizer

In [54]:
v = CountVectorizer(ngram_range=(1,3))
v.fit(['Thor Hathodwala s looking for a job just like me :)'])
v.vocabulary_

{'thor': 18,
 'hathodwala': 3,
 'looking': 14,
 'for': 0,
 'job': 6,
 'just': 9,
 'like': 12,
 'me': 17,
 'thor hathodwala': 19,
 'hathodwala looking': 4,
 'looking for': 15,
 'for job': 1,
 'job just': 7,
 'just like': 10,
 'like me': 13,
 'thor hathodwala looking': 20,
 'hathodwala looking for': 5,
 'looking for job': 16,
 'for job just': 2,
 'job just like': 8,
 'just like me': 11}

In [55]:
corpus =[
    "thor ate pizza",
    "laila is tall",
    "laila is cooling water",
]

In [56]:
import spacy
nlp = spacy.load("en_core_web_sm")


def preprocessing(text):
    doc = nlp(text)

    filtered_token= []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_token.append(token.lemma_)

    return " ".join(filtered_token)


In [57]:
preprocessing("thor ate pizza")# convert ate to eat the pas verb

'thor eat pizza'

In [58]:
corpus_process = [preprocessing(text) for text in corpus]

In [59]:
corpus_process

['thor eat pizza', 'laila tall', 'laila cool water']

In [60]:
v = CountVectorizer(ngram_range=(1,2))
v.fit(corpus_process)
v.vocabulary_

{'thor': 9,
 'eat': 2,
 'pizza': 7,
 'thor eat': 10,
 'eat pizza': 3,
 'laila': 4,
 'tall': 8,
 'laila tall': 6,
 'cool': 0,
 'water': 11,
 'laila cool': 5,
 'cool water': 1}

In [61]:
vectore = v.transform(["thor ate pizza"])

In [62]:
vectore.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0]], dtype=int64)

In [63]:
import pandas as pd

df = pd.read_json("news_dataset.json"  )
df.head()

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [64]:
df.category.value_counts()

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

In [65]:
manoirity_sample_count = 1381

BUSINESS_sample = df[df.category=="BUSINESS"].sample(manoirity_sample_count , random_state=2024)
SPORTS_sample = df[df.category=="SPORTS"].sample(manoirity_sample_count , random_state=2024)
CRIME_sample = df[df.category=="CRIME"].sample(manoirity_sample_count , random_state=2024)
SCIENCE_sample = df[df.category=="SCIENCE"]

In [66]:
SPORTS_sample

Unnamed: 0,text,category
6560,Will Ferrell Is Really Pumped About The USA-Ge...,SPORTS
4490,Are You Ready For Some L.A. Football? In an er...,SPORTS
9720,Cam Newton Wore An Adorable Shirt Of His Son A...,SPORTS
9233,"WATCH: One Play, Three Errors",SPORTS
130,WATCH: Marshawn Lynch Wants Nothing To Do With...,SPORTS
...,...,...
1150,Stunt Biker Danny MacAskill Turns Scotland Int...,SPORTS
10747,If NFL Quaterbacks Were On Tinder,SPORTS
4657,This Young Carolina Panthers Fan Is Happiness ...,SPORTS
8197,"Become a Millionaire, Start Skateboarding Skat...",SPORTS


In [67]:
df_training_balanced= pd.concat([BUSINESS_sample, SPORTS_sample, CRIME_sample, SCIENCE_sample], axis=0)
df_training_balanced.category.value_counts()

category
BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: count, dtype: int64

In [68]:

df_training_balanced['category_num']=df_training_balanced.category.map(
{'BUSINESS':0, 'SPORTS':1, 'CRIME':2, 'SCIENCE':3}
)

In [69]:
df_training_balanced

Unnamed: 0,text,category,category_num
2487,Amazon To Challenge Alibaba In Global Delivery...,BUSINESS,0
12534,"A Tale of Two Investors Like many investors, P...",BUSINESS,0
10014,"Quit Working So Hard At the end of the day, ho...",BUSINESS,0
4605,The Rating Game It's hard to find a restaurant...,BUSINESS,0
6833,Has Instant Messaging Become More Annoying Tha...,BUSINESS,0
...,...,...,...
12632,New Continent Zealandia Is Discovered Underwat...,SCIENCE,3
12644,Smartphone Lovers More Likely To Forget Things...,SCIENCE,3
12669,Ambitious Test On Tap For Real-Life 'Flying Sa...,SCIENCE,3
12691,This Minivan-Sized Sea Sponge Is Thought To Be...,SCIENCE,3


In [70]:
from sklearn.model_selection import train_test_split

X_train , X_test, y_train , y_test = train_test_split(df_training_balanced.text,df_training_balanced.category_num, 
                                                      test_size=0.2 , random_state=2024,
                                                      stratify=df_training_balanced.category_num)

In [71]:
y_test.value_counts()

category_num
0    277
1    276
3    276
2    276
Name: count, dtype: int64

In [72]:
from  sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


In [78]:
clf= Pipeline([('vectorizer_bow', CountVectorizer(ngram_range=(1,2))),
          ("Multi NB", MultinomialNB())
          ])
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.72      0.91      0.80       277
           1       0.93      0.82      0.87       276
           2       0.90      0.90      0.90       276
           3       0.92      0.79      0.85       276

    accuracy                           0.85      1105
   macro avg       0.87      0.85      0.86      1105
weighted avg       0.87      0.85      0.86      1105



In [74]:
df_training_balanced["preprcessing_text"]=df_training_balanced.text.apply(preprocessing)

In [75]:
df_training_balanced.head()

Unnamed: 0,text,category,category_num,preprcessing_text
2487,Amazon To Challenge Alibaba In Global Delivery...,BUSINESS,0,Amazon challenge Alibaba Global Delivery Marke...
12534,"A Tale of Two Investors Like many investors, P...",BUSINESS,0,Tale Investors like investor Paul Mary trust W...
10014,"Quit Working So Hard At the end of the day, ho...",BUSINESS,0,Quit Working hard end day time purely truly pr...
4605,The Rating Game It's hard to find a restaurant...,BUSINESS,0,Rating Game hard find restaurant place little ...
6833,Has Instant Messaging Become More Annoying Tha...,BUSINESS,0,instant messaging annoying Email 5 step produc...


In [76]:
X_train2 , X_test2, y_train2 , y_test2 = train_test_split(df_training_balanced.preprcessing_text,df_training_balanced.category_num, 
                                                      test_size=0.2 , random_state=2024,
                                                      stratify=df_training_balanced.category_num)

In [79]:
clf2= Pipeline([('vectorizer_bow', CountVectorizer(ngram_range=(1,2))),
          ("Multi NB", MultinomialNB())
          ])
clf.fit(X_train2, y_train2)
y_pred2=clf.predict(X_test2)

print(classification_report(y_test2, y_pred2))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85       277
           1       0.90      0.86      0.88       276
           2       0.86      0.93      0.89       276
           3       0.93      0.82      0.87       276

    accuracy                           0.87      1105
   macro avg       0.88      0.87      0.87      1105
weighted avg       0.88      0.87      0.87      1105

