In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# ngram_range=(1, 2) keeps unigrams (plain bag of words) plus bigrams;
# (2, 2) would keep bigrams only, and so on.
# The default tokenizer ignores single-character tokens, which is why "a" is
# missing from the vocabulary and "for job" shows up as a bigram.
v = CountVectorizer(ngram_range=(1, 2)).fit(["Maayeesha is looking for a job"])
v.vocabulary_

{'maayeesha': 7,
 'is': 2,
 'looking': 5,
 'for': 0,
 'job': 4,
 'maayeesha is': 8,
 'is looking': 3,
 'looking for': 6,
 'for job': 1}

In [38]:
# Tiny toy corpus used to illustrate preprocessing followed by n-gram counting.
corpus = [
    "Maayeesha ate pizza",
    "Farzana is tall",
    "Farzana is eating pizza",
]

In [43]:
import spacy

# Load the English pipeline once; preprocess() below reuses this shared object.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

def preprocess(text):
    """Return `text` as space-joined lemmas with stop words and punctuation removed.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are dropped; every remaining token
    is replaced by its lemma.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(lemmas)

# Quick sanity check of the helper on a single sentence.
sample_sentence = "Farzana is eating piza"
preprocess(sample_sentence)

'Farzana eat piza'

In [44]:
# Run every document of the toy corpus through preprocess().
corpus_processed = list(map(preprocess, corpus))
corpus_processed

['Maayeesha eat pizza', 'Farzana tall', 'Farzana eat pizza']

In [45]:
# Refit a unigram + bigram vectorizer, this time on the preprocessed corpus,
# and inspect the n-gram -> column index mapping it learned.
v = CountVectorizer(ngram_range=(1, 2)).fit(corpus_processed)
v.vocabulary_

{'maayeesha': 5,
 'eat': 0,
 'pizza': 7,
 'maayeesha eat': 6,
 'eat pizza': 1,
 'farzana': 2,
 'tall': 8,
 'farzana tall': 4,
 'farzana eat': 3}

In [48]:
# Encode one document against the fitted vocabulary; each column is the count
# of one learned n-gram.
encoded_known = v.transform(["Maayeesha eat pizza"])
encoded_known.toarray()

array([[1, 1, 0, 0, 0, 1, 1, 1, 0]])

In [49]:
# Encode a sentence that also contains n-grams never seen during fit; those
# have no column in the vocabulary and therefore do not appear in the vector.
encoded_mixed = v.transform(["Farzana eat pizza and Maayeesha is tall"])
encoded_mixed.toarray()

array([[1, 1, 1, 1, 0, 1, 0, 1, 1]])

In [54]:
import os

import pandas as pd

# Location of the News Category dataset (JSON Lines file).
# Set the NEWS_DATASET_PATH environment variable to run this notebook on
# another machine; the fallback is the path the notebook was originally
# written against, so existing runs behave exactly as before.
NEWS_DATASET_PATH = os.environ.get(
    "NEWS_DATASET_PATH",
    "/Users/admin/Downloads/News_Category_Dataset_v3.json",
)

# lines=True: the file stores one JSON object per line, not a single document.
df = pd.read_json(NEWS_DATASET_PATH, lines=True)
print(df.shape)
df.head()

(209527, 6)


Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [56]:
# Number of articles per category, to see how imbalanced the classes are.
df["category"].value_counts()

category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATI

In [60]:
# Down-sample each class of interest to the same number of rows so the
# classifier is trained on a balanced dataset.
# NOTE(review): 1014 is presumably the size of the smallest selected class —
# confirm against the value_counts() output above.
min_samples = 1014


def _sample_category(name):
    """Return `min_samples` randomly chosen rows of category `name` (fixed seed)."""
    in_category = df[df.category == name]
    return in_category.sample(min_samples, random_state=2022)


df_politics = _sample_category("POLITICS")
df_edu = _sample_category("EDUCATION")
df_crime = _sample_category("CRIME")
df_women = _sample_category("WOMEN")

In [62]:
# Stack the four equally sized samples into one frame and verify the balance.
balanced_parts = [df_politics, df_edu, df_crime, df_women]
df_balanced = pd.concat(balanced_parts, axis=0)
df_balanced["category"].value_counts()

category
POLITICS     1014
EDUCATION    1014
CRIME        1014
WOMEN        1014
Name: count, dtype: int64

In [63]:
# Integer label assigned to each category name for the classifier.
category_to_num = {
    "POLITICS": 0,
    "EDUCATION": 1,
    "CRIME": 2,
    "WOMEN": 3,
}
df_balanced["category_num"] = df_balanced["category"].map(category_to_num)

In [64]:
# Preview the balanced frame to confirm the new category_num column is present.
df_balanced.head()

Unnamed: 0,link,headline,category,short_description,authors,date,category_num
16014,https://www.huffingtonpost.com/entry/trump-hea...,The Coverage Of Trump’s Big Dumb Body Is Fat W...,POLITICS,"The president, it evidently needs to be said, ...",Travis Waldron,2018-01-18,0
25545,https://www.huffingtonpost.com/entry/dreamers-...,"Dreamers Are People, Not Political Footballs",POLITICS,People should not be reduced to pawns used by ...,"Center for Community Change Action, Contributo...",2017-09-09,0
51291,https://www.huffingtonpost.com/entry/democrats...,Democrats Must Elect Bernie Sanders Senate Min...,POLITICS,The Senate will meet this Wednesday to elect i...,"Linda Milazzo, ContributorParticipatory journa...",2016-11-13,0
41123,https://www.huffingtonpost.com/entry/jcc-lette...,Jewish Leaders Frustrated By Lack Of Progress ...,POLITICS,"In a letter to Attorney General Jeff Sessions,...",Matt Ferner,2017-03-08,0
10450,https://www.huffingtonpost.com/entry/donald-tr...,Donald Trump Roasted For Painfully Awkward Att...,POLITICS,"Well, that didn't seem to go as planned.",Rebecca Shapiro,2018-04-24,0


In [66]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the balanced data for evaluation; stratifying on the label
# keeps the four classes equally represented in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["short_description"],
    df_balanced["category_num"],
    test_size=0.2,
    random_state=2022,
    stratify=df_balanced["category_num"],
)

In [69]:
# Print the size of the training split, then preview the first descriptions.
# NOTE(review): the preview contains a blank description, so empty texts are
# present in the training data — consider filtering them out before training.
print(X_train.shape)
X_train.head(5)

(3244,)


68242     Los Angeles police investigators obtained a me...
20146     Ivanka Trump helped a fugitive become the lead...
38963     Randi Bergman, 31, found relics left in a clos...
125586                                                     
72285     I am not asking anyone to stop supporting Bern...
Name: short_description, dtype: object

In [71]:
# Check the class balance of the training labels after the stratified split.
y_train.value_counts()

category_num
2    811
0    811
3    811
1    811
Name: count, dtype: int64

In [72]:
from sklearn.naive_bayes import MultinomialNB 
from sklearn.pipeline import Pipeline 
from sklearn.metrics import classification_report 

In [75]:
# Build the model: unigram + bigram counts feeding a multinomial Naive Bayes.
bow_vectorizer = CountVectorizer(ngram_range=(1, 2))
nb_model = MultinomialNB()
classifier = Pipeline(steps=[
    ("vectorizer_bow", bow_vectorizer),
    ("Multi NB", nb_model),
])

# Train on the raw short descriptions.
classifier.fit(X_train, y_train)

# Predict the held-out split and keep the predictions in y_pred.
y_pred = classifier.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.58      0.54       203
           1       0.59      0.82      0.69       203
           2       0.85      0.50      0.63       203
           3       0.69      0.63      0.66       203

    accuracy                           0.63       812
   macro avg       0.66      0.63      0.63       812
weighted avg       0.66      0.63      0.63       812



In [76]:
# First five test descriptions, for a manual look at what the model was given.
X_test.head(5)

72788     It was just another late Friday night at Wichi...
74338     "I marveled at his maturity," a former school ...
29914     "To me, any expression that is empowered and i...
106531    LaVerne Evans Srinivasan, the new Vice Preside...
41732     “This is an individual just trying to make us ...
Name: short_description, dtype: object

In [77]:
# True class ids of the same five test rows.
y_test.head(5)

72788     2
74338     2
29914     3
106531    1
41732     2
Name: category_num, dtype: int64

In [78]:
# Predicted class ids for the first five test rows, to compare with y_test[:5] above.
y_pred[:5]

array([1, 2, 3, 1, 1])

In [79]:
# Store a cleaned copy of every description (stop words and punctuation
# removed, tokens lemmatized by preprocess) next to the raw text.
raw_descriptions = df_balanced["short_description"]
df_balanced["preprocessed_short_description"] = raw_descriptions.apply(preprocess)

In [80]:
# Preview the frame to compare short_description with its preprocessed version.
df_balanced.head()

Unnamed: 0,link,headline,category,short_description,authors,date,category_num,preprocessed_short_description
16014,https://www.huffingtonpost.com/entry/trump-hea...,The Coverage Of Trump’s Big Dumb Body Is Fat W...,POLITICS,"The president, it evidently needs to be said, ...",Travis Waldron,2018-01-18,0,president evidently need say professional athlete
25545,https://www.huffingtonpost.com/entry/dreamers-...,"Dreamers Are People, Not Political Footballs",POLITICS,People should not be reduced to pawns used by ...,"Center for Community Change Action, Contributo...",2017-09-09,0,People reduce pawn official hope garner politi...
51291,https://www.huffingtonpost.com/entry/democrats...,Democrats Must Elect Bernie Sanders Senate Min...,POLITICS,The Senate will meet this Wednesday to elect i...,"Linda Milazzo, ContributorParticipatory journa...",2016-11-13,0,Senate meet Wednesday elect leader
41123,https://www.huffingtonpost.com/entry/jcc-lette...,Jewish Leaders Frustrated By Lack Of Progress ...,POLITICS,"In a letter to Attorney General Jeff Sessions,...",Matt Ferner,2017-03-08,0,letter Attorney General Jeff Sessions ask avai...
10450,https://www.huffingtonpost.com/entry/donald-tr...,Donald Trump Roasted For Painfully Awkward Att...,POLITICS,"Well, that didn't seem to go as planned.",Rebecca Shapiro,2018-04-24,0,plan


In [81]:
# Split again, this time using the preprocessed text as the model input.
# The seed, test fraction and stratification labels match the first split.
preprocessed_text = df_balanced["preprocessed_short_description"]
labels = df_balanced["category_num"]
X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_text,
    labels,
    test_size=0.2,
    random_state=2022,
    stratify=labels,
)

In [82]:
# Same model as before (unigram + bigram counts -> multinomial Naive Bayes),
# now trained on the preprocessed descriptions.
pipeline_steps = [
    ("vectorizer_bow", CountVectorizer(ngram_range=(1, 2))),
    ("Multi NB", MultinomialNB()),
]
classifier = Pipeline(pipeline_steps)

# Fit on the training split.
classifier.fit(X_train, y_train)

# Predict the held-out split and keep the predictions in y_pred.
y_pred = classifier.predict(X_test)

# Per-class precision / recall / F1 on the test split.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.52      0.67      0.59       203
           1       0.68      0.79      0.73       203
           2       0.76      0.50      0.60       203
           3       0.71      0.65      0.68       203

    accuracy                           0.65       812
   macro avg       0.67      0.65      0.65       812
weighted avg       0.67      0.65      0.65       812

