In [110]:
from sklearn.feature_extraction.text import CountVectorizer

v=CountVectorizer()
v.fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor': 5, 'hathodawala': 1, 'is': 2, 'looking': 4, 'for': 0, 'job': 3}

In [112]:
v = CountVectorizer(ngram_range=(1,2))
v.fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor': 9,
 'hathodawala': 2,
 'is': 4,
 'looking': 7,
 'for': 0,
 'job': 6,
 'thor hathodawala': 10,
 'hathodawala is': 3,
 'is looking': 5,
 'looking for': 8,
 'for job': 1}

In [114]:
v = CountVectorizer(ngram_range=(1,3))
v.fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor': 12,
 'hathodawala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hathodawala': 13,
 'hathodawala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hathodawala is': 14,
 'hathodawala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [116]:
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza"
]

In [118]:
import spacy

nlp=spacy.load('en_core_web_sm')

def preprocessing(text):
    doc=nlp(text)

    filtered_tokens=[]
    for token in doc:
        if token.is_stop or token.is_punct:
            continue

        filtered_tokens.append(token.lemma_)
    return " ".join(filtered_tokens)
preprocessing("Thor ate pizza")

'thor eat pizza'

In [119]:
preprocessing("Loki is eating pizza")

'Loki eat pizza'

In [120]:
corpus_processed=[preprocessing(text) for text in corpus]
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [124]:
v=CountVectorizer(ngram_range=(1,2))
v.fit(corpus_processed)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [126]:
v.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [128]:
v.transform(["Hulk eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

In [130]:
import pandas as pd
df=pd.read_json(r"C:\Users\omgai\Downloads\news_dataset.json")
df.head()

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [132]:
df.shape

(12695, 2)

In [134]:
df.category.value_counts()

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

In [136]:
df.isnull().sum()

text        0
category    0
dtype: int64

In [138]:
min_samples=1381
df_business= df[df.category=='BUSINESS'].sample(min_samples, random_state=2022)
df_sports = df[df.category=="SPORTS"].sample(min_samples, random_state=2022)
df_crime = df[df.category=="CRIME"].sample(min_samples, random_state=2022)
df_science = df[df.category=="SCIENCE"].sample(min_samples, random_state=2022)

In [140]:
df_balanced=pd.concat([df_business, df_sports,df_crime,df_science], axis=0)
df_balanced

Unnamed: 0,text,category
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS
2912,From the Other Side; an Honest Review from Emp...,BUSINESS
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS
502,How to Market Your Business While Traveling th...,BUSINESS
5279,How to Leverage Intuition in Decision-making I...,BUSINESS
...,...,...
2178,Aquarium To Monitor Animals' Behavior Changes ...,SCIENCE
5682,How Google Glass Could Save Lives In The Hospi...,SCIENCE
1643,Honda's Gravity Modification Research For us A...,SCIENCE
11428,EVERYONE Loves Alternative Facts THE POWER OF ...,SCIENCE


In [142]:
df_balanced.category.value_counts()

category
BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: count, dtype: int64

In [144]:
df_balanced["category_num"]=df.category.map({'BUSINESS':1,'SPORTS':2, 'CRIME':3, 'SCIENCE':4 })

In [146]:
df_balanced

Unnamed: 0,text,category,category_num
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS,1
2912,From the Other Side; an Honest Review from Emp...,BUSINESS,1
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS,1
502,How to Market Your Business While Traveling th...,BUSINESS,1
5279,How to Leverage Intuition in Decision-making I...,BUSINESS,1
...,...,...,...
2178,Aquarium To Monitor Animals' Behavior Changes ...,SCIENCE,4
5682,How Google Glass Could Save Lives In The Hospi...,SCIENCE,4
1643,Honda's Gravity Modification Research For us A...,SCIENCE,4
11428,EVERYONE Loves Alternative Facts THE POWER OF ...,SCIENCE,4


In [148]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_balanced.text, 
    df_balanced.category_num, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=df_balanced.category_num
)

In [150]:
print(X_train.shape)
X_train.head()

(4419,)


7589     Ovulating Women Prefer Images of Penetration O...
10442    Scientists Discover Spooky Influence On Baby N...
8792     Olympic Race Walker Steps Up To Propose To His...
1733     Beloved Bipedal Bear Named Pedals Believed Kil...
2526     Elizabeth Smart Gave Birth To Baby Girl, Fathe...
Name: text, dtype: object

In [152]:
y_train.value_counts()

category_num
4    1105
3    1105
1    1105
2    1104
Name: count, dtype: int64

In [154]:
y_test.value_counts()

category_num
2    277
1    276
4    276
3    276
Name: count, dtype: int64

In [164]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report 

clf=Pipeline([
    ('vectorizer_bow',CountVectorizer()),
    ('multi nb',MultinomialNB())
])

clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.75      0.87      0.81       276
           2       0.93      0.80      0.86       277
           3       0.83      0.90      0.86       276
           4       0.90      0.80      0.85       276

    accuracy                           0.84      1105
   macro avg       0.85      0.84      0.84      1105
weighted avg       0.85      0.84      0.84      1105



In [168]:
clf=Pipeline([
    ('vectorizer_bow',CountVectorizer(ngram_range=(1,3))),
    ('multi nb',MultinomialNB())
])

clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.67      0.91      0.77       276
           2       0.96      0.73      0.83       277
           3       0.83      0.87      0.85       276
           4       0.93      0.76      0.83       276

    accuracy                           0.82      1105
   macro avg       0.84      0.82      0.82      1105
weighted avg       0.84      0.82      0.82      1105



In [170]:
X_test[:5]

3716     African Nation Slaps Exxon With Fine Nearly 7 ...
608      These Cringe-Worthy Stories Show It Can Be Har...
11172    LISTEN: The Accidental Discovery That Proved T...
1346     Build Loyalty -- The Cost -- $00.00 Remember y...
1356     Man Killed By Michigan Police Wasn't Targeting...
Name: text, dtype: object

In [172]:
y_test[:5]

3716     1
608      4
11172    4
1346     1
1356     3
Name: category_num, dtype: int64

In [174]:
y_pred[:5]

array([1, 1, 4, 1, 3], dtype=int64)

till now we build the model on row text 
and now let's try to build same with preprocessing

In [177]:
df_balanced['preprocessed_text']=df_balanced.text.apply(preprocessing)

In [178]:
df_balanced.head()

Unnamed: 0,text,category,category_num,preprocessed_text
11967,GCC Business Leaders Remain Confident in the F...,BUSINESS,1,GCC Business leader remain confident Face Regi...
2912,From the Other Side; an Honest Review from Emp...,BUSINESS,1,Honest Review Employees wake morning love impo...
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",BUSINESS,1,Mike McDerment CEO FreshBooks Talks give build...
502,How to Market Your Business While Traveling th...,BUSINESS,1,market business travel World recently amazing ...
5279,How to Leverage Intuition in Decision-making I...,BUSINESS,1,leverage intuition decision making feel safe r...


In [181]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_balanced.preprocessed_text, 
    df_balanced.category_num, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=df_balanced.category_num
)

In [185]:
clf=Pipeline([
    ('vectorizer_bow',CountVectorizer(ngram_range=(1,2))),
    ('multi nb',MultinomialNB())
])

clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.79      0.88      0.83       276
           2       0.94      0.82      0.87       277
           3       0.82      0.91      0.86       276
           4       0.91      0.82      0.86       276

    accuracy                           0.86      1105
   macro avg       0.86      0.86      0.86      1105
weighted avg       0.86      0.86      0.86      1105



bY comparing we get an idea how preprocessing helps to build better models

EXERCISE

In [189]:
df=pd.read_csv(r"C:\Users\omgai\Downloads\Fake_Real_Data.csv")
df.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [191]:
df.shape

(9900, 2)

In [193]:
df.label.value_counts()

label
Fake    5000
Real    4900
Name: count, dtype: int64

In [197]:
df['real']=df.label.map({'Fake':0 , 'Real':1}).astype('int64')
df.head()

Unnamed: 0,Text,label,real
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0
4,Democrats say Trump agrees to work on immigrat...,Real,1


In [205]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(df.Text, df.real)

In [207]:
X_train.shape

(7425,)

In [209]:
X_test.shape

(2475,)

In [217]:
from sklearn.neighbors import KNeighborsClassifier
clf=Pipeline([
    ('Count vectorizer', CountVectorizer(ngram_range=(1,3))),
    ('model',KNeighborsClassifier(n_neighbors=10, metric='euclidean'))
])

In [219]:
clf.fit(X_train,y_train)

In [220]:
y_pred=clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.46      0.62      1280
           1       0.63      0.98      0.76      1195

    accuracy                           0.71      2475
   macro avg       0.79      0.72      0.69      2475
weighted avg       0.80      0.71      0.69      2475



In [227]:
from sklearn.neighbors import KNeighborsClassifier
clf=Pipeline([
    ('Count vectorizer', CountVectorizer(ngram_range=(1,3))),
    ('model',KNeighborsClassifier(n_neighbors=10, metric='cosine'))
])
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.50      0.66      1280
           1       0.65      1.00      0.79      1195

    accuracy                           0.74      2475
   macro avg       0.82      0.75      0.73      2475
weighted avg       0.83      0.74      0.72      2475



In [229]:
from sklearn.ensemble import RandomForestClassifier
clf=Pipeline([
    ('Count vectorizer', CountVectorizer(ngram_range=(3,3))),
    ('model',RandomForestClassifier())
])
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1280
           1       0.98      1.00      0.99      1195

    accuracy                           0.99      2475
   macro avg       0.99      0.99      0.99      2475
weighted avg       0.99      0.99      0.99      2475



In [230]:
clf=Pipeline([
    ('vectorizer_bow',CountVectorizer(ngram_range=(1,2))),
    ('multi nb',MultinomialNB(alpha=0.75))
])

clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1280
           1       0.98      0.98      0.98      1195

    accuracy                           0.98      2475
   macro avg       0.98      0.98      0.98      2475
weighted avg       0.98      0.98      0.98      2475



In [None]:
df['preprocessed_Text']=df.Text.apply(preprocessing)
df.head()

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_balanced.preprocessed_Text, 
    df_balanced.category_num, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=df_balanced.category_num
)

In [None]:
clf=Pipeline([
    ('Count vectorizer', CountVectorizer(ngram_range=(3,3))),
    ('model',RandomForestClassifier())
])
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
print(classification_report(y_test,y_pred))

In [None]:
clf=Pipeline([
    ('Count vectorizer', CountVectorizer(ngram_range=(1,3))),
    ('model',RandomForestClassifier())
])
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
print(classification_report(y_test,y_pred))