In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Read the two source files and tag each row with its origin:
# rows from True.csv get category 1, rows from Fake.csv get category 0.
df1 = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/True.csv").assign(category=1)
df2 = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/Fake.csv").assign(category=0)
print(df1.shape, df2.shape)

(21417, 5) (23481, 5)


In [3]:
# Preview the first five rows loaded from True.csv.
df1.head(n=5)

Unnamed: 0,title,text,subject,date,category
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [4]:
# Stack the two labelled frames into a single dataset.
# ignore_index=True gives the result a fresh 0..n-1 index. Keeping the source
# indexes would leave every label that occurs in both frames duplicated, so a
# label-based lookup such as df.loc[0] would return more than one row.
df = pd.concat([df1, df2], ignore_index=True)
df.head()

Unnamed: 0,title,text,subject,date,category
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [9]:
# Number of rows per category label in the combined frame.
df["category"].value_counts()

category
0    23481
1    21417
Name: count, dtype: int64

In [6]:
# 80/20 split of the raw `text` column, stratified on `category` so both
# partitions keep the same label proportions; the seed is fixed at 2022.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["category"],
    test_size=0.2,
    random_state=2022,
    stratify=df["category"],
)


In [7]:
# Number of rows per category label in the held-out test labels.
y_test.value_counts()

category
0    4696
1    4284
Name: count, dtype: int64

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Counts of 1- to 3-grams fed into a 10-nearest-neighbour classifier
# that compares documents with the Euclidean metric.
knn_euclidean_steps = [
    ('Vectorizer', CountVectorizer(ngram_range=(1, 3))),
    ('KNN', KNeighborsClassifier(n_neighbors=10, metric='euclidean')),
]
clf = Pipeline(steps=knn_euclidean_steps)
clf.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.83      0.79      4696
           1       0.79      0.70      0.74      4284

    accuracy                           0.77      8980
   macro avg       0.77      0.77      0.77      8980
weighted avg       0.77      0.77      0.77      8980



In [9]:
# Same 1- to 3-gram count features and k=10 as the Euclidean KNN pipeline,
# but neighbours are ranked with the cosine metric instead.
knn_cosine_steps = [
    ('Vectorizer', CountVectorizer(ngram_range=(1, 3))),
    ('KNN', KNeighborsClassifier(n_neighbors=10, metric='cosine')),
]
clf2 = Pipeline(steps=knn_cosine_steps)
clf2.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
y_pred = clf2.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.64      0.99      0.78      4696
           1       0.96      0.40      0.57      4284

    accuracy                           0.71      8980
   macro avg       0.80      0.69      0.67      8980
weighted avg       0.80      0.71      0.68      8980



In [10]:
from sklearn.ensemble import RandomForestClassifier

# Trigram-only count features fed into a random forest.
# The forest is randomised (bootstrap samples and feature subsets), so it is
# seeded with the same value used for the train/test split; without a seed the
# printed report changes from run to run.
clf3 = Pipeline([
    ('Vectorizer', CountVectorizer(ngram_range=(3, 3))),
    ('RandomForest', RandomForestClassifier(random_state=2022)),
])
clf3.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
y_pred = clf3.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98      4696
           1       0.97      0.97      0.97      4284

    accuracy                           0.97      8980
   macro avg       0.97      0.97      0.97      8980
weighted avg       0.97      0.97      0.97      8980



In [11]:
from sklearn.naive_bayes import MultinomialNB

# Unigram + bigram counts fed into multinomial naive Bayes with
# additive smoothing alpha=0.75.
nb_steps = [
    ('Vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('MultinomialNB', MultinomialNB(alpha=0.75)),
]
clf4 = Pipeline(steps=nb_steps)
clf4.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
y_pred = clf4.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      4696
           1       0.97      0.97      0.97      4284

    accuracy                           0.97      8980
   macro avg       0.97      0.97      0.97      8980
weighted avg       0.97      0.97      0.97      8980



In [5]:
import spacy

# Load the small English spaCy model and bind the pipeline to `nlp`.
# Only stop-word / punctuation flags and lemmas are read from the tokens in
# this notebook, so the dependency parser and the named-entity recogniser are
# switched off: they are the unused components and skipping them makes
# processing each document considerably cheaper.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def preprocess(text):
    """Return `text` reduced to its lemmas, joined by single spaces.

    The text is tokenised by the module-level `nlp` pipeline; tokens flagged
    as stop words or punctuation are dropped and every remaining token is
    replaced by its lemma.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(lemmas)

In [6]:
# Run every article body through `preprocess` and store the result in a new
# `preprocessed_txt` column next to the original text.
preprocessed_txt = df["text"].apply(preprocess)
df["preprocessed_txt"] = preprocessed_txt
df.head()

Unnamed: 0,title,text,subject,date,category,preprocessed_txt
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1,WASHINGTON Reuters head conservative republica...
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1,WASHINGTON Reuters Transgender people allow ti...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1,WASHINGTON Reuters special counsel investigati...
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1,WASHINGTON Reuters trump campaign adviser Geor...
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1,SEATTLE WASHINGTON Reuters President Donald Tr...


In [8]:
# 80/20 split of the `preprocessed_txt` column, stratified on `category`,
# with the seed fixed at 2022. This rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    df["preprocessed_txt"],
    df["category"],
    test_size=0.2,
    random_state=2022,
    stratify=df["category"],
)


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Trigram-only count features fed into a random forest, trained on whatever
# X_train / y_train currently hold.
# NOTE: this rebinds `clf4`, which an earlier cell uses for a MultinomialNB
# pipeline; after this cell runs, `clf4` refers to the random forest.
# The forest is seeded with the same value used for the train/test split so
# the printed report is reproducible from run to run.
clf4 = Pipeline([
    ('Vectorizer', CountVectorizer(ngram_range=(3, 3))),
    ('RandomForest', RandomForestClassifier(random_state=2022)),
])
clf4.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
y_pred = clf4.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.98      0.95      4696
           1       0.97      0.92      0.94      4284

    accuracy                           0.95      8980
   macro avg       0.95      0.95      0.95      8980
weighted avg       0.95      0.95      0.95      8980



In [14]:
from sklearn.naive_bayes import MultinomialNB

# Counts of 1- to 3-grams fed into multinomial naive Bayes with
# additive smoothing alpha=0.75.
nb_trigram_steps = [
    ('Vectorizer', CountVectorizer(ngram_range=(1, 3))),
    ('MultinomialNB', MultinomialNB(alpha=0.75)),
]
clf5 = Pipeline(steps=nb_trigram_steps)
clf5.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
y_pred = clf5.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      4696
           1       0.98      0.98      0.98      4284

    accuracy                           0.98      8980
   macro avg       0.98      0.98      0.98      8980
weighted avg       0.98      0.98      0.98      8980

