In [15]:
import pandas as pd


# Read the training CSV from the working directory into a DataFrame.
df = pd.read_csv("train.csv")

# A bare trailing expression makes the notebook render the frame below the cell.
df

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [16]:
# Print the frame summary: index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [17]:
# Replace every missing value with a placeholder string so that later string
# concatenation never meets NaN.
# The result is rebound to `df` rather than mutating with inplace=True: the
# statement then reads as a plain transformation and works with method chaining.
# NOTE(review): 'inavailable' looks like a misspelling of 'unavailable'. It is
# kept unchanged because the placeholder becomes part of the text the model is
# trained on, so altering it would alter the features.
df = df.fillna('inavailable')

In [18]:
# Re-run the summary to confirm the fill: every column should now report the full row count as non-null.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20800 non-null  object
 2   author  20800 non-null  object
 3   text    20800 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [19]:
# Build one text field per row in the form "<author> <title> <text>".
# Series.add is the method spelling of `+`, so this is the same element-wise
# string concatenation written as a chain.
df['comb'] = (
    df['author']
    .add(" ")
    .add(df['title'])
    .add(" ")
    .add(df['text'])
)

In [20]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
import re 
import string

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
# Build the Porter stemmer and the English stop-word set once, at module level,
# so that clean() reuses them instead of re-creating them on every call.
_stemer = PorterStemmer()
_cached_stopwords = set(stopwords.words("english"))

def clean(text):
    """
    Normalise a raw string into a space-separated sequence of stemmed tokens.

    Processing order:
    1. Every character outside a-z / A-Z is replaced by a space.
    2. The result is lower-cased.
    3. It is split on whitespace into words.
    4. Words present in the cached English stop-word set are discarded.
    5. Each remaining word is Porter-stemmed.
    6. The stems are joined back together with single spaces.

    Returns the resulting string (empty if nothing survives the filtering).
    """
    # Digits, punctuation and any non-ASCII letters all become separators.
    letters_only = re.sub('[^a-zA-Z]', ' ', text)

    # split() with no argument collapses runs of whitespace, so the spaces
    # introduced above never yield empty tokens.
    tokens = letters_only.lower().split()

    # Stop-word filtering happens before stemming, on the lower-cased token.
    kept = (token for token in tokens if token not in _cached_stopwords)

    return " ".join(map(_stemer.stem, kept))

In [23]:
# # !pip install swifter
# import swifter

# df['comb'] = df['comb'].swifter.apply(clean)

# # df['comb'] = df['comb'].apply(lambda x : clean(x))

In [None]:
from multiprocessing import Pool, cpu_count

def parallel_clean(series):
    """
    Apply clean() to every element of `series` and return the results as a list.

    Parameters
    ----------
    series : iterable of str
        The raw documents to clean.

    Returns
    -------
    list of str
        One cleaned string per input element, in input order.

    Notes
    -----
    A process pool is only used when the platform starts workers with "fork".
    With "spawn" or "forkserver" (the default on Windows and macOS) a worker
    has to re-import `__main__` to find `clean`, which is not possible for a
    function defined in a notebook cell; the pool would then hang or fail.
    In that case the work is done serially in the current process instead.
    """
    # Local import: keeps this function self-contained within its cell.
    from multiprocessing import get_start_method

    texts = list(series)
    if not texts:
        # Nothing to do; avoid starting worker processes for an empty input.
        return []

    if get_start_method() != "fork":
        # Safe fallback for interactive sessions on spawn-based platforms.
        return [clean(doc) for doc in texts]

    with Pool(cpu_count()) as p:
        return p.map(clean, texts)

# Replace the combined text with its cleaned form.
# NOTE: this overwrites df['comb'] with its own cleaned version, so re-running
# this cell on its own feeds already-cleaned text through clean() again.
df['comb'] = parallel_clean(df['comb'].tolist())


In [None]:
from keras.preprocessing.text import one_hot

# Three toy sentences used to show what one_hot produces.
texts = [
    'This is an example sentence.',
    'Another sentence for demonstration purposes.',
    'Yet another sentence to encode.'
]

# Size of the index space handed to one_hot for this demo.
vocab_size = 40

# Encode sentence by sentence; results stay in input order.
encoded_texts = list(map(lambda sentence: one_hot(sentence, vocab_size), texts))

# Show each sentence next to its integer encoding.
for text, encoded_text in zip(texts, encoded_texts):
    print('Text:', text)
    print('Encoded:', encoded_text)

In [None]:
voc_size = 50000   # size of the index space handed to one_hot; also the Embedding input_dim
from keras.preprocessing.text import one_hot

# Encode every document in df['comb'] as a list of integer word indices.
# NOTE(review): the Keras docs describe one_hot's default hash function (Python's
# built-in `hash`) as not stable across interpreter runs — confirm. If so, a model
# reloaded in a new session would see different indices for the same words; switch
# this cell and the inference cell together to hash_function='md5'.
text = df['comb']
one_hot_result = [one_hot(words, voc_size) for words in text]

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed sequence length fed to the network.
max_len = 500

# Bring every encoded document to exactly max_len indices; shorter ones are
# padded at the end ('post').
X = pad_sequences(
    one_hot_result,
    maxlen=max_len,
    padding='post',
)

In [None]:
from keras.models import Sequential
from keras.layers import Input,Embedding
from keras.layers import Bidirectional, LSTM
from keras.layers import Dense, Dropout

# Binary classifier over padded index sequences:
# embedding -> bidirectional LSTM -> dropout -> single sigmoid unit.
model = Sequential([
    # `shape` has to be a tuple describing one sample; the bare integer
    # `max_len` is not a valid shape.
    Input(shape=(max_len,)),
    # The sequence length is already fixed by the Input layer, so the
    # deprecated `input_length` argument is not needed.
    Embedding(input_dim=voc_size, output_dim=50),
    Bidirectional(LSTM(100)),
    Dropout(0.1),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Single sigmoid output -> binary cross-entropy loss; accuracy is tracked per epoch.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
from sklearn.model_selection import train_test_split

# Targets come from the `label` column. No earlier cell shown here defines `y`,
# so without this line the split fails with NameError on a fresh kernel.
y = df['label'].values

# Hold out a validation set (train_test_split's default proportion); the fixed
# random_state makes the split repeatable.
X_train, x_valid, y_train, y_valid = train_test_split(X, y, random_state=0)

In [None]:
# Train for a fixed number of epochs, evaluating on the held-out split after each one.
# The returned History object holds the per-epoch metrics.
history = model.fit(
    X_train,
    y_train,
    batch_size=124,
    epochs=20,
    validation_data=(x_valid, y_valid),
)

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation accuracy per epoch, read from the fit history.
fig, ax = plt.subplots(figsize=(10, 4))

train_curve = history.history['accuracy']
valid_curve = history.history['val_accuracy']
ax.plot(train_curve, label='train accuracy')
ax.plot(valid_curve, label='val accuracy')

# Title and axis labels set in one call.
ax.set(title='Model Accuracy', xlabel='Epoch', ylabel='Accuracy')
ax.legend(loc='upper left')
plt.show()

In [None]:
# Persist the trained model to the working directory; a later cell reloads it from this path.
model.save('fake_news_model.keras')

In [None]:
# Hand-entered article used to try out the trained model.
# NOTE(review): title and author match the first row shown in the training data
# (id 0), so this appears to be a training example, not unseen data — confirm.
# NOTE: `text` rebinds a name that an earlier cell used for df['comb'].
title="House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It"
author="Darrell Lucus"
text='''
House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It By Darrell Lucus on October 30, 2016 Subscribe Jason Chaffetz on the stump in American Fork, Utah ( image courtesy Michael Jolley, available under a Creative Commons-BY license) 
With apologies to Keith Olbermann, there is no doubt who the Worst Person in The World is this week–FBI Director James Comey. But according to a House Democratic aide, it looks like we also know who the second-worst person is as well. It turns out that when Comey sent his now-infamous letter announcing that the FBI was looking into emails that may be related to Hillary Clinton’s email server, the ranking Democrats on the relevant committees didn’t hear about it from Comey. They found out via a tweet from one of the Republican committee chairmen. 
As we now know, Comey notified the Republican chairmen and Democratic ranking members of the House Intelligence, Judiciary, and Oversight committees that his agency was reviewing emails it had recently discovered in order to see if they contained classified information. Not long after this letter went out, Oversight Committee Chairman Jason Chaffetz set the political world ablaze with this tweet. FBI Dir just informed me, ""The FBI has learned of the existence of emails that appear to be pertinent to the investigation.
'''

In [None]:
# Assemble the sample in the same "<author> <title> <text>" layout as the training rows.
news = author + " " + title + " " + text

# Same pipeline as for the training data: clean -> hash words to indices ->
# pad to max_len. The single document is wrapped in a list so pad_sequences
# returns a batch of one.
news_clean = clean(news)
news_onehot = one_hot(news_clean, voc_size)
news_seq = pad_sequences(
    [news_onehot],
    maxlen=max_len,
    padding='post',
)

In [None]:
from keras.models import load_model

# Reload the model from the file written by the save cell.
model = load_model('fake_news_model.keras')

# news_seq is already a batch of one padded sequence, so it is passed as-is.
# Wrapping it in a list makes Keras treat it as a list of inputs for a
# single-input model.
prediction = model.predict(news_seq)

# The network ends in one sigmoid unit, so the batch-of-one result holds a
# single value. Extract it explicitly instead of relying on the truth value
# of a one-element array.
score = float(prediction[0][0])

if score > 0.5:
    print ("Fake news !")
else:
    print ("Not fake news !")