In [6]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Conv1D, MaxPooling1D ,SimpleRNN,  GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences






In [7]:
# Load the dataset
df = pd.read_csv('Constraint_Train (1).csv')

In [8]:
# Preprocess the data
df['label'] = df['label'].map({'real': 1, 'fake': 0})
df['tweet'] = df['tweet'].apply(lambda x: x.lower())
df['tweet'] = df['tweet'].apply(lambda x: re.sub('\[.*?\]', '', x))
df['tweet'] = df['tweet'].apply(lambda x: re.sub("\\W", " ", x))
df['tweet'] = df['tweet'].apply(lambda x: re.sub('http?://\S+|www.\.\S+', '', x))
df['tweet'] = df['tweet'].apply(lambda x: re.sub('<.*?>+', '', x))
df['tweet'] = df['tweet'].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x))
df['tweet'] = df['tweet'].apply(lambda x: re.sub('\n', '', x))
df['tweet'] = df['tweet'].apply(lambda x: re.sub('\w*\d\w*', '', x))



In [9]:
import re
import string

def wordopt(tweet):
    tweet = tweet.lower()
    tweet = re.sub('\[.*?\]', '', tweet)
    tweet = re.sub("\\W", " ", tweet)
    tweet = re.sub('http?://\S+|www.\.\S+', '', tweet)
    tweet = re.sub('<.*?>+', '', tweet)
    tweet = re.sub('[%s]' % re.escape(string.punctuation), '', tweet)
    tweet = re.sub('\n', '', tweet)
    tweet = re.sub('\w*\d\w*', '', tweet)
    return tweet


In [10]:
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(df['tweet'], df['label'], test_size=0.2, random_state=42)



In [11]:
# Tokenize the text
max_words = 5000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train)
x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)



In [12]:
# Pad sequences for uniform length
max_len = 100  # You can adjust this based on your data and model complexity
x_train_pad = pad_sequences(x_train_seq, maxlen=max_len)
x_test_pad = pad_sequences(x_test_seq, maxlen=max_len)



# LSTM 

In [13]:
# Build LSTM model
model_lstm = Sequential()
model_lstm.add(Embedding(max_words, 128, input_length=max_len))
model_lstm.add(SpatialDropout1D(0.2))
model_lstm.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(1, activation='sigmoid'))
model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])







In [14]:
# Train the LSTM model
model_lstm.fit(x_train_pad, y_train, epochs=5, batch_size=32, validation_data=(x_test_pad, y_test))



Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x295eac04690>

In [15]:
loss, accuracy = model_lstm.evaluate(x_test_pad, y_test)
print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.8707165122032166


In [16]:
y_pred = model_lstm.predict(x_test_pad)



In [17]:
y_pred = (y_pred > 0.5).astype(int)

In [18]:
y_test = np.array(y_test)

In [19]:
lstm_predictions = model_lstm.predict(x_test_pad)



In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.76      0.85       596
           1       0.82      0.96      0.89       688

    accuracy                           0.87      1284
   macro avg       0.89      0.86      0.87      1284
weighted avg       0.88      0.87      0.87      1284



# CNN

In [21]:
# ... (previous code for data preprocessing, LSTM model, and common functions)

# Build CNN model
model_cnn = Sequential()
model_cnn.add(Embedding(max_words, 128, input_length=max_len))
model_cnn.add(Conv1D(64, kernel_size=3, activation='relu'))
model_cnn.add(MaxPooling1D(pool_size=2))
model_cnn.add(LSTM(100))
model_cnn.add(Dense(1, activation='sigmoid'))
model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])






In [22]:
# Train the CNN model
model_cnn.fit(x_train_pad, y_train, epochs=5, batch_size=32, validation_data=(x_test_pad, y_test))



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x295f6beead0>

In [23]:
loss, accuracy = model_cnn.evaluate(x_test_pad, y_test)
print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.9104361534118652


In [24]:
y_pred = model_cnn.predict(x_test_pad)



In [25]:
y_pred = (y_pred > 0.5).astype(int)

In [26]:
y_test = np.array(y_test)

In [27]:
cnn_predictions = model_cnn.predict(x_test_pad)



In [28]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.92      0.91       596
           1       0.93      0.90      0.92       688

    accuracy                           0.91      1284
   macro avg       0.91      0.91      0.91      1284
weighted avg       0.91      0.91      0.91      1284



# RNN

In [29]:
model_rnn = Sequential()
model_rnn.add(Embedding(max_words, 128, input_length=max_len))
model_rnn.add(SimpleRNN(100, dropout=0.2, recurrent_dropout=0.2))
model_rnn.add(Dense(1, activation='sigmoid'))
model_rnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [30]:
model_rnn.fit(x_train_pad, y_train, epochs=5, batch_size=32, validation_data=(x_test_pad, y_test))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x295fa48d2d0>

In [31]:
accuracy_rnn = model_rnn.evaluate(x_test_pad, y_test)
print("RNN Accuracy:", accuracy_rnn[1])


RNN Accuracy: 0.8933022022247314


In [32]:
y_pred = model_rnn.predict(x_test_pad)



In [33]:
y_pred = (y_pred > 0.5).astype(int)

In [34]:
y_test = np.array(y_test)

In [35]:
rnn_predictions = model_rnn.predict(x_test_pad)



In [36]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.96      0.89       596
           1       0.96      0.84      0.89       688

    accuracy                           0.89      1284
   macro avg       0.90      0.90      0.89      1284
weighted avg       0.90      0.89      0.89      1284



# LSTM prediction

In [None]:
# Predict the output for a random input
def predict_output(model, text):
    text = wordopt(text.lower())  # Applying wordopt here
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)
    prediction = model.predict(padded_sequence)
    return prediction[0][0]

news = input("Enter a news text: ")
lstm_prediction = predict_output(model_lstm, news)

In [None]:
def output_lable(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "Not A Fake News"

print("\nLSTM Prediction:", output_lable(round(lstm_prediction)))

# CNN prediction

In [None]:
# Predict the output for a random input
def predict_output(model, text):
    text = wordopt(text.lower())  # Applying wordopt here
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)
    prediction = model.predict(padded_sequence)
    return prediction[0][0]

news = input("Enter a news text: ")
cnn_prediction = predict_output(model_cnn, news)

In [None]:
def output_lable(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "Not A Fake News"

print("\nCNN Prediction:", output_lable(round(cnn_prediction)))

# RNN prediction

In [None]:
# Predict the output for a random input
def predict_output(model, text):
    text = wordopt(text.lower())  # Applying wordopt here
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)
    prediction = model.predict(padded_sequence)
    return prediction[0][0]

news = input("Enter a news text: ")
rnn_prediction = predict_output(model_rnn, news)

In [57]:
def output_lable(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "Not A Fake News"

print("\nRNN Prediction:", output_lable(round(rnn_prediction)))


RNN Prediction: Not A Fake News
