# Import Libraries

In [None]:
!pip install flask ngrok tensorflow lime spacy pyngrok
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import pickle
import re
import string

import numpy as np
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

In [None]:
# Load the English NLP model.
# Try the installed package first and only download when spaCy cannot find it,
# so re-running this cell does not re-download the model every time.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spaCy raises OSError when the model package is not installed.
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# Load Data

In [None]:
# Read the fake-news and true-news CSV files from the mounted Drive folder
fake = pd.read_csv('drive/My Drive/final_year_project/fake_dataset.csv')
true = pd.read_csv('drive/My Drive/final_year_project/true_dataset.csv')

# Tag every row with its class: 0 = fake news, 1 = true news
for frame, class_id in ((fake, 0), (true, 1)):
    frame['label'] = class_id

# Stack fake rows above true rows under a fresh 0..n-1 index, then drop the
# title / subject / date columns, which are not used further in this notebook
combined = pd.concat([fake, true], ignore_index=True)
data = combined.drop(columns=['title', 'subject', 'date'])

data

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0
...,...,...
44893,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1
44894,"LONDON (Reuters) - LexisNexis, a provider of l...",1
44895,MINSK (Reuters) - In the shadow of disused Sov...,1
44896,MOSCOW (Reuters) - Vatican Secretary of State ...,1


# Train & Save Model and Tokenizer

In [None]:
# Function to filter text based on POS tags
def filter_pos(text, keep_pos=('NOUN', 'VERB', 'ADJ')):
    """Keep only the tokens of ``text`` whose coarse POS tag is in ``keep_pos``.

    The text is run through the module-level spaCy pipeline ``nlp`` and the
    surviving tokens are joined with single spaces, in their original order.

    Args:
        text: Raw article text. Anything that is not a non-blank string
            (e.g. ``None`` or a NaN produced by ``pd.read_csv`` for an empty
            cell) is treated as "no content".
        keep_pos: Iterable of spaCy ``token.pos_`` values to retain. Defaults
            to nouns, verbs and adjectives.

    Returns:
        A space-separated string of the retained tokens; ``""`` when the input
        is missing, blank, or contains no token with a wanted tag.
    """
    # Guard before calling spaCy: nlp() rejects non-string input, and a blank
    # string can never yield a NOUN/VERB/ADJ token anyway.
    if not isinstance(text, str) or not text.strip():
        return ""
    wanted = frozenset(keep_pos)
    return " ".join(token.text for token in nlp(text) if token.pos_ in wanted)

# Run every article through the POS filter and keep the result in a new column
pos_filtered = data['text'].map(filter_pos)
data['filtered_text'] = pos_filtered

# Preprocess text function
def preprocess_text(text):
    """Normalise a string for tokenisation.

    The text is lower-cased and then passed through a fixed sequence of regex
    substitutions. The order matters: URLs and HTML tags are removed while
    their punctuation is still present, before punctuation itself is stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace runs are collapsed to one space but the
        result is not stripped, so a single leading/trailing space may remain.
    """
    # (pattern, replacement) pairs, applied top to bottom
    cleanup_steps = (
        (r'https?://\S+|www\.\S+', ''),                  # URLs
        (r'<.*?>', ''),                                  # HTML tags
        (r'[%s]' % re.escape(string.punctuation), ''),   # ASCII punctuation
        (r'\w*\d\w*', ''),                               # any word containing a digit
        (r'\s+', ' '),                                   # collapse whitespace runs
    )
    cleaned = text.lower()
    for pattern, replacement in cleanup_steps:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Clean the POS-filtered text in place (same column is overwritten)
cleaned_text = data['filtered_text'].map(preprocess_text)
data['filtered_text'] = cleaned_text

data

Unnamed: 0,text,label,filtered_text
0,Donald Trump just couldn t wish all Americans ...,0,wish leave had give shout enemies haters disho...
1,House Intelligence Committee Chairman Devin Nu...,0,going have bad day assumption many dossier pro...
2,"On Friday, it was revealed that former Milwauk...",0,revealed former considered administration has ...
3,"On Christmas day, Donald Trump announced that ...",0,day announced work following day golfing fourt...
4,Pope Francis used his annual Christmas Day mes...,0,used annual message rebuke mentioning name del...
...,...,...,...
44893,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1,allies welcomed decision commit more forces pa...
44894,"LONDON (Reuters) - LexisNexis, a provider of l...",1,provider legal regulatory business information...
44895,MINSK (Reuters) - In the shadow of disused Sov...,1,shadow disused era factories street lined ecle...
44896,MOSCOW (Reuters) - Vatican Secretary of State ...,1,said was positive momentum idea visiting sugge...


In [None]:
# Tokenization and Padding
# Hyperparameters shared by the tokenizer, the padding step and the network.
# Keeping them in one place guarantees the Embedding layer always matches the
# tokenizer's vocabulary cap and the padded sequence length.
VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 500
EMBEDDING_DIM = 128
SEED = 42

# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation,
# dropout and batch shuffling are repeatable between runs.
set_random_seed(SEED)

y = data['label'].values

# Split the data
# Row positions are split first so the tokenizer can be fit on training rows
# only. For a fixed random_state the shuffle depends only on the number of
# rows, so this yields the same train/test partition as splitting X and y.
train_idx, test_idx = train_test_split(
    np.arange(len(data)), test_size=0.2, random_state=SEED
)

# Learn the vocabulary from the training texts only; fitting on every row would
# let word statistics from the held-out test set leak into the features.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(data['filtered_text'].iloc[train_idx])
sequences = tokenizer.texts_to_sequences(data['filtered_text'])

X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# CNN model: embedding -> 1-D convolution -> global max pool -> dropout -> sigmoid
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# validation_split holds out part of X_train for per-epoch validation;
# X_test is never seen during fitting.
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Write the trained network to an HDF5 file in the working directory
model.save('fake_news_model.h5')

# Pickle the fitted tokenizer alongside it so the same word index can be reused
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  saving_api.save_model(


In [None]:
# Score the held-out test set
from sklearn.metrics import classification_report, accuracy_score

# The sigmoid outputs are thresholded at 0.5 to obtain hard 0/1 predictions
predicted_probabilities = model.predict(X_test)
y_pred = (predicted_probabilities > 0.5).astype("int32")

report = classification_report(y_test, y_pred)
print("\n", report)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


               precision    recall  f1-score   support

           0       0.98      0.98      0.98      4733
           1       0.98      0.98      0.98      4247

    accuracy                           0.98      8980
   macro avg       0.98      0.98      0.98      8980
weighted avg       0.98      0.98      0.98      8980

Accuracy: 0.9791759465478842


# Save Model and Tokenizer to Google Drive

In [None]:
# Write the trained network to the project folder on Google Drive
model.save('drive/My Drive/final_year_project/fake_news_model.h5')

# Pickle the fitted tokenizer into the same Drive folder
with open('drive/My Drive/final_year_project/tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)


  saving_api.save_model(
