<a href="https://colab.research.google.com/github/muqarrab469/insightshield/blob/main/TextModelTraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Model Training and Testing**

Google Drive Mounting:

In [None]:
# Mount Google Drive at /content/drive so files stored on Drive can be read
# and written through ordinary filesystem paths (google.colab is Colab-only).
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Dataset Loading:

In [None]:
import pandas as pd

# Locations of the two source CSV files on the mounted Drive
real_news_path = '/content/drive/My Drive/FYP/FYPDataSet/TrueNewsData.csv'
fake_news_path = '/content/drive/My Drive/FYP/FYPDataSet/FakeNewsData.csv'

# Read both files with the same encoding; each lands in its own DataFrame
real_df, fake_df = (
    pd.read_csv(csv_path, encoding='ISO-8859-1')
    for csv_path in (real_news_path, fake_news_path)
)

# No label column is added here: the training cell reads the existing
# 'news_label' column from the CSV data instead.

# Stack real and fake news into one frame with a fresh 0..n-1 index
df = pd.concat([real_df, fake_df], ignore_index=True)


Data Preprocessing:

In [None]:
!pip install beautifulsoup4
!pip instal pandas
!pip install nltk

import pandas as pd
import re
import string
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import unicodedata

# Fetch the NLTK resources used by preprocess_text:
#   stopwords - the English stopword list
#   punkt     - tokenizer models used by word_tokenize on older NLTK releases
#   punkt_tab - tokenizer data required by word_tokenize on current NLTK releases
# nltk is installed unpinned above, so both tokenizer resources are requested
# to keep the notebook runnable whichever version pip resolves.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)


#Function for Preprocessing:
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Normalize one raw news string into space-separated lowercase tokens.

    Steps, in order: strip HTML markup, lowercase, drop ASCII punctuation,
    drop digits, drop non-ASCII characters, tokenize, remove English
    stopwords, and re-join the remaining tokens with single spaces.

    Args:
        text: The raw title or body text. ``None`` and NaN (how pandas
            represents an empty CSV cell) are treated as empty text; any
            other non-string value is converted with ``str()``.

    Returns:
        The cleaned text as a single string ('' for missing input).
    """
    # Missing cells would otherwise crash inside BeautifulSoup, which
    # expects a string; NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    #To remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    #To lowercase data
    text = text.lower()

    #To remove punctuation marks
    text = text.translate(str.maketrans('', '', string.punctuation))

    #To remove numbers using regular expression
    text = re.sub(r'\d+', '', text)

    #To remove non-ASCII characters
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

    #Tokenization
    tokens = word_tokenize(text)

    #To remove stopwords (looked up in a cached set instead of calling
    #stopwords.words('english') again for every single token)
    stop_words = _english_stopwords()
    tokens = [word for word in tokens if word not in stop_words]

    #To join tokens
    return ' '.join(tokens)


# Clean each raw column into its processed counterpart (title first, then body)
for raw_col, clean_col in (
    ('news_title', 'processed_title'),
    ('news_text', 'processed_text'),
):
    df[clean_col] = df[raw_col].apply(preprocess_text)

# Show raw and processed columns side by side
preview_columns = ['news_title', 'processed_title', 'news_text', 'processed_text']
print(df[preview_columns])


ERROR: unknown command "instal" - maybe you meant "install"


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()


                                              news_title  \
0      A dairy-for-lumber deal? Think-tank paper prop...   
1      A look at what's driving cooling electric vehi...   
2      Air Canada lands last in on-time flights in ra...   
3      Alberta's carbon capture rollout plan criticiz...   
4      'America First' strategy is raising hackles, a...   
...                                                  ...   
19992  YOU GOTTA LOVE THIS: [VIDEO] White Girl Told S...   
19993  YOU WON?T BELIEVE Why Students In Communist Wi...   
19994  Yahoo caves in to NSA, FBI ? and secretly moni...   
19995  YALE DISFIGURES STONE CARVING to Disarm Purita...   
19996  Trump's $175M Bond Rejected, Filled With Filin...   

                                         processed_title  \
0      dairyforlumber deal thinktank paper proposes c...   
1      look whats driving cooling electric vehicle sa...   
2      air canada lands last ontime flights ranking n...   
3      albertas carbon capture rollout 

Model Training:

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

#Dataframe loading after preprocessing
titles = df['processed_title'].values
texts = df['processed_text'].values
df['label'] = df['news_label'].map({'Real': 1, 'Fake': 0})
# .map() yields NaN for any value other than 'Real'/'Fake'; fail loudly here
# instead of feeding NaN targets into the loss function.
if df['label'].isna().any():
    unknown_labels = sorted(df.loc[df['label'].isna(), 'news_label'].astype(str).unique())
    raise ValueError(f"Unexpected news_label values: {unknown_labels}")
labels = df['label'].values


#Combining title and text:
combined_texts = [title + " " + text for title, text in zip(titles, texts)]

#Splitting row positions into training and testing parts first, so that the
#tokenizer vocabulary below is learned from the training rows only
train_idx, test_idx = train_test_split(
    list(range(len(combined_texts))), test_size=0.2, random_state=42
)

#Tokenizing: the vocabulary is fitted on training texts only, then applied to
#every row (test-only words never influence which words are kept)
vocab_size = 5000  # shared by the Tokenizer and the Embedding layer below
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts([combined_texts[i] for i in train_idx])
sequences = tokenizer.texts_to_sequences(combined_texts)

#Padding sequences
maxlen = 1000
padded_sequences = pad_sequences(sequences, maxlen=maxlen)

#Selecting the training and testing rows
X_train, X_test = padded_sequences[train_idx], padded_sequences[test_idx]
y_train, y_test = labels[train_idx], labels[test_idx]

#Building the 1D CNN model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=maxlen))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

#Training the model
#Validation uses a slice of the training data rather than the test set, so the
#test set stays unseen until the final evaluation below.
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1)

#Testing:
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 0.981249988079071


Model Saving

In [None]:
#Model saving .h5 format
# Writes the trained `model` to Google Drive as a single .h5 file.
# NOTE(review): this path uses 'MyDrive' while the dataset paths use
# 'My Drive' — confirm both resolve to the same Drive folder.
model.save('/content/drive/MyDrive/FYP/news_detection_model.h5')


  saving_api.save_model(
