In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import nltk

# NLTK resources. word_tokenize() (called by remove_stopwords) needs the Punkt
# tokenizer models in addition to the stop-word list; with only 'stopwords'
# present, a fresh environment fails with a LookupError on the first
# tokenize call.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases look the Punkt models up under the name 'punkt_tab'.
# NOTE(review): on older releases this call is expected to just report an
# unknown package and return False rather than raise -- confirm for the
# installed NLTK version.
nltk.download('punkt_tab')


# NOTE: machine-specific absolute location -- point DATA_DIR at your own copy
# of the dataset before running this notebook on another machine.
DATA_DIR = 'C:\\Users\\lenovo\\Desktop\\dataset'
train_df = pd.read_csv(DATA_DIR + '\\train.csv')
test_df = pd.read_csv(DATA_DIR + '\\test.csv')

# Training rows missing a title, text or label are discarded outright.
train_df.dropna(subset=['title', 'text', 'label'], inplace=True)

# After the dropna above these two fillna calls cannot change anything for the
# training frame; they are kept so train and test are prepared the same way.
train_df['clean_title'] = train_df['title'].fillna('')
train_df['clean_text'] = train_df['text'].fillna('')

# Test rows are never dropped, so missing title/text become empty strings.
test_df['clean_title'] = test_df['title'].fillna('')
test_df['clean_text'] = test_df['text'].fillna('')

# A set gives constant-time membership checks inside remove_stopwords().
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Lower-case ``text`` and drop English stop words and non-alphanumeric tokens.

    Args:
        text: Raw string to clean. ``None`` and float NaN (how pandas marks a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` before tokenizing.

    Returns:
        The surviving tokens joined by single spaces ("" when nothing survives
        or the input is missing).
    """
    # Guard against missing cells reaching this function when a caller has not
    # applied fillna('') first. NaN is the only float not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    words = word_tokenize(text.lower())
    # str.isalnum() rejects pure punctuation and any token containing a
    # non-alphanumeric character such as an apostrophe or hyphen.
    filtered_text = [word for word in words if word not in stop_words and word.isalnum()]
    return " ".join(filtered_text)


# Clean both text columns of each frame, then join title and body into the
# single string per row that is fed to the tokenizer.
for frame in (train_df, test_df):
    for column in ('clean_title', 'clean_text'):
        frame[column] = frame[column].apply(remove_stopwords)
    frame['combined_text'] = frame['clean_title'] + " " + frame['clean_text']

# The vocabulary is fitted on the training text only; the test text is encoded
# with that same fitted tokenizer.
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['combined_text'])

train_sequences = tokenizer.texts_to_sequences(train_df['combined_text'])
test_sequences = tokenizer.texts_to_sequences(test_df['combined_text'])

# Every sequence is forced to exactly `maxlen` ids: longer ones lose their
# tail, shorter ones are padded at the end.
maxlen = 50
pad_options = dict(maxlen=maxlen, padding='post', truncating='post')
train_sequences_padded = pad_sequences(train_sequences, **pad_options)
test_sequences_padded = pad_sequences(test_sequences, **pad_options)

train_labels = train_df['label'].values

# Hold out 20% of the labelled data for validation; fixed seed for a
# repeatable split.
X_train, X_val, y_train, y_val = train_test_split(
    train_sequences_padded,
    train_labels,
    test_size=0.2,
    random_state=42,
)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense


# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
# batch shuffling are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

model = Sequential()

# Declare the input shape explicitly. The Embedding `input_length` argument is
# deprecated in recent Keras releases, and without a declared input the model
# stays unbuilt, so model.summary() cannot report output shapes or parameter
# counts.
model.add(tf.keras.Input(shape=(maxlen,)))

# input_dim is read from the tokenizer so the embedding table always matches
# the vocabulary cap that was used to encode the sequences.
model.add(Embedding(input_dim=tokenizer.num_words, output_dim=128))

# Recurrent encoder: return_sequences=False passes on only the final output.
model.add(LSTM(128, return_sequences=False, dropout=0.2, recurrent_dropout=0.2))

# Fully connected head, regularised with dropout.
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))

# Single sigmoid unit, paired with binary cross-entropy below.
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()




In [15]:
# Train the model
from tensorflow.keras.callbacks import EarlyStopping

# Stop once validation loss has gone 3 epochs without improving, and roll the
# model back to the weights from the best epoch seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Up to 20 epochs; early stopping normally ends training sooner.
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
)



Epoch 1/20
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 78ms/step - accuracy: 0.7955 - loss: 0.4114 - val_accuracy: 0.9584 - val_loss: 0.1396
Epoch 2/20
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 72ms/step - accuracy: 0.9648 - loss: 0.1269 - val_accuracy: 0.9510 - val_loss: 0.1780
Epoch 3/20
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 75ms/step - accuracy: 0.9640 - loss: 0.1438 - val_accuracy: 0.9592 - val_loss: 0.1451
Epoch 4/20
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 80ms/step - accuracy: 0.9711 - loss: 0.1129 - val_accuracy: 0.9495 - val_loss: 0.1496


In [17]:
# Score the trained model on the held-out validation split.
val_loss, val_accuracy = model.evaluate(X_val, y_val)
print(f'Validation Loss: {val_loss}', f'Validation Accuracy: {val_accuracy}', sep='\n')

# Predict on the test data: sigmoid outputs above 0.5 become class 1,
# everything else class 0.
test_predictions = (model.predict(test_sequences_padded) > 0.5).astype(int)

[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.9618 - loss: 0.1282
Validation Loss: 0.13955755531787872
Validation Accuracy: 0.9584261178970337
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step
