In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus (a no-op when it is already present locally)
nltk.download('stopwords')

# Read the train and test splits from Google Drive
train_csv_path = '/content/drive/MyDrive/Projects/Natural Language Processing with Disaster Tweets/train.csv'
test_csv_path = '/content/drive/MyDrive/Projects/Natural Language Processing with Disaster Tweets/test.csv'
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

# Basic preprocessing function
# Lazily-filled cache so the NLTK stopword corpus is loaded only once.
_ENGLISH_STOPWORDS = None


def preprocess_text(text, stop_words=None):
    """Normalise a piece of text before tokenisation.

    Steps, in order: lowercase the text, remove runs of digits, remove every
    character that is neither a word character nor whitespace, then drop
    stopwords and re-join the remaining words with single spaces.

    Args:
        text: The raw string to clean.
        stop_words: Optional container of lowercase words to drop. When
            omitted, NLTK's English stopword list is used; it is loaded on
            first use and cached as a frozenset.

    Returns:
        The cleaned string.
    """
    global _ENGLISH_STOPWORDS
    if stop_words is None:
        if _ENGLISH_STOPWORDS is None:
            # Load once: calling stopwords.words() for every word rebuilds the
            # list each time, and list membership is a linear scan.
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOPWORDS

    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return ' '.join(word for word in text.split() if word not in stop_words)

# Clean the text column of both splits in place with the same preprocessing
for frame in (train_data, test_data):
    frame['text'] = frame['text'].apply(preprocess_text)

# Model inputs (cleaned text) and labels
X_train, y_train = train_data['text'], train_data['target']

# Fit the vocabulary on the training text only, then encode every training
# example as a list of word indices padded/truncated to a fixed length
max_len = 100  # Adjust this based on your data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_len)

# Build and train the model
from tensorflow.keras.callbacks import EarlyStopping

# Embedding -> two stacked LSTMs -> single sigmoid unit for binary output.
# input_dim is the vocabulary size + 1 because index 0 is the padding value.
# The sequence length is inferred from the data; the `input_length` argument
# is deprecated in Keras 3 and is not needed here.
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100),
    LSTM(128, return_sequences=True),
    LSTM(64),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Validation loss starts rising after the first epoch while training accuracy
# keeps climbing (overfitting). Stop once val_loss has not improved for two
# epochs and roll back to the best weights so predictions come from the
# best-generalising model rather than the last epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

# Train the model (the last 20% of the training rows are held out for validation)
model.fit(
    X_train_padded,
    y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Encode the test text with the tokenizer fitted on the training text and pad
# it to the same fixed length the model was trained on
test_sequences = tokenizer.texts_to_sequences(test_data['text'])
test_padded = pad_sequences(test_sequences, maxlen=max_len)

# Predict, then threshold the sigmoid outputs at 0.5 to get 0/1 labels
test_predictions = (model.predict(test_padded) > 0.5).astype(int)

# Assemble the submission table: one row per test id with its predicted label
# (predictions are flattened from a column to a 1-D array)
submission = pd.DataFrame({'id': test_data['id'], 'target': test_predictions.flatten()})

# Write the submission file next to the input data
submission_path = '/content/drive/MyDrive/Projects/Natural Language Processing with Disaster Tweets/submission.csv'
submission.to_csv(submission_path, index=False)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/5
191/191 ━━━━━━━━━━━━━━━━━━━━ 55s 258ms/step - accuracy: 0.6696 - loss: 0.6015 - val_accuracy: 0.7978 - val_loss: 0.4626
Epoch 2/5
191/191 ━━━━━━━━━━━━━━━━━━━━ 49s 257ms/step - accuracy: 0.9018 - loss: 0.2554 - val_accuracy: 0.7991 - val_loss: 0.4924
Epoch 3/5
191/191 ━━━━━━━━━━━━━━━━━━━━ 51s 265ms/step - accuracy: 0.9662 - loss: 0.1052 - val_accuracy: 0.7380 - val_loss: 0.5824
Epoch 4/5
191/191 ━━━━━━━━━━━━━━━━━━━━ 80s 255ms/step - accuracy: 0.9845 - loss: 0.0554 - val_accuracy: 0.7459 - val_loss: 0.7548
Epoch 5/5
191/191 ━━━━━━━━━━━━━━━━━━━━ 83s 258ms/step - accuracy: 0.9917 - loss: 0.0333 - val_accuracy: 0.7446 - val_loss: 0.7002
102/102 ━━━━━━━━━━━━━━━━━━━━ 10s 92ms/step
