<a href="https://colab.research.google.com/github/lmarieta/disaster_tweet/blob/main/NLP_tweet_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [252]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate, LSTM, Embedding, Flatten, Dropout
from keras.regularizers import l2
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import io
from scipy.sparse import hstack
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score
from keras.callbacks import LearningRateScheduler
from keras.optimizers import Adam

In [145]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
# NOTE(review): the data-loading cell below reads from /content/, not from the
# mounted drive — confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [146]:
# Load the labelled training tweets and the unlabelled submission tweets.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/train.csv", "/content/test.csv")
)

In [147]:
# Show the second tweet labelled 0.
train_df.loc[train_df["target"] == 0, "text"].values[1]

'I love fruits'

In [148]:
# Show the second tweet labelled 1.
train_df.loc[train_df["target"] == 1, "text"].values[1]

'Forest fire near La Ronge Sask. Canada'

In [149]:
# Bag-of-words demo: learn a vocabulary from the first five tweets only and
# count how often each vocabulary word occurs in each of them.
count_vectorizer = feature_extraction.text.CountVectorizer()
first_five_tweets = train_df["text"].iloc[:5]
example_train_vectors = count_vectorizer.fit_transform(first_five_tweets)

In [150]:
## we use .todense() here because these vectors are "sparse" (only non-zero elements are kept to save space)
# The count matrix is sparse (only non-zero entries are stored), so densify the
# first tweet's row once and print both its shape and its contents.
first_tweet_counts = example_train_vectors[0].todense()
print(first_tweet_counts.shape)
print(first_tweet_counts)

(1, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [151]:
# Raw tweet text and labels for the labelled set, plus the text to predict on.
X, y = train_df["text"], train_df["target"]
X_submission = test_df["text"]

In [177]:
# Convert binary vectors to TF-IDF representation
tfidf_vectorizer = TfidfVectorizer(binary=True, max_features=5000)  # Adjust max_features as needed
X_tfidf = tfidf_vectorizer.fit_transform(X)
X_submission_tfidf = tfidf_vectorizer.fit_transform(X_submission)

In [178]:
# Hold out 20% of the raw tweets and labels with a fixed seed.
(X_train, X_test, y_train, y_test) = train_test_split(X, y, random_state=42, test_size=0.2)

In [190]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

# Tokenize and pad sequences for training data
sequences_train = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')
sequences_submission = tokenizer.texts_to_sequences(X_submission)
X_submission_padded = pad_sequences(sequences_submission, maxlen=max_sequence_length, padding='post', truncating='post')

# Load pre-trained GloVe word embeddings
glove_path = '/content/glove.6B.50d.txt' # '/content/gdrive/MyDrive/glove.6B.50d.txt' # Replace with the path to your downloaded GloVe file
embedding_dim = 50  # Should match the dimension of the GloVe file you downloaded

embedding_index = {}
with open(glove_path, 'r', encoding='utf-8') as file:
    for line in file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Create an embedding matrix
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, index in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

# Create the model input
X_word_embeddings = embedding_matrix[X_train_padded]
X_submission_word_embeddings = embedding_matrix[X_submission_padded]

In [191]:
# Hold out 20% of the labelled rows. The same seeded shuffle is applied to all
# three inputs, so TF-IDF rows, embedding rows and labels stay aligned.
(
    X_train_tfidf,
    X_test_tfidf,
    X_train_word_embeddings,
    X_test_word_embeddings,
    y_train,
    y_test,
) = train_test_split(X_tfidf, X_word_embeddings, y, random_state=42, test_size=0.2)

In [192]:
# Create the model input for word embeddings
word_embeddings_input = Input(shape=(max_sequence_length, embedding_dim))  # Fix the input shape
word_embeddings_output = LSTM(64)(word_embeddings_input)
word_embeddings_model = Model(inputs=word_embeddings_input, outputs=word_embeddings_output)


In [199]:
# Reshape the word embeddings to match the number of features in TF-IDF
X_train_word_embeddings_flat = X_train_word_embeddings.reshape((X_train_word_embeddings.shape[0], -1))
X_test_word_embeddings_flat = X_test_word_embeddings.reshape((X_test_word_embeddings.shape[0], -1))
X_submission_embeddings_flat = X_submission_word_embeddings.reshape((X_submission_word_embeddings.shape[0], -1))

In [289]:
# Densify the sparse TF-IDF matrices so they can be joined with the embeddings.
X_train_tfidf_dense, X_test_tfidf_dense, X_submission_tfidf_dense = (
    sparse_features.toarray()
    for sparse_features in (X_train_tfidf, X_test_tfidf, X_submission_tfidf)
)

# Put the TF-IDF columns and the flattened embedding columns side by side.
X_train_combined = np.hstack((X_train_tfidf_dense, X_train_word_embeddings_flat))
X_test_combined = np.hstack((X_test_tfidf_dense, X_test_word_embeddings_flat))
X_submission_combined = np.hstack((X_submission_tfidf_dense, X_submission_embeddings_flat))

# Feed-forward classifier: three identical Dense(128, relu, L2) + Dropout(0.6)
# stages followed by a single sigmoid unit.
model_input = Input(shape=(X_train_combined.shape[1],))
dropout_layer = model_input
for stage in range(3):
    dense_layer = Dense(128, activation='relu', kernel_regularizer=l2(0.005))(dropout_layer)
    dropout_layer = Dropout(0.6)(dense_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)
model = Model(inputs=model_input, outputs=output_layer)

In [290]:
def linear_decay(epoch):
    """Return the learning rate for ``epoch`` under a clamped linear schedule.

    The rate starts at 0.02 for epoch 0, decreases by 0.002 with every epoch,
    and is never allowed to fall below 0.0001.
    """
    start_rate, step, floor = 0.02, 0.002, 0.0001  # tune to taste
    scheduled = start_rate - epoch * step
    if scheduled < floor:
        return floor
    return scheduled

In [None]:
# The LearningRateScheduler callback sets the optimizer's learning rate from
# linear_decay() at the start of every epoch, so whatever value Adam is created
# with is replaced before the first batch. Seed Adam with the schedule's own
# starting value so the configuration shown here matches what training uses.
initial_learning_rate = linear_decay(0)
final_lr = 0.001  # NOTE(review): not read by any visible cell; the schedule's floor is set inside linear_decay
optimizer = Adam(learning_rate=initial_learning_rate)

# Define the learning rate scheduler
lr_scheduler = LearningRateScheduler(linear_decay)

# Compile and train the model; 20% of the training rows are held back by Keras
# for per-epoch validation.
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_combined, y_train, epochs=20, batch_size=32, validation_split=0.2, callbacks=[lr_scheduler])


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Get predictions on the training data
# Predicted probabilities for the held-out test split.
y_pred_test = model.predict(X_test_combined)

# Turn probabilities into hard 0/1 labels: strictly above the threshold -> 1.
threshold = 0.5
y_pred_binary = np.greater(y_pred_test, threshold).astype(int)

# Report the F1 score on the held-out split.
f1 = f1_score(y_test, y_pred_binary)
print("F1 Score:", f1)

In [203]:
# Template submission file whose "target" column is filled in further down.
sample_submission = pd.read_csv(filepath_or_buffer="/content/sample_submission.csv")

In [204]:
# Get predictions on the training data
# Predicted probabilities for the unlabelled submission tweets.
y_pred_submission = model.predict(X_submission_combined)

# Turn probabilities into hard 0/1 labels: strictly above the threshold -> 1.
threshold = 0.5
y_pred_binary = np.greater(y_pred_submission, threshold).astype(int)



In [205]:
# Sanity check: (number of submission tweets, tokens kept per tweet).
print(np.shape(X_submission_padded))

(3263, 20)


In [208]:
# Write the thresholded predictions into the submission frame's "target" column.
# NOTE(review): y_pred_binary is also assigned in the evaluation cell; this
# presumably expects the value from the submission-prediction cell — make sure
# that cell ran last.
sample_submission["target"] = y_pred_binary

In [207]:
# Preview the first five rows of the submission frame.
sample_submission.iloc[:5]

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [209]:
# Save the submission without the DataFrame index column.
sample_submission.to_csv(path_or_buf="/content/submission.csv", index=False)