<a href="https://colab.research.google.com/github/lmarieta/disaster_tweet/blob/main/NLP_tweet_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [526]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate, LSTM, Embedding, Flatten, Dropout
from keras.regularizers import l2
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import io
from scipy.sparse import hstack
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score
from keras.callbacks import LearningRateScheduler
from keras.optimizers import Adam

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
# Only needed when input files are read from Drive (e.g. the alternative
# GloVe path under /content/gdrive/MyDrive mentioned further down).
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Load the training and test CSV files from the Colab working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/train.csv", "/content/test.csv")
)

In [None]:
# Display the training DataFrame as the cell output for a quick visual check.
train_df

In [None]:
# Restrict the training frame to these five columns, in this order.
kept_columns = ['id', 'keyword', 'location', 'text', 'target']
train_df = train_df[kept_columns]


In [None]:
# Show the second tweet whose label is 0 (cell output).
# The label is compared as text via astype(str) so the filter matches whether
# read_csv parsed `target` as integers or left it as strings.  Comparing a
# numeric column directly with '0' selects no rows, and .values[1] then
# raises IndexError.
is_label_0 = train_df["target"].astype(str) == '0'
train_df.loc[is_label_0, "text"].values[1]

In [None]:
# Show the second tweet whose label is 1 (cell output).
# The label is compared as text via astype(str) so the filter matches whether
# read_csv parsed `target` as integers or left it as strings.  Comparing a
# numeric column directly with '1' selects no rows, and .values[1] then
# raises IndexError.
is_label_1 = train_df["target"].astype(str) == '1'
train_df.loc[is_label_1, "text"].values[1]

In [None]:
# Bag-of-words vectorizer with scikit-learn's default settings.
count_vectorizer = feature_extraction.text.CountVectorizer()

# Demo only: learn the vocabulary from the first 5 tweets and count their tokens.
first_five_tweets = train_df["text"][0:5]
example_train_vectors = count_vectorizer.fit_transform(first_five_tweets)

In [None]:
# The count vectors are stored sparsely (only non-zero entries are kept), so
# densify the first one once and print its shape followed by its contents.
first_vector_dense = example_train_vectors[0].todense()
print(first_vector_dense.shape)
print(first_vector_dense)

In [None]:
# Training text and labels, plus the text of the tweets to score for the
# submission.  Missing text (NaN) is replaced by an empty string.
#
# fillna() is used in its returning form rather than with inplace=True:
# whether an in-place fill on a column pulled out of a DataFrame also writes
# through to the parent frame depends on the pandas version (Copy-on-Write),
# so plain assignment keeps the outcome unambiguous.
X = train_df["text"].fillna('')
y = train_df["target"]
X_submission = test_df['text'].fillna('')

In [None]:
def map_to_binary(value):
    """Convert a raw ``target`` label to the integer class ``0`` or ``1``.

    Returns ``1`` when *value* is numerically equal to one: the string
    ``'1'``, the numbers ``1`` / ``1.0`` (including NumPy scalars), or text
    such as ``'1.0'``.  Returns ``0`` for everything else, including missing,
    non-numeric or otherwise unexpected values.

    Recognising only the strings ``'0'`` and ``'1'`` would silently turn every
    label into ``0`` whenever the column holds numbers, which is how
    ``pandas.read_csv`` parses a purely numeric column.
    """
    try:
        is_positive = float(value) == 1
    except (TypeError, ValueError, OverflowError):
        # None, free text, containers, out-of-range ints: not a usable label.
        return 0
    return 1 if is_positive else 0

In [None]:
# Hold out 20% of the labelled tweets as a test split (fixed seed for
# reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Normalise the raw labels of both splits to integer 0/1 classes.
y_train = y_train.map(map_to_binary)
y_test = y_test.map(map_to_binary)

In [None]:
# TF-IDF features limited to 5000 vocabulary terms (adjust max_features as
# needed).  binary=True is passed through to the vectorizer's term counting.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, binary=True)

# Fit on the training split only, then apply the fitted vectorizer unchanged
# to the test split and the submission tweets.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
X_submission_tfidf = tfidf_vectorizer.transform(X_submission)

In [None]:
# Build the word -> integer-id vocabulary from the training texts only; the
# tokenizer is configured with '<OOV>' as its out-of-vocabulary token.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Integer-encode every split with that single vocabulary.
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)
sequences_submission = tokenizer.texts_to_sequences(X_submission)

# All splits are padded / truncated at the end to the length of the longest
# training sequence so they share one fixed width.
max_sequence_length = max(map(len, sequences_train))
padding_options = {'maxlen': max_sequence_length, 'padding': 'post', 'truncating': 'post'}
X_train_padded = pad_sequences(sequences_train, **padding_options)
X_test_padded = pad_sequences(sequences_test, **padding_options)
X_submission_padded = pad_sequences(sequences_submission, **padding_options)

# Location of the pre-trained GloVe vectors and their dimensionality
# (embedding_dim must match the file that was downloaded).
glove_path = '/content/glove.6B.50d.txt' # '/content/gdrive/MyDrive/glove.6B.50d.txt' # Replace with the path to your downloaded GloVe file
embedding_dim = 50

# Parse the file into {word: float32 vector}.  Each line is split on
# whitespace: the first field is the word, the remaining fields its vector.
embedding_index = {}
with open(glove_path, encoding='utf-8') as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        embedding_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

# Lookup table with one row per tokenizer id, plus one extra row so that the
# largest id is a valid row index.  Rows start as zeros and are overwritten
# with the GloVe vector when the word is present in embedding_index.
vocab_size = 1 + len(tokenizer.word_index)
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for token, row in tokenizer.word_index.items():
    glove_vector = embedding_index.get(token)
    if glove_vector is None:
        continue  # word not in the GloVe file: its row stays all-zero
    embedding_matrix[row] = glove_vector

# Replace every token id in the padded sequences by its embedding row, which
# appends one trailing axis of length embedding_dim to each padded array.
X_word_embeddings = np.take(embedding_matrix, X_train_padded, axis=0)
X_test_embeddings = np.take(embedding_matrix, X_test_padded, axis=0)
X_submission_word_embeddings = np.take(embedding_matrix, X_submission_padded, axis=0)

In [None]:
# Carve a 20% validation split out of the training data.  The TF-IDF matrix,
# the embedded sequences and the labels are split in one call so their rows
# stay aligned; the training-side names are rebound to the reduced split.
(
    X_train_tfidf, X_val_tfidf,
    X_train_word_embeddings, X_val_word_embeddings,
    y_train, y_val,
) = train_test_split(X_train_tfidf, X_word_embeddings, y_train, random_state=42, test_size=0.2)

In [None]:
# Stand-alone sequence encoder: an input of shape
# (max_sequence_length, embedding_dim) fed through a 64-unit LSTM.
# NOTE(review): none of the later cells shown here reference
# word_embeddings_model; confirm whether it is still needed.
sequence_shape = (max_sequence_length, embedding_dim)
word_embeddings_input = Input(shape=sequence_shape)
lstm_encoder = LSTM(64)
word_embeddings_output = lstm_encoder(word_embeddings_input)
word_embeddings_model = Model(inputs=word_embeddings_input, outputs=word_embeddings_output)


In [None]:
# Collapse everything after the first axis of each embedded array into a
# single feature axis, giving 2-D arrays that can be concatenated column-wise
# with the TF-IDF features.
def _flatten_per_sample(embedded):
    """Return *embedded* reshaped to ``(embedded.shape[0], -1)``."""
    return embedded.reshape((embedded.shape[0], -1))


X_train_word_embeddings_flat = _flatten_per_sample(X_train_word_embeddings)
X_val_word_embeddings_flat = _flatten_per_sample(X_val_word_embeddings)
X_test_word_embeddings_flat = _flatten_per_sample(X_test_embeddings)
X_submission_embeddings_flat = _flatten_per_sample(X_submission_word_embeddings)

In [None]:
# Densify the sparse TF-IDF matrices so they can be joined with NumPy arrays.
X_train_tfidf_dense = X_train_tfidf.toarray()
X_test_tfidf_dense = X_test_tfidf.toarray()
X_submission_tfidf_dense = X_submission_tfidf.toarray()

# Final feature matrices: TF-IDF columns followed by the flattened word
# embeddings, one row per tweet.
X_train_combined = np.hstack((X_train_tfidf_dense, X_train_word_embeddings_flat))
X_test_combined = np.hstack((X_test_tfidf_dense, X_test_word_embeddings_flat))
X_submission_combined = np.hstack((X_submission_tfidf_dense, X_submission_embeddings_flat))
# Feed-forward classifier on the combined features: three hidden stages of
# Dense(256, relu) followed by Dropout, then a single sigmoid output unit.
# The dropout rate is 0 in every stage, so no units are dropped as configured.
model_input = Input(shape=(X_train_combined.shape[1],))
dropout_layer = model_input
for _ in range(3):
    dense_layer = Dense(256, activation='relu')(dropout_layer)
    dropout_layer = Dropout(0)(dense_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)
model = Model(inputs=model_input, outputs=output_layer)

In [None]:
def linear_decay(epoch):
    """Return the learning rate for *epoch* under a linear decay schedule.

    The rate starts at 0.02, decreases by 0.002 for every epoch and is never
    allowed to fall below 0.0001.  Adjust the constants to taste.
    """
    start_rate, per_epoch_drop, floor_rate = 0.02, 0.002, 0.0001
    scheduled_rate = start_rate - epoch * per_epoch_drop
    return max(scheduled_rate, floor_rate)

In [None]:
from tensorflow.keras.optimizers.schedules import ExponentialDecay

# Learning-rate schedule.  Per the Keras documentation the rate follows
#   initial_learning_rate * decay_rate ** (step / decay_steps)
# where `step` counts optimizer updates (batches), not epochs.
# NOTE(review): with decay_steps=10 the rate shrinks by 10% every 10 batches,
# which becomes very small within a few epochs -- confirm this is intended.
initial_learning_rate = 0.01
decay_steps = 10  # Adjust as needed
decay_rate = 0.9  # Adjust as needed
staircase = False  # If True, the learning rate will decay in discrete steps

lr_scheduler = ExponentialDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=decay_steps,
    decay_rate=decay_rate,
    staircase=staircase
)

optimizer = Adam(learning_rate=lr_scheduler)

# Compile for binary classification.
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Validate on the split that was explicitly held out by the earlier
# train/validation split (X_val_tfidf, X_val_word_embeddings_flat, y_val).
# Using validation_split=0.2 here instead would ignore that split and carve a
# further 20% off the already reduced training set.
X_val_combined = np.concatenate((X_val_tfidf.toarray(), X_val_word_embeddings_flat), axis=1)
model.fit(
    X_train_combined,
    y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_val_combined, y_val),
)


In [None]:
# Show the model's layer-by-layer summary.
model.summary()

In [None]:
# Predict on the held-out test split (X_test_combined).
y_pred_test = model.predict(X_test_combined)

# Outputs strictly above the threshold become class 1, everything else class 0.
threshold = 0.5
y_pred_binary = np.greater(y_pred_test, threshold).astype(int)

# Report the F1 score of the thresholded predictions against the test labels.
f1 = f1_score(y_test, y_pred_binary)
print("F1 Score:", f1)

In [None]:
# Load the sample submission file; its `target` column is overwritten with
# the model's predictions in a later cell.
sample_submission = pd.read_csv("/content/sample_submission.csv")

In [None]:
# Sanity check: print the indices of the non-zero features in the first
# row of the combined submission feature matrix.
print(X_submission_combined[0].nonzero())


In [None]:
# Print the first ten submission tweets for visual inspection.
print(X_submission[0:10])

In [None]:
# Predict on the submission tweets (X_submission_combined).
y_pred_submission = model.predict(X_submission_combined)

# Outputs strictly above the threshold become class 1, everything else class 0.
# Note that this rebinds y_pred_binary, which previously held the test-split
# predictions.
threshold = 0.5
y_pred_binary = np.greater(y_pred_submission, threshold).astype(int)

In [None]:
# Print the first ten thresholded predictions currently held in y_pred_binary.
print(y_pred_binary[0:10])

In [None]:
# Overwrite the `target` column of the submission frame with the thresholded
# predictions.
# NOTE(review): y_pred_binary is also used for the test-split predictions in
# an earlier cell; this relies on the submission-prediction cell having been
# run last -- confirm cell execution order.
sample_submission["target"] = y_pred_binary

In [None]:
# Preview the first rows of the submission frame before writing it out.
sample_submission.head()

In [None]:
# Write the submission to /content/submission.csv; index=False leaves the
# DataFrame index out of the file.
sample_submission.to_csv("/content/submission.csv", index=False)