In [1]:
# Data manipulation and analysis
import numpy as np
import pandas as pd

# Visualization libraries
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
%matplotlib inline

# Text processing and visualization
import re
from wordcloud import WordCloud
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Deep learning framework
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, GRU, Bidirectional
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Machine learning utilities
from sklearn.model_selection import train_test_split

# Model training utilities
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

# Additional utilities
from collections import Counter
import missingno as msno

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")


ModuleNotFoundError: No module named 'plotly'

In [2]:
# Labelled tweets used to fit the models
training_data = pd.read_csv(filepath_or_buffer="training_set.csv")

# Tweets the fitted models are asked to predict on
evaluation_set = pd.read_csv(filepath_or_buffer="evaluation_set.csv")


FileNotFoundError: [Errno 2] File train.csv does not exist: 'train.csv'

In [None]:
# Each entry pairs a printed heading with a deferred view of one dataset.
# The views are built lazily so that every heading is printed right before its table.
overview_sections = (
    ('Evaluation Dataset Statistics:', lambda: evaluation_set.describe(include='all')),
    ('Training Dataset Preview:', lambda: training_data.head()),
    ('Training Dataset Statistics:', lambda: training_data.describe(include='all')),
    ('Evaluation Dataset Preview:', lambda: evaluation_set.head()),
)

for section_heading, build_view in overview_sections:
    print(section_heading)
    display(build_view())


In [None]:
# Count missing values per column in the training dataset
print('Null values in Train set')
display(training_data.isna().sum())

# Count missing values per column in the evaluation dataset.
# The heading names the evaluation set because that is the frame being summarised here.
print('Null values in Evaluation set')
display(evaluation_set.isna().sum())


In [None]:
# Square figure so the donut is drawn as a circle
fig, ax = plt.subplots(figsize=(8, 8))

# Number of rows per outcome label, most frequent first
outcome_counts = training_data['label'].value_counts()
colors = ['#ff9999','#66b3ff','#99ff99','#ffcc99']

# Pull only the first (most frequent) slice outwards.
# Matplotlib requires one explode entry per wedge, so the tuple is sized from the
# data instead of assuming exactly four classes.
explode = tuple(0.05 if position == 0 else 0 for position in range(len(outcome_counts)))

outcome_counts.plot(kind='pie', ax=ax, autopct='%1.1f%%', startangle=90, colors=colors,
                    explode=explode, pctdistance=0.85, wedgeprops=dict(width=0.5))

# Customize the plot
ax.set_title('Distribution of Outcome Labels', fontsize=16)
ax.set_ylabel('')  # pandas labels the y-axis with the column name; hide it
ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle

# White disc in the middle turns the pie into a donut
center_circle = plt.Circle((0,0), 0.70, fc='white')
ax.add_artist(center_circle)

# Display the plot
plt.tight_layout()
plt.show()


In [None]:
def visualize_frequent_terms(dataframe, dataset_type, color_scheme, top_n=20):
    """Draw a horizontal bar chart of the most frequent values of the 'term' column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing a 'term' column.
    dataset_type : str
        Name of the dataset, inserted into the plot title.
    color_scheme : str or palette
        Passed to seaborn as the bar ``palette``.
    top_n : int, default 20
        Maximum number of terms to show.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Set the plot style
    sns.set_style('darkgrid')

    # Create the figure and axes
    fig, ax = plt.subplots(figsize=(10, 10))

    # value_counts() sorts by descending frequency; head() is always positional,
    # so the "top N" selection does not depend on the dtype of the term values.
    frequent_terms = dataframe['term'].value_counts().head(top_n)

    # Create a horizontal bar plot
    sns.barplot(x=frequent_terms, y=frequent_terms.index, palette=color_scheme, ax=ax)

    # Add value labels to the bars
    for container in ax.containers:
        ax.bar_label(container, padding=5)

    # Format x ticks with thousands separators through a formatter rather than
    # set_xticklabels(): fixed label strings go stale as soon as the tick
    # positions are recomputed (e.g. by tight_layout or a resize).
    ax.xaxis.set_major_formatter(lambda value, _position: f'{int(value):,}')
    ax.tick_params(axis='x', labelsize=9)

    # Only the font size of the y labels changes; the texts stay as seaborn set them
    ax.tick_params(axis='y', labelsize=11)

    # Set title and labels
    ax.set_title(f'Top {top_n} Frequent Terms in {dataset_type} Dataset', fontsize=16, pad=20)
    ax.set_xlabel('Frequency', fontsize=12)
    ax.set_ylabel('Terms', fontsize=12)

    # Adjust layout and display the plot
    plt.tight_layout()
    plt.show()

# One chart per dataset, each with its own palette
for frame, split_name, palette_name in (
    (training_data, 'Training', 'viridis'),
    (evaluation_set, 'Evaluation', 'magma'),
):
    visualize_frequent_terms(frame, split_name, palette_name)


In [None]:
# Built once when the cell runs: looking every token up in a freshly built
# stop-word list is needlessly slow, and set membership is O(1).
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()
# Filler tokens dropped as whole words only. "nt" is presumably what is left of
# the tokenizer's "n't" token once punctuation is stripped -- confirm.
_FILLER_TOKENS = frozenset({"like", "nt"})


def clean_tweet_text(text):
    """Normalise one tweet into a space-separated string of lemmatised words.

    Steps, in order: lower-case, strip URLs and HTML tags/entities, drop English
    stop words, strip punctuation and digits, replace anything that is not a-z
    with a space, lemmatise, then drop single-letter tokens and the filler
    tokens "like" and "nt".

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. NaN) yields "".

    Returns
    -------
    str
        The cleaned text with tokens separated by single spaces.
    """
    # Missing values reach this function as float NaN via Series.apply
    if not isinstance(text, str):
        return ""

    # Convert text to lowercase
    text = text.lower()

    # URLs and markup are removed first: their patterns rely on characters
    # (':', '/', '.', '<', '>', '&', ';') that the punctuation step deletes.
    text = re.sub(r"https?://\S+|www\.\S+", " ", text)
    # Entities are matched as &name; / &#123; so a stray '&' cannot swallow
    # everything up to the next ';'
    text = re.sub(r"<.*?>|&#?\w+;", " ", text)

    # Remove stop words
    text = " ".join(word for word in word_tokenize(text) if word not in _ENGLISH_STOP_WORDS)

    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Replace every remaining non-alphabetic character with a space
    text = re.sub(r"[^a-z]", " ", text)

    # Apply lemmatization
    lemmas = (_LEMMATIZER.lemmatize(word) for word in word_tokenize(text))

    # Drop single-character tokens and filler tokens. Matching whole tokens keeps
    # words that merely contain "nt" or "like" (e.g. "accident") intact.
    return " ".join(
        lemma for lemma in lemmas if len(lemma) > 1 and lemma not in _FILLER_TOKENS
    )

# Add a cleaned copy of the raw "text" column to both datasets
for frame in (training_data, evaluation_set):
    frame["processed_text"] = frame["text"].apply(clean_tweet_text)


In [None]:
# For each dataset: heading + first five rows, then heading + numeric summary
preview_sections = (
    ("Preview of Training Dataset:", "\nSummary Statistics of Training Dataset:", training_data),
    ("\nPreview of Evaluation Dataset:", "\nSummary Statistics of Evaluation Dataset:", evaluation_set),
)
for preview_heading, stats_heading, frame in preview_sections:
    print(preview_heading)
    display(frame.head(5))
    print(stats_heading)
    display(frame.describe())

# Column dtypes and non-null counts; info() prints its own report
info_sections = (
    ("\nTraining Dataset Info:", training_data),
    ("\nEvaluation Dataset Info:", evaluation_set),
)
for info_heading, frame in info_sections:
    print(info_heading)
    frame.info()


In [None]:
# Collect the cleaned words of each class from the training dataset.
# NOTE(review): assumes the class column is 'label' (the name used by the
# distribution plot and the train/validation split) with 0 = non-emergency and
# 1 = emergency -- confirm against the CSV header.
non_emergency = [
    word
    for tweet in training_data.loc[training_data['label'] == 0, 'processed_text'].dropna()
    for word in tweet.lower().strip().split()
]
emergency = [
    word
    for tweet in training_data.loc[training_data['label'] == 1, 'processed_text'].dropna()
    for word in tweet.lower().strip().split()
]

# WordCloud iterates over `stopwords`, so it needs a collection of words;
# a bare string such as "english" would be read as individual characters.
cloud_options = dict(
    background_color="white", max_font_size=400, width=500, height=500,
    stopwords=set(stopwords.words('english')), random_state=42, repeat=True,
)

# Set up the plot: one panel per class
fig, (normal_ax, crisis_ax) = plt.subplots(1, 2, figsize=(20, 20))

# Create and display non-emergency word cloud
normal_cloud = WordCloud(**cloud_options)
normal_cloud.generate(' '.join(non_emergency))
normal_ax.set_title("Tweets Unrelated to Emergencies", size=15)
normal_ax.imshow(normal_cloud)
normal_ax.axis("off")  # pixel coordinates carry no meaning for a word cloud

# Create and display emergency word cloud
crisis_cloud = WordCloud(**cloud_options)
crisis_cloud.generate(' '.join(emergency))
crisis_ax.set_title("Emergency-related Tweets", size=15)
crisis_ax.imshow(crisis_cloud)
crisis_ax.axis("off")

plt.show()


In [None]:
# Length of each raw tweet in characters.
# NOTE(review): the column is named 'word_count' but stores character counts;
# the name is kept as-is here -- rename it or switch to a real word count once
# the intended metric is confirmed.
# .str.len() yields NaN for missing text instead of raising like len() would.
training_data['word_count'] = training_data["text"].str.len()
evaluation_set['word_count'] = evaluation_set["text"].str.len()

# Display summary statistics of the lengths in the training set
training_data['word_count'].describe()


In [None]:
def word_frequency(text_series):
    """Tally whitespace-separated tokens over every entry of ``text_series``.

    Parameters
    ----------
    text_series : pandas.Series
        Series whose values are strings.

    Returns
    -------
    collections.Counter
        Maps each token to the number of times it occurs across all entries.
    """
    tally = Counter()
    for document in text_series.values:
        # Counter.update adds one count per token yielded by split()
        tally.update(document.split())
    return tally

# Calculate word frequencies in the cleaned text. The cleaning step stores its
# output in training_data["processed_text"], so that is the column counted here.
word_counts = word_frequency(training_data.processed_text)

# Determine the number of unique words
unique_word_count = len(word_counts)

# Display the count of unique words
unique_word_count


In [None]:
# Prepare features (cleaned text) and target variable from the training dataset
features = training_data.processed_text
target = training_data.label

# Hold out 15% of the rows for validation; the fixed seed keeps the split reproducible
features_train, features_val, target_train, target_val = train_test_split(
    features, target, test_size=0.15, random_state=42
)

# Convert to numpy arrays for compatibility with some ML libraries
features_train = np.array(features_train)
features_val = np.array(features_val)
target_train = np.array(target_train)
target_val = np.array(target_val)


In [None]:
# Every sequence is cut or padded to this many tokens
seq_max_len = 20

# Vocabulary is learned from the training split only and capped at unique_word_count
text_tokenizer = Tokenizer(num_words=unique_word_count)
text_tokenizer.fit_on_texts(features_train)

# Map each text to its list of integer word indices
train_seq, val_seq = (
    text_tokenizer.texts_to_sequences(texts) for texts in (features_train, features_val)
)

# Pad / truncate at the end so all sequences share the same length
train_padded_seq, val_padded_seq = (
    pad_sequences(sequences, maxlen=seq_max_len, padding="post", truncating="post")
    for sequences in (train_seq, val_seq)
)


In [None]:
# Check for available GPUs and configure memory growth
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Number of GPUs detected: ", len(gpu_devices))

# Enable dynamic memory allocation on every GPU: TensorFlow requires the
# memory-growth setting to be the same across all visible GPUs.
try:
    for gpu_device in gpu_devices:
        tf.config.experimental.set_memory_growth(gpu_device, True)
except RuntimeError as growth_error:
    # Raised when the GPUs were already initialised (e.g. the cell is re-run
    # after a model has been built); the existing setting then stays in effect.
    print("Could not change GPU memory growth: ", growth_error)


In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Assemble the network layer by layer:
# embedding -> two stacked bidirectional LSTMs -> LSTM -> dropout -> dense head
bi_lstm_model = keras.models.Sequential()
bi_lstm_model.add(Embedding(input_dim=unique_word_count, output_dim=32, input_length=seq_max_len))
bi_lstm_model.add(Bidirectional(LSTM(256, return_sequences=True, dropout=0.1)))
bi_lstm_model.add(Bidirectional(LSTM(256, return_sequences=True, dropout=0.1)))
bi_lstm_model.add(LSTM(256))
bi_lstm_model.add(Dropout(0.3))
bi_lstm_model.add(Dense(32, activation='relu'))
bi_lstm_model.add(Dense(1, activation='sigmoid'))  # single probability output

# Print the layer-by-layer summary
bi_lstm_model.summary()

# Binary classification set-up: Adam with a small learning rate, cross-entropy loss
adam_optimizer = Adam(learning_rate=0.0001)
bi_lstm_model.compile(
    optimizer=adam_optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Stop once validation loss has not improved for 3 epochs and roll back to the best weights
stop_early = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Fit on the padded training sequences, validating on the held-out split
training_history = bi_lstm_model.fit(
    train_padded_seq,
    target_train,
    validation_data=(val_padded_seq, target_val),
    batch_size=64,
    epochs=10,
    callbacks=[stop_early],
)


In [None]:
# One figure per metric: (train key, validation key, train label, validation label, title, y label)
lstm_curve_specs = (
    ('accuracy', 'val_accuracy', 'Train Accuracy', 'Validation Accuracy',
     'Model Accuracy Over Epochs', 'Accuracy'),
    ('loss', 'val_loss', 'Train Loss', 'Validation Loss',
     'Model Loss Over Epochs', 'Loss'),
)

for train_key, val_key, train_label, val_label, figure_title, y_label in lstm_curve_specs:
    plt.figure(figsize=(12, 6))
    plt.plot(training_history.history[train_key], label=train_label)
    plt.plot(training_history.history[val_key], label=val_label)
    plt.title(figure_title)
    plt.ylabel(y_label)
    plt.xlabel('Epochs')
    plt.legend(loc='upper left')
    plt.grid()
    plt.show()


In [None]:
# Layer stack: embedding -> two conv/pool stages -> flatten -> small dense head
cnn_layers = [
    Embedding(input_dim=unique_word_count, output_dim=128, input_length=seq_max_len),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(10, activation='relu'),
    Dropout(0.35),
    Dense(1, activation='sigmoid'),  # single probability output
]
cnn_classifier = Sequential(cnn_layers)

# Adam is reached through the `optimizers` module imported at the top of the notebook
adam_opt = optimizers.Adam(learning_rate=0.0001)
cnn_classifier.compile(
    optimizer=adam_opt,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer summary
cnn_classifier.summary()

# Stop once validation loss has not improved for 3 epochs and roll back to the best weights
halt_early = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Fit on the padded training sequences, validating on the held-out split
cnn_training_history = cnn_classifier.fit(
    train_padded_seq,
    target_train,
    validation_data=(val_padded_seq, target_val),
    batch_size=32,
    epochs=20,
    callbacks=[halt_early],
)


In [None]:
import matplotlib.pyplot as plt

# One figure per metric: (train key, validation key, train label, validation label, title, y label)
cnn_curve_specs = (
    ('accuracy', 'val_accuracy', 'Train Accuracy', 'Validation Accuracy',
     'CNN Model Accuracy Over Epochs', 'Accuracy'),
    ('loss', 'val_loss', 'Train Loss', 'Validation Loss',
     'CNN Model Loss Over Epochs', 'Loss'),
)

for train_key, val_key, train_label, val_label, figure_title, y_label in cnn_curve_specs:
    plt.figure(figsize=(12, 6))
    plt.plot(cnn_training_history.history[train_key], label=train_label)
    plt.plot(cnn_training_history.history[val_key], label=val_label)
    plt.title(figure_title)
    plt.ylabel(y_label)
    plt.xlabel('Epochs')
    plt.legend(loc='upper left')
    plt.grid(True)
    plt.show()


In [None]:
# Transform the cleaned evaluation text into numerical sequences with the
# tokenizer that was fitted on the training split
test_seq = text_tokenizer.texts_to_sequences(evaluation_set.processed_text)

# Ensure uniform sequence length through padding
test_padded_seq = pad_sequences(test_seq, maxlen=seq_max_len, padding="post", truncating="post")

# Generate predictions using the trained bidirectional LSTM model
prediction_probabilities = bi_lstm_model.predict(test_padded_seq)

# Threshold at 0.5 to get 0/1 labels. The model ends in Dense(1), so predict()
# yields one column per row; ravel() flattens it into a plain 1-D column.
predicted_classes = (prediction_probabilities > 0.5).astype(int).ravel()

# Append predicted labels to the evaluation dataset
evaluation_set['model_prediction'] = predicted_classes

evaluation_set.head()


In [None]:
# Preview the evaluation dataset (the frame loaded from evaluation_set.csv)
evaluation_set.head()

In [None]:
# Create a submission DataFrame: one row per evaluation tweet with its predicted class.
# pandas is already imported as `pd` in the notebook's import cell.
result_df = pd.DataFrame({
    'id': evaluation_set.id,
    'target': evaluation_set.model_prediction
})

# Save the results to a CSV file
result_df.to_csv('bi_lstm_results.csv', index=False)


In [None]:
# Transform the cleaned evaluation text into numerical sequences using the same tokenizer
test_sequences = text_tokenizer.texts_to_sequences(evaluation_set.processed_text)

# Pad the sequences to ensure uniform length
test_padded_seq = pad_sequences(test_sequences, maxlen=seq_max_len, padding="post", truncating="post")

# Generate predictions with the trained CNN (the model object is `cnn_classifier`)
cnn_predictions = cnn_classifier.predict(test_padded_seq)

# Threshold at 0.5 to get 0/1 labels. The model ends in Dense(1), so predict()
# yields one column per row; ravel() flattens it into a plain 1-D column.
predicted_classes = (cnn_predictions > 0.5).astype(int).ravel()

# Store the CNN labels; this replaces any earlier values in 'model_prediction'
evaluation_set['model_prediction'] = predicted_classes


In [None]:
# Create a DataFrame for submission: one row per evaluation tweet with its predicted class
submission_df = pd.DataFrame({
    'id': evaluation_set.id,
    'target': evaluation_set.model_prediction
})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('cnn_model_submission.csv', index=False)


In [None]:
from IPython.display import Image
# Render a PNG as this cell's output.
# NOTE(review): "/path/to/your/image/..." looks like a placeholder -- point it at
# the real screenshot (ideally via a path relative to the notebook) before running.
Image(filename="/path/to/your/image/Kaggle_score_NLP.png")
