In [1]:
# Install required packages
!pip install numpy pandas matplotlib seaborn scikit-learn tensorflow




In [2]:
## Setup and Imports

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder

import urllib.request
import zipfile
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Load GloVe embeddings

def load_glove_embeddings(glove_file="../glove.6B.100d.txt"):
    """Load GloVe embeddings into a dictionary"""
    embeddings_index = {}
    
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    
    print(f'Found {len(embeddings_index)} word vectors.')
    return embeddings_index

embeddings_index = load_glove_embeddings()

Found 400000 word vectors.


In [4]:
# Load TCFD dataset for training and testing
train_df = pd.read_json('../data/train.json1', lines=True)
test_df = pd.read_csv("../data/test.csv")
print(f"Dataset shape: {train_df.shape}")
print(f"Label distribution:\n{train_df['label'].value_counts()}")

Dataset shape: (1300, 2)
Label distribution:
label
2    519
0    300
1    255
3    164
4     62
Name: count, dtype: int64


In [5]:
# Text Preprocessing

def preprocess_text(text):
    """Clean and preprocess text"""
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text

train_df['text_clean'] = train_df['text'].apply(preprocess_text)
test_df['text_clean'] = test_df['text'].apply(preprocess_text)

# Encode labels and fit on training data
label_encoder = LabelEncoder()
train_df['label_encoded'] = label_encoder.fit_transform(train_df['label'])
test_df['label_encoded'] = label_encoder.transform(test_df['label'])

print(f"Sample processed text: {train_df['text_clean'][0]}")
print(f"Corresponding label: {train_df['label'][0]} -> {train_df['label_encoded'][0]}")

Sample processed text: scope optional scope that includes indirect emissions associated with the goods and services supply chain produced outside the organization included are emissions from the transport of products from our logistics centres to stores downstream performed by external logistics operators air land and sea transport as well as the emissions associated with electricity consumption in franchise stores
Corresponding label: 1 -> 1


In [6]:
# Tokenization and Sequence Preparation

# Parameters
MAX_WORDS = 10000  # Maximum number of words in vocabulary
MAX_LEN = 500      # Maximum sequence length
EMBEDDING_DIM = 100  # GloVe embedding dimension

# Tokenize texts
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['text_clean'])

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_df['text_clean'])
test_sequences = tokenizer.texts_to_sequences(test_df['text_clean'])


# Pad sequences to same length
X_train_full = pad_sequences(train_sequences, maxlen=MAX_LEN, padding='post', truncating='post')
X_test = pad_sequences(test_sequences, maxlen=MAX_LEN, padding='post', truncating='post')

# Prepare labels
# y = to_categorical(df['label_encoded'])
y_train_full = train_df['label_encoded'].values
y_test = test_df['label_encoded'].values


print(f"Vocabulary size: {len(tokenizer.word_index)}")
print(f"Training data shape: {X_train_full.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Training labels shape: {y_train_full.shape}")
print(f"Test labels shape: {y_test.shape}")

# Split training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42, stratify=y_train_full
)

print(f"Final training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")

Vocabulary size: 7963
Training data shape: (1300, 500)
Test data shape: (200, 500)
Training labels shape: (1300,)
Test labels shape: (200,)
Final training set shape: (1040, 500)
Validation set shape: (260, 500)


In [7]:
## Create Embedding Matrix

def create_embedding_matrix(tokenizer, embeddings_index, max_words, embedding_dim):
    """Create embedding matrix from GloVe embeddings"""
    word_index = tokenizer.word_index
    num_words = min(max_words, len(word_index)) + 1
    embedding_matrix = np.zeros((num_words, embedding_dim))
    
    found_words = 0
    for word, i in word_index.items():
        if i < max_words:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                found_words += 1
    
    print(f"Found embeddings for {found_words}/{num_words-1} words")
    return embedding_matrix

# Create embedding matrix
embedding_matrix = create_embedding_matrix(
    tokenizer, embeddings_index, MAX_WORDS, EMBEDDING_DIM
)

print(f"Embedding matrix shape: {embedding_matrix.shape}")


Found embeddings for 7050/7963 words
Embedding matrix shape: (7964, 100)


In [8]:
## Build CNN Model

def build_cnn_model(vocab_size, max_len, embedding_dim, embedding_matrix, num_classes):
    """Build CNN model with pre-trained GloVe embeddings"""
    model = Sequential([
        # Embedding layer with pre-trained GloVe weights
        Embedding(
            input_dim=vocab_size,  # Use actual vocabulary size
            output_dim=embedding_dim,
            input_length=max_len,
            weights=[embedding_matrix],
            trainable=False # Freeze embeddings
        ),
        
        Dropout(0.2),
        
        Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'),
        MaxPooling1D(pool_size=3, padding='same'),
        # Dropout(0.3),  # Increased dropout
        
        Conv1D(filters=128, kernel_size=4, activation='relu', padding='same'),
        MaxPooling1D(pool_size=3, padding='same'),
        #Dropout(0.3),  # Add back dropout
        
        Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
        MaxPooling1D(pool_size=3, padding='same'),
        #Dropout(0.3),  # Add back dropout

        Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
        Dropout(0.2),
        MaxPooling1D(pool_size=3, padding='same'),

        # Another Conv + Dropout + Pool
        Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
        Dropout(0.2),
        MaxPooling1D(pool_size=3, padding='same'),
        
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),  # Higher dropout before final layer
        # Dense(64, activation='relu'),
        # Dropout(0.4),
        Dense(num_classes, activation='softmax')
    ])
    
    return model

# Build model
num_classes = len(np.unique(train_df['label_encoded']))
model = build_cnn_model(embedding_matrix.shape[0], MAX_LEN, EMBEDDING_DIM, embedding_matrix, num_classes)

# Compile model
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Display model architecture
model.summary()

print("Tokenizer vocab size:", len(tokenizer.word_index))
print("Embedding matrix shape:", embedding_matrix.shape)
#print("Max token in sequences:", np.max(X_padded))
print("Number of classes:", num_classes)
# print("Unique classes:", np.unique(df['label_encoded']))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 100)          796400    
                                                                 
 dropout (Dropout)           (None, 500, 100)          0         
                                                                 
 conv1d (Conv1D)             (None, 500, 128)          38528     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 167, 128)         0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 167, 128)          65664     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 56, 128)          0         
 1D)                                                    

In [18]:
## Train the Model

from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights_dict = dict(enumerate(class_weights))

from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=8,
    epochs=40,
    class_weight=class_weights_dict,
    verbose=1,
)

Training set shape: (1040, 500)
Test set shape: (200, 500)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [10]:
print("Class distribution:")
print(pd.Series(y_train).value_counts(normalize=True))
print("\nClass weights:", class_weights_dict)

Class distribution:
2    0.399038
0    0.230769
1    0.196154
3    0.125962
4    0.048077
Name: proportion, dtype: float64

Class weights: {0: 0.8666666666666667, 1: 1.0196078431372548, 2: 0.5012048192771085, 3: 1.5877862595419847, 4: 4.16}


In [19]:
## Evaluate the Model

# Evaluate on test set
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"\nTest Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Make predictions
y_pred = model.predict(X_test, verbose=0)
y_pred_classes = np.argmax(y_pred, axis=1)

# Calculate additional metrics
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score

accuracy = accuracy_score(y_test, y_pred_classes)
f1_macro = f1_score(y_test, y_pred_classes, average='macro')
f1_weighted = f1_score(y_test, y_pred_classes, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Macro: {f1_macro:.4f}")
print(f"F1 Weighted: {f1_weighted:.4f}")



Test Loss: 1.6331
Test Accuracy: 0.5700
Accuracy: 0.5700
F1 Macro: 0.4590
F1 Weighted: 0.5639


In [12]:
# save the trained model
model.save('./cnn_model/cnn_model.h5')

# tokenizer
import pickle
with open('./cnn_model/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

# embedding matrix
np.save('./cnn_model/embedding_matrix.npy', embedding_matrix)

# model parameters/config
model_config = {
    'MAX_LEN': MAX_LEN,
    'EMBEDDING_DIM': EMBEDDING_DIM,
    'vocab_size': embedding_matrix.shape[0],
    'num_classes': num_classes
}

with open('./cnn_model/model_config.pkl', 'wb') as f:
    pickle.dump(model_config, f)

In [13]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Example: Predict on dev or test set
# y_true = label_encoder.transform(test_df['label'])  # Actual labels
# y_pred_proba = model.predict(X_dev)                # Raw model predictions
# y_pred = np.argmax(y_pred_proba, axis=1)           # Convert to label indices

# Confusion matrix plotting function
def plot_conf_matrix(y_true, y_pred, class_names):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.tight_layout()
    plt.show()

# Call the function with class labels
class_names = label_encoder.classes_
plot_conf_matrix(y_true, y_pred_classes, class_names)


NameError: name 'y_true' is not defined