# Data analysis for the jeopardy dataset

## Read and process the data

### Import necessary modules

In [40]:
import json
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

In [2]:
path = r"data/acl_titles_and_abstracts.txt"
abstractTitles = open(path, "r").read()
abstractTitles



In [4]:
papers = [paper.split("\n") for paper in abstractTitles.split("\n\n")]

In [5]:
f"Number of papers: {len(papers)}"

'Number of papers: 10875'

In [24]:
papers[0]

In [30]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Attention
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Example data
examples = papers

# Split examples into titles and abstracts
titles, abstracts = zip(*examples)

### Step 1: Text Vectorization with Tokenizer
# Initialize the tokenizer and fit on the combined text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(titles + abstracts)

# Convert text to integer sequences
abstract_sequences = tokenizer.texts_to_sequences(abstracts)
title_sequences = tokenizer.texts_to_sequences(titles)

# Set max lengths for padding
max_abstract_len = max(len(seq) for seq in abstract_sequences)
max_title_len = max(len(seq) for seq in title_sequences)

# Pad sequences to ensure uniform shape
abstract_sequences = pad_sequences(abstract_sequences, maxlen=max_abstract_len, padding='post')
title_sequences = pad_sequences(title_sequences, maxlen=max_title_len, padding='post')

# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token

# Split title_sequences into inputs and outputs for teacher forcing
title_inputs = title_sequences[:, :-1]
title_outputs = title_sequences[:, 1:]

### Step 2: Define Seq2Seq Model with Attention
embedding_dim = 128
lstm_units = 128


In [22]:
%matplotlib qt
def createHistPlot(s_lens, c_lens, title = "Title"):
    fig, (chars_ax, words_ax) = plt.subplots(2)
    words_ax.hist(s_lens, bins=100)
    words_ax.set_xlabel(f"Length of {title}s (words)")
    words_ax.set_title(f"Distribution of {title} lengths in words")
    median = np.median(s_lens)
    words_ax.axvline(median, c="red", linewidth=3, label=f"Average length: {median}")
    max_len = np.max(s_lens)
    words_ax.axvline(max_len, c="red", linewidth=3, label=f"Max length: {max_len}")
    words_ax.set_ylabel("Number of samples")
    words_ax.legend()

    chars_ax.hist(c_lens, bins=100)
    chars_ax.set_title(f"Distribution of {title} lengths in chars")
    median = np.median(c_lens)
    chars_ax.axvline(np.median(c_lens),  c="red", linewidth=3, label=f"Average length: {median}")
    max_len = np.max(c_lens)
    chars_ax.axvline(max_len, c="red", linewidth=3, label=f"Max length: {max_len}")
    chars_ax.set_xlabel(f"Length of {title} (chars)")
    chars_ax.set_ylabel("Number of samples")
    chars_ax.legend()
    fig.tight_layout()
    plt.show()

sample_lens = np.asarray([len(s.split()) for s in titles])
sample_lens_chars = np.asarray([len(s) for s in titles])
createHistPlot(sample_lens, sample_lens_chars)

sample_lens = np.asarray([len(s.split()) for s in abstracts])
sample_lens_chars = np.asarray([len(s) for s in abstracts])
createHistPlot(sample_lens, sample_lens_chars, title="Abstract")

# Model

In [41]:
# Encoder
class Encoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, lstm_units):
        super(Encoder, self).__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.lstm = LSTM(lstm_units, return_sequences=True, return_state=True)

    def call(self, x):
        x = self.embedding(x)
        output, state_h, state_c = self.lstm(x)
        return output, state_h, state_c

# Decoder with Attention
class Decoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, lstm_units):
        super(Decoder, self).__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
        self.attention = Attention()
        self.dense = Dense(vocab_size, activation='softmax')

    def call(self, x, encoder_output, state):
        x = self.embedding(x)
        context_vector = self.attention([x, encoder_output])
        x_and_context = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state_h, state_c = self.lstm(x_and_context, initial_state=state)
        return self.dense(output), state_h, state_c

# Define the full Seq2Seq model
class Seq2SeqModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, lstm_units):
        super(Seq2SeqModel, self).__init__()
        self.encoder = Encoder(vocab_size, embedding_dim, lstm_units)
        self.decoder = Decoder(vocab_size, embedding_dim, lstm_units)

    def call(self, encoder_input, decoder_input):
        encoder_output, state_h, state_c = self.encoder(encoder_input)
        decoder_output, _, _ = self.decoder(decoder_input, encoder_output, [state_h, state_c])
        return decoder_output


# Instantiate model
model = Seq2SeqModel(vocab_size, embedding_dim, lstm_units)

### Step 3: Compile and Train the Model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model()

In [32]:
model(abstract_sequences[:1], title_inputs[:1])

In [33]:
### Step 4: Inference (Generating Titles)
def generate_title(abstract):
    abstract_seq = tokenizer.texts_to_sequences([abstract])
    abstract_seq = pad_sequences(abstract_seq, maxlen=max_abstract_len, padding='post')
    decoder_input = np.zeros((1, 1))
    decoder_input[0, 0] = tokenizer.word_index['<start>']  # Assuming <start> is the starting token

    title = []
    state_h, state_c = None, None
    for _ in range(max_title_len):
        predictions, state_h, state_c = model.decoder(decoder_input, encoder_output, [state_h, state_c])
        predicted_id = np.argmax(predictions[0, -1, :])
        if predicted_id == tokenizer.word_index['<end>']:
            break
        title.append(tokenizer.index_word[predicted_id])
        decoder_input = np.array([[predicted_id]])

    return ' '.join(title)

# Training function
def train_step(encoder_input, decoder_input, target_output):
    with tf.GradientTape() as tape:
        predictions = model(encoder_input, decoder_input)
        loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(target_output, predictions))
    gradients = tape.gradient(loss, model.trainable_variables)
    model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

In [34]:
# Instantiate model
model = Seq2SeqModel(vocab_size, embedding_dim, lstm_units)

### Step 3: Compile and Train the Model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Prepare dataset
batch_size = 2
dataset = tf.data.Dataset.from_tensor_slices((abstract_sequences, title_inputs, title_outputs))
dataset = dataset.batch(batch_size)


# Training loop
epochs = 5
for epoch in range(epochs):
    total_loss = 0
    for encoder_input, decoder_input, target_output in dataset:
        batch_loss = train_step(encoder_input, decoder_input, target_output)
        total_loss += batch_loss
    print(f"Epoch {epoch + 1}, Loss: {total_loss.numpy()}")



# Example usage
abstract_example = "we propose a multi-step system for the analysis of childrens stories ..."
print("Generated title:", generate_title(abstract_example))


In [35]:
import tensorflow as tf
import numpy as np

# Dummy Data
abstracts = ["This is a dummy abstract about neural networks.",
             "Another abstract on machine learning techniques.",
             "Deep learning advances in computer vision."]

# Update the titles with start and end tokens
titles = ["<start> Neural Networks Overview <end>",
          "<start> Machine Learning Techniques <end>",
          "<start> Advances in Computer Vision <end>"]

# Parameters
vocab_size = 1000  # Just for demonstration
embedding_dim = 64
units = 128
batch_size = 2
max_input_len = 10  # Max length for abstracts
max_output_len = 5  # Max length for titles

# Tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token='<unk>')
tokenizer.fit_on_texts(abstracts + titles)
abstracts_seq = tokenizer.texts_to_sequences(abstracts)
titles_seq = tokenizer.texts_to_sequences(titles)
abstracts_padded = tf.keras.preprocessing.sequence.pad_sequences(abstracts_seq, maxlen=max_input_len, padding='post')
titles_padded = tf.keras.preprocessing.sequence.pad_sequences(titles_seq, maxlen=max_output_len, padding='post')

# Dataset
dataset = tf.data.Dataset.from_tensor_slices((abstracts_padded, titles_padded)).batch(batch_size)

# Encoder
class Encoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, enc_units):
        super(Encoder, self).__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(enc_units, return_sequences=True, return_state=True)

    def call(self, x):
        x = self.embedding(x)
        output, h_state, c_state = self.lstm(x)
        return output, h_state  # We only return the hidden state, but you can also return the cell state if needed.

# Attention Mechanism
class Attention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(Attention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, enc_output, dec_hidden):
        # Expand dec_hidden to match enc_output
        dec_hidden_with_time_axis = tf.expand_dims(dec_hidden, 1)
        score = self.V(tf.nn.tanh(self.W1(enc_output) + self.W2(dec_hidden_with_time_axis)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * enc_output
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

# Decoder
class Decoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, dec_units):
        super(Decoder, self).__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(dec_units, return_sequences=True, return_state=True)
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = Attention(dec_units)

    def call(self, x, enc_output, hidden):
        context_vector, _ = self.attention(enc_output, hidden)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.lstm(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state

# Model
encoder = Encoder(vocab_size, embedding_dim, units)
decoder = Decoder(vocab_size, embedding_dim, units)

# Optimizer and Loss
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

# Loss Function
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss.dtype)
    loss *= mask
    return tf.reduce_mean(loss)

# Training Step
@tf.function
def train_step(input, target, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(input)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([tokenizer.word_index['start']] * batch_size, 1)

        for t in range(1, target.shape[1]):
            predictions, dec_hidden = decoder(dec_input, enc_output, dec_hidden)
            loss += loss_function(target[:, t], predictions)
            dec_input = tf.expand_dims(target[:, t], 1)

    batch_loss = (loss / int(target.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

# Training
EPOCHS = 10
for epoch in range(EPOCHS):
    enc_hidden = tf.zeros((batch_size, units))
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

    print(f'Epoch {epoch+1} Loss {total_loss / batch_size}')

# Inference (Generating a title for a new abstract)
def evaluate(abstract):
    abstract_seq = tokenizer.texts_to_sequences([abstract])
    abstract_padded = tf.keras.preprocessing.sequence.pad_sequences(abstract_seq, maxlen=max_input_len, padding='post')
    enc_output, enc_hidden = encoder(abstract_padded)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([tokenizer.word_index['start']], 0)
    result = []

    for t in range(max_output_len):
        predictions, dec_hidden = decoder(dec_input, enc_output, dec_hidden)
        predicted_id = tf.argmax(predictions[0]).numpy()
        result.append(tokenizer.index_word[predicted_id])
        if tokenizer.index_word[predicted_id] == '<end>':
            break
        dec_input = tf.expand_dims([predicted_id], 0)

    return ' '.join(result)

# Test the model with a sample abstract
sample_abstract = "A new abstract on neural networks and deep learning."
print("Generated Title:", evaluate(sample_abstract))


In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Example data
examples = papers

# Split examples into titles and abstracts
titles, abstracts = zip(*examples)

# Example lists of text
list_a = titles
list_b = abstracts

# Combine lists for TF-IDF processing
all_text = list_a + list_b

# Generate TF-IDF features
vectorizer = TfidfVectorizer(max_features=10)  # Adjust max_features as needed
tfidf_matrix = vectorizer.fit_transform(all_text)

# Split the TF-IDF vectors for each list
tfidf_a = tfidf_matrix[:len(list_a)]
tfidf_b = tfidf_matrix[len(list_a):]

# Calculate cosine similarity between corresponding elements in list A and B
similarities = [cosine_similarity(tfidf_a[i], tfidf_b[i])[0, 0] for i in range(len(list_a))]

# Calculate correlation between similarities
correlation = np.corrcoef(similarities)
print("Correlation between list A and B features:", correlation)


In [38]:
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import numpy as np

from sentence_transformers import SentenceTransformer
# Step 1: Encode text samples using Sentence-BERT
model = SentenceTransformer('all-MiniLM-L6-v2')  # Use a lightweight model


In [43]:
examples = papers

# Split examples into titles and abstracts
titles, abstracts = zip(*examples)



'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.



In [134]:
embeddings1 = model.encode(titles)

In [135]:
embeddings2 = model.encode(abstracts)

In [136]:
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import numpy as np

# Calculate semantic similarity (using cosine similarity)
semantic_similarities = [cosine_similarity([emb1], [emb2])[0][0] for emb1, emb2 in zip(embeddings1, embeddings2)]



In [148]:
plt.hist(semantic_similarities)
plt.title("Cosine Similarity of Embeddings of abstracts and titles")
plt.ylabel("Occurence")
plt.xlabel("Cosine Similarity")
sim_mean = int(round(np.mean(semantic_similarities), 3)*100)/100
plt.axvline(sim_mean, c="red", label=f"Average Similarity between Abstract and Title {sim_mean}")
plt.legend()
plt.show()

In [166]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.express as px

# Step 2: Dimensionality Reduction with PCA and t-SNE
pca = PCA(n_components=10)
n = 10
pca_result_1 = pca.fit_transform(embeddings1[:n])
tsne = TSNE(n_components=2, perplexity=5, n_iter=300)
tsne_result_1 = tsne.fit_transform(pca_result_1)
pca_result_2 = pca.fit_transform(embeddings2[:n])
tsne = TSNE(n_components=2, perplexity=5, n_iter=300)
tsne_result_2 = tsne.fit_transform(pca_result_2)
labels = ["titles"]*n+["abstracts"]*n
# Step 3: Create interactive plot with Plotly
fig = px.scatter(
    x=np.concatenate((tsne_result_1[:, 0],tsne_result_2[:, 0])), y=np.concatenate((tsne_result_1[:, 1],tsne_result_2[:, 1])),
    color=labels,
    hover_name=titles[:n]+abstracts[:n]  # Text displayed on hover
)

fig.update_layout(
    title="Embedding",
    xaxis_title="TSNE Dimension 1",
    yaxis_title="TSNE Dimension 2"
)

fig.show()


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.



array([  4.841934  ,   1.4372314 , -13.154844  ,  -0.3446792 ,
        -0.07003962,   0.581297  ,  -8.812213  ,   3.2655282 ,
        -2.5088139 ,  -2.6026568 ,   0.41747054,  -9.57238   ,
        15.021317  ,  -7.1529474 ,   4.0766363 ,   3.591648  ,
        -2.7937603 , -10.702146  ,   4.455477  ,   9.466564  ,
        -4.194246  ,  -3.6975946 , -10.669995  ,  11.755438  ,
       -15.740773  ,  -2.6768377 ,   9.646371  ,   1.652284  ,
        -4.072566  ,   0.5673855 ,   4.4434657 ,   0.86689055,
         4.104611  ,  10.809399  , -15.1406145 ,  -8.012122  ,
         5.175685  ,  -0.43424255,  -7.10845   ,   3.5976439 ,
        -2.3248148 ,  -4.273344  , -10.470519  ,  -4.485003  ,
        10.32735   ,   4.436664  ,   0.20331824,  -9.36802   ,
         4.5803266 , -10.513613  ,   5.2114086 ,  -9.371864  ,
         4.185111  ,  -6.083634  ,   5.1272583 ,  -2.901036  ,
         5.350857  ,  -4.7458444 ,  -7.8018622 ,  -6.758852  ,
         6.982871  ,  -7.18122   ,   3.9430537 ,  -2.52

In [8]:
from typing import List
from sklearn.feature_extraction.text import CountVectorizer

def normalized_word_overlap(list_a: List[str], list_b: List[str]) -> List[float]:
    overlaps = []
    for text_a, text_b in zip(list_a, list_b):
        # Tokenize the words
        words_a = set(text_a.lower().split())
        words_b = text_b.lower().split()

        # Calculate the overlap
        overlap_count = sum(1 for word in words_b if word in words_a)

        # Normalize by the total number of words in text_b
        if len(words_b) > 0:
            normalized_overlap = overlap_count / len(words_b)
        else:
            normalized_overlap = 0  # Avoid division by zero if text_b is empty

        overlaps.append(normalized_overlap)

    return overlaps


# Example data
examples = papers

# Split examples into titles and abstracts
titles, abstracts = zip(*examples)

# Example lists of text
list_a = titles
list_b = abstracts

overlaps = normalized_word_overlap(list_a, list_b)
print("Normalized word overlaps:", overlaps)


Normalized word overlaps: [0.16058394160583941, 0.16470588235294117, 0.03571428571428571, 0.12, 0.1897810218978102, 0.13934426229508196, 0.05172413793103448, 0.19811320754716982, 0.265625, 0.25806451612903225, 0.1, 0.09210526315789473, 0.21, 0.16455696202531644, 0.02, 0.13768115942028986, 0.11392405063291139, 0.072992700729927, 0.09375, 0.15126050420168066, 0.07627118644067797, 0.20915032679738563, 0.16417910447761194, 0.14285714285714285, 0.09411764705882353, 0.20454545454545456, 0.1095890410958904, 0.14184397163120568, 0.19811320754716982, 0.19696969696969696, 0.26136363636363635, 0.045454545454545456, 0.175, 0.20833333333333334, 0.11920529801324503, 0.32075471698113206, 0.05747126436781609, 0.145985401459854, 0.21904761904761905, 0.18487394957983194, 0.07203389830508475, 0.015384615384615385, 0.06896551724137931, 0.12280701754385964, 0.19148936170212766, 0.04081632653061224, 0.08620689655172414, 0.1111111111111111, 0.1724137931034483, 0.125, 0.13043478260869565, 0.21978021978021978,

In [16]:
overlaps = normalized_word_overlap(list_b, list_a)
plt.suptitle("Proportion of words in title also present in abstract")
plt.hist(overlaps, bins=20)
plt.show()

In [120]:
titles[np.argmax(overlaps)], abstracts[np.argmax(overlaps)], np.max(overlaps)

('morpho-syntactic clues for terminological processing in serbian',
 'in this paper we discuss morpho-syntactic clues that can be used to facilitate terminological processing in serbian . a method ( called srce ) for automatic extraction of multiword terms is presented . the approach incorporates a set of generic morpho-syntactic filters for recognition of term candidates , a method for conflation of morphological variants and a module for foreign word recognition . morpho-syntactic filters describe general term formation patterns , and are implemented as generic regular expressions . the inner structure together with the agreements within term candidates are used as clues to discover the boundaries of nested terms . the results of the terminological processing of a textbook corpus in the domains of mathematics and computer science are presented .',
 1.0)