# Upload and Preprocessing

In [1]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

path = 'proj-nlp-2024/data'
os.chdir(f'/content/drive/MyDrive/{path}')
os.getcwd()

'/content/drive/.shortcut-targets-by-id/1U14ChUV9tg2aE9-Vr4OWEIFYXGBvM5Vo/proj-nlp-2024/data'

#### Preproc Wiki

Loading the trainig set from the json file

In [None]:
import json
train_path = os.path.join(os.getcwd(), 'beerqa_train_v1.0.json')
with open(train_path, 'r') as file:
    train_dataset = json.load(file)

In [None]:
# train_dataset

Defining text preprocessing functions

In [None]:
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# TODO: Remove roman numbers and specific date stuff!
# TODO: Remove \n

# make all text lowercase
def text_lowercase(text):
    return text.lower()
# remove numbers
def remove_numbers(text):
    result = re.sub(r'\d+', '', text)
    return result
# remove punctuation
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)
# tokenize
def tokenize(text):
    text = word_tokenize(text)
    return text
# remove stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    text = [i for i in text if not i in stop_words]
    return text
# lemmatize
lemmatizer = WordNetLemmatizer()
def lemmatize(text):
    text = [lemmatizer.lemmatize(token) for token in text]
    return text

def preprocessing(text):
    text = text_lowercase(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = tokenize(text)
    text = remove_stopwords(text)
    text = lemmatize(text)
    # text = ' '.join(text) # Word2Vec wants a list of tokens
    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


## json to Dataframe

In [None]:
import pandas as pd
from pandas.core.common import flatten

keys = list(train_dataset.keys())
train_dataset = {key: train_dataset[key] for key in keys}
dataset = pd.DataFrame(train_dataset['data'])
dataset['context'] = dataset['context'].apply(lambda x: ' '.join(list(flatten(x)))) # TODO: Not differentiating between contexts

In [None]:
dataset.head()

Unnamed: 0,id,src,answers,question,context
0,8af07575b8444ae748634478f96b00d4e7dbd170,squad,[1793],When did Wordsworth initially attack Burke?,"Edmund Burke In the 19th century, Burke was pr..."
1,7e07be1a905b70f143b2c8aff00caf16e2b32b31,squad,[Investiture Controversy],The clash between Henry IV and the pope was pa...,"Middle Ages During the early High Middle Ages,..."
2,c9dfc01655bc5abc59a1ee2583c8b3f7fba6c2e2,squad,[the Thermidorian Reaction],What historical event brought about the fall o...,Napoleon Some contemporaries alleged that Bona...
3,cf10ffbb5023cd0bc4637655458512b2186dc886,squad,[Morales],Who came up with a policy for indigenous auton...,Indigenous peoples of the Americas Morales beg...
4,0a185aca81f9e36bfb2486f690885011a6aefb2e,hotpotqa,[Darren Lynn Bousman],"Greg Hoffman was working on ""Saw III,"" a 2006 ...",Saw III Saw III is a 2006 horror film directed...


Applying preprocessing to context columns

In [None]:
pp_text_test = [] # our preprocessed text column
for text_data in dataset['context']:
    pp_text_data = preprocessing(text_data)
    pp_text_test.append(pp_text_data)
dataset['pp_context'] = pp_text_test # add the preprocessed text as a column

In [None]:
dataset.head()

Unnamed: 0,id,src,answers,question,context,pp_context
0,8af07575b8444ae748634478f96b00d4e7dbd170,squad,[1793],When did Wordsworth initially attack Burke?,"Edmund Burke In the 19th century, Burke was pr...","[edmund, burke, th, century, burke, praised, l..."
1,7e07be1a905b70f143b2c8aff00caf16e2b32b31,squad,[Investiture Controversy],The clash between Henry IV and the pope was pa...,"Middle Ages During the early High Middle Ages,...","[middle, age, early, high, middle, age, german..."
2,c9dfc01655bc5abc59a1ee2583c8b3f7fba6c2e2,squad,[the Thermidorian Reaction],What historical event brought about the fall o...,Napoleon Some contemporaries alleged that Bona...,"[napoleon, contemporary, alleged, bonaparte, p..."
3,cf10ffbb5023cd0bc4637655458512b2186dc886,squad,[Morales],Who came up with a policy for indigenous auton...,Indigenous peoples of the Americas Morales beg...,"[indigenous, people, america, morale, began, w..."
4,0a185aca81f9e36bfb2486f690885011a6aefb2e,hotpotqa,[Darren Lynn Bousman],"Greg Hoffman was working on ""Saw III,"" a 2006 ...",Saw III Saw III is a 2006 horror film directed...,"[saw, iii, saw, iii, horror, film, directed, d..."


## Directly from json

In [None]:
from pandas.core.common import flatten

data_dict = train_dataset.get('data', {}) # Extracting the data dictionary
context_list = [list(flatten(entry.get('context', ''))) for entry in data_dict] # List of list of strings

In [None]:
context_list[:1]

[['Edmund Burke',
  'In the 19th century, Burke was praised by both liberals and conservatives. Burke\'s friend Philip Francis wrote that Burke "was a man who truly & prophetically foresaw all the consequences which would rise from the adoption of the French principles", but because Burke wrote with so much passion, people were doubtful of his arguments. William Windham spoke from the same bench in the House of Commons as Burke had when he had separated from Fox and an observer said Windham spoke "like the ghost of Burke" when he made a speech against peace with France in 1801. William Hazlitt, a political opponent of Burke, regarded him as amongst his three favourite writers (the others being Junius and Rousseau) and made it "a test of the sense and candour of any one belonging to the opposite party, whether he allowed Burke to be a great man". William Wordsworth was originally a supporter of the French Revolution and attacked Burke in "A Letter to the Bishop of Llandaff" (1793), but 

Flattening the list of lists into a list of string

In [None]:
context_string = list(flatten(context_list))
context_string[:3]

['Edmund Burke',
 'In the 19th century, Burke was praised by both liberals and conservatives. Burke\'s friend Philip Francis wrote that Burke "was a man who truly & prophetically foresaw all the consequences which would rise from the adoption of the French principles", but because Burke wrote with so much passion, people were doubtful of his arguments. William Windham spoke from the same bench in the House of Commons as Burke had when he had separated from Fox and an observer said Windham spoke "like the ghost of Burke" when he made a speech against peace with France in 1801. William Hazlitt, a political opponent of Burke, regarded him as amongst his three favourite writers (the others being Junius and Rousseau) and made it "a test of the sense and candour of any one belonging to the opposite party, whether he allowed Burke to be a great man". William Wordsworth was originally a supporter of the French Revolution and attacked Burke in "A Letter to the Bishop of Llandaff" (1793), but by

Running preprocessing on the flattened data

In [None]:
pp_context = [preprocessing(text) for text in context_string]

In [None]:
print(pp_context[:100])

[['edmund', 'burke'], ['th', 'century', 'burke', 'praised', 'liberal', 'conservative', 'burke', 'friend', 'philip', 'francis', 'wrote', 'burke', 'man', 'truly', 'prophetically', 'foresaw', 'consequence', 'would', 'rise', 'adoption', 'french', 'principle', 'burke', 'wrote', 'much', 'passion', 'people', 'doubtful', 'argument', 'william', 'windham', 'spoke', 'bench', 'house', 'common', 'burke', 'separated', 'fox', 'observer', 'said', 'windham', 'spoke', 'like', 'ghost', 'burke', 'made', 'speech', 'peace', 'france', 'william', 'hazlitt', 'political', 'opponent', 'burke', 'regarded', 'amongst', 'three', 'favourite', 'writer', 'others', 'junius', 'rousseau', 'made', 'test', 'sense', 'candour', 'one', 'belonging', 'opposite', 'party', 'whether', 'allowed', 'burke', 'great', 'man', 'william', 'wordsworth', 'originally', 'supporter', 'french', 'revolution', 'attacked', 'burke', 'letter', 'bishop', 'llandaff', 'early', 'th', 'century', 'changed', 'mind', 'came', 'admire', 'burke', 'two', 'addres

In [None]:
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Embedding models

### Word2Vec

In [None]:
model_w2v = Word2Vec(sentences=pp_context, vector_size=100, window=5, min_count=1, workers=4)

In [None]:
term = 'zoom'
model_w2v.wv.most_similar(term)

[('transat', 0.8238064646720886),
 ('scoot', 0.8111199140548706),
 ('rayani', 0.8079903721809387),
 ('haavir', 0.8048369884490967),
 ('mcclung', 0.7932616472244263),
 ('farnaz', 0.7871574759483337),
 ('borealis', 0.7871526479721069),
 ('gooch', 0.7852275967597961),
 ('weissman', 0.7820276021957397),
 ('mairi', 0.7813788056373596)]

In [None]:
model = model_w2v
# def document_vector(doc):
#     # Remove out-of-vocabulary words and get the mean of the word vectors
#     return np.mean([model.wv[word] for word in doc if word in model.wv.key_to_index], axis=0)

# # Create document embeddings (you might need to exclude empty documents)
# doc_embeddings = np.array([document_vector(doc) for doc in pp_context if len(doc) > 0])

# # Clustering with K-means
# kmeans = KMeans(n_clusters=5, random_state=42)
# clusters = kmeans.fit_predict(doc_embeddings)

# # Dimensionality reduction with t-SNE for visualization
# tsne = TSNE(n_components=2, random_state=42)
# X_reduced = tsne.fit_transform(doc_embeddings)

# # Plotting the clusters
# plt.figure(figsize=(12, 8))
# sns.scatterplot(x=X_reduced[:, 0], y=X_reduced[:, 1], hue=clusters, palette='viridis')
# plt.title('t-SNE visualization of document clusters')
# plt.xlabel('t-SNE feature 1')
# plt.ylabel('t-SNE feature 2')
# plt.legend(loc='best', title='Cluster')
# plt.show()

### SentTransf

In [None]:
!pip -q install -U transformers sentence-transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from sentence_transformers import SentenceTransformer, CrossEncoder

semb_model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
xenc_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Preliminary Tasks
- Clustering docs and visualization of the cluster
- Search task of the docs
- Embedding (word2vec)
- Basic Statistics on dataset (written on assignment)
---
2do later:
- Load model from Github and load it into Pytorch (they did it on TF)


## Clustering_Part

In [None]:
import requests
from bs4 import BeautifulSoup

# Fetch the web page
response = requests.get('https://beerqa.github.io/')
soup = BeautifulSoup(response.text, 'html.parser')

# Assuming documents are text inside specific HTML elements, e.g., paragraphs
documents = [p.text for p in soup.find_all('p')]


In [None]:
documents

In [None]:
data_dict = train_dataset.get('data', {}) # Extracting the data dictionary
data_dict

In [None]:
context_list3 = [entry.get('context', '') for entry in data_dict] # List^3
context_list3

In [None]:
context_list2 = [item for sublist in context_list3 for item in sublist] # List^2
context_list2

In [None]:
context_list = [' '.join(pair) for pair in context_list2] # List
context_list[0]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.8, min_df=5, stop_words='english')

In [None]:
vectorizer.fit(context_list)

In [None]:
vocab = vectorizer.get_feature_names_out()

print(f"Length of vocabulary: {len(vocab)}")

In [None]:
import random

sorted(random.sample(vocab.tolist(),100))

In [None]:
vector_documents = vectorizer.transform(context_list)

In [None]:
print(vector_documents[0])

In [None]:
sorted([(vocab[j], vector_documents[0, j]) for j in vector_documents[0].nonzero()[1]], key=lambda x: -x[1])

In [None]:
print(context_list[0])

In [None]:
for i in range(1,200):
  print('Similarity:', vector_documents[0].multiply(vector_documents[i]).sum())

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

silhouette_scores = []
K = range(10, 50)
for k in K:
    kmeanModel = KMeans(n_clusters=k, max_iter=20, n_init=2, verbose=True, random_state=2307)
    cluster_labels = kmeanModel.fit_predict(vector_documents)
    silhouette_scores.append(silhouette_score(vector_documents, cluster_labels))

# Plotting the silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(K, silhouette_scores, 'bo-')
plt.xlabel('k')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score showing the optimal k')
plt.show()

# Plotting the elbow method
plt.figure(figsize=(10, 6))
plt.plot(K, inertias, 'bx-')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.title('The Elbow Method showing the optimal k')
plt.show()

## Embedding

Creating a copy and visualizing

### Tokenization

TODO: Quantitatively find out different noise in text e.g. "[...]" or (r. 1890-95) (are those the only ones?)

Flattening the context text data into a list of strings

In [None]:
data_dict = train_dataset.get('data', {}) # Extracting the data dictionary
context_list3 = [entry.get('context', '') for entry in data_dict] # List^3
context_list2 = [item for sublist in context_list3 for item in sublist] # List^2
context_list = [' '.join(pair) for pair in context_list2] # List

Removing noise in text data

In [None]:
import re

docs = [re.sub('(\d{2,4})', '', doc) for doc in context_list] # Year dates
docs = [re.sub('(r\..*)', '', doc) for doc in docs] # (r. -)
docs = [re.sub('\[\.{3}\]', '', doc) for doc in docs] # [...]
docs = [re.sub('[\[\]()"-\';]', '', doc) for doc in docs] # Specific non alphanumerical

# TODO: Maybe remove "th"; improve regex expressions
# TODO: Remove stopwords

Splitting into senteces and tokenization

In [None]:
sentences = [re.split('[?!.]\s', doc) for doc in docs]

In [None]:
from pandas.core.common import flatten

sentences = list(flatten(sentences))
tokenized_sentences = [re.sub('\W', ' ', sentence).split() for sentence in sentences]

In [None]:
len(tokenized_sentences)

In [None]:
count = sum(1 for sentence in tokenized_sentences if len(sentence) == 1) # Counting sentences that are only 1 word long
print(count)

In [None]:
for sentence in tokenized_sentences[:5]
    print(sentence)

TODO: Use basic statistics to determine window length in the word2vec model

In [None]:
from gensim.models.word2vec import Word2Vec

model = Word2Vec(tokenized_sentences, vector_size=30, min_count=5, window=10)

In [None]:
len(model.wv)

Playing with the model

In [None]:
term = 'zoom'
model.wv.most_similar(term)

Life is to death what royce is to pizza

In [None]:
A = 'death'
B = 'life'
C = 'pizza'
vec = model.wv.get_vector(A) - model.wv.get_vector(B) + model.wv.get_vector(C)
model.wv.similar_by_vector(vec)

### Visualization

Finding random subset

In [None]:
import random

sample = random.sample(list(model.wv.key_to_index), 500)
word_vectors = model.wv[sample]

Visualizing the tSNE projection in 3D

In [None]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=3, n_iter=2000)
tsne_embedding = tsne.fit_transform(word_vectors)

In [None]:
import numpy as np

x, y, z = np.transpose(tsne_embedding)

In [None]:
import plotly.express as px

fig = px.scatter_3d(x=x[:200],y=y[:200],z=z[:200],text=sample[:200])
fig.update_traces(marker=dict(size=3,line=dict(width=2)),textfont_size=10)
fig.show()

TODO: PCA to properly reduce dimensionality