In [1]:
import pandas as pd
import json
import random
import torch
import sklearn
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm
from torch.amp import autocast
import nltk
from nltk.corpus import stopwords
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
import re
import spacy
import autofaiss
import faiss

In [2]:
is_cuda = torch.cuda.is_available()
print("Cuda is available: ", is_cuda)
print("GPU: ", torch.cuda.get_device_name(0))
device = torch.device("cuda" if is_cuda else "cpu")

Cuda is available:  True
GPU:  Quadro RTX 4000


In [3]:
dev = pd.read_csv('./Data/dev.csv')
test = pd.read_csv('./Data/test.csv')
train = pd.read_csv('./Data/train.csv')
sample_submission = pd.read_csv('./Data/sample_submission.csv')
with open("./Data/corpus.json", "r") as f:
    documents = json.load(f)

In [4]:
docs_fr =[doc['text'] for doc in documents if doc['lang'] == 'fr']
docs_fr_ids = [doc['docid'] for doc in documents if doc['lang'] == 'fr']

In [5]:
nlp = spacy.load('fr_core_news_sm')


nltk.download('stopwords')
stopwords_fr = set(stopwords.words('french'))
stopwords_fr =list(stopwords_fr)
def clean_text(text):
    # soup = BeautifulSoup(text, "html.parser")

    # for img in soup.find_all("img"):
    #     img.decompose()
    #
    # for table in soup.find_all("table"):
    #     table.decompose()

    # clean_text = soup.get_text()
    # Step 1: Remove URLs
    text = re.sub(r"http[s]?://\S+|www\.\S+", " ", text)

    # Step 2: Remove long sequences of non-alphanumeric characters (e.g., encoded data or code)
    text = re.sub(r"[^\w\s]{10,}", " ", text)  # Removes any sequence of 10 or more non-alphanumeric characters

    # Step 3: Remove excessive whitespace
    clean_text = re.sub(r"\s+", " ", text, flags=re.UNICODE).strip()

    return clean_text
def preprocess_text (text):
    text = " ".join([ent.text for ent in text.ents if ent.text not in stopwords_fr])
    return text



[nltk_data] Downloading package stopwords to
[nltk_data]     /users/eleves-b/2021/madeleine.hueber/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
test_docs = docs_fr[:5000]
test_docs = [clean_text(doc) for doc in test_docs]
docs_fr = [clean_text(doc) for doc in tqdm(docs_fr)]

100%|██████████| 10676/10676 [00:21<00:00, 504.74it/s]


In [7]:
%time

test_docs = [preprocess_text(doc) for doc in tqdm(nlp.pipe(test_docs,disable=["tagger","parser","textcat"]),total=len(test_docs))]

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.1 µs


100%|██████████| 1000/1000 [01:32<00:00, 10.85it/s]
100%|██████████| 1000/1000 [01:35<00:00, 10.52it/s]
100%|██████████| 1000/1000 [01:37<00:00, 10.31it/s]
100%|██████████| 1000/1000 [06:07<00:00,  2.72it/s]
100%|██████████| 1000/1000 [06:22<00:00,  2.61it/s]


In [10]:
vectorizer = TfidfVectorizer(analyzer='word', max_df = 0.1,stop_words=stopwords_fr)
doc_vectors_2 = vectorizer.fit_transform(test_docs)

In [11]:
print(doc_vectors_2.shape)
doc_vectors_2 = doc_vectors_2.toarray()

(5000, 278006)


In [12]:
# load document embeddings
d = doc_vectors_2.shape[1] # dimension of the embeddings
N = 30 # nb of chunks neighbors in the graph

#Normalize vectors 
doc_vectors_2 = doc_vectors_2 / np.linalg.norm(doc_vectors_2, axis=1)[:, None]

#Create Index 
index = autofaiss.build_index(doc_vectors_2, save_on_disk=True, index_path="my_index.faiss", max_index_memory_usage="2GB",metric_type=faiss.METRIC_INNER_PRODUCT)


2024-11-02 12:16:49,857 [INFO]: Using 16 omp threads (processes), consider increasing --nb_cores if you have more
2024-11-02 12:16:52,615 [INFO]: Launching the whole pipeline 11/02/2024, 12:16:52
2024-11-02 12:16:52,615 [INFO]: Reading total number of vectors and dimension 11/02/2024, 12:16:52
100%|██████████| 1/1 [00:00<00:00, 24672.38it/s]
2024-11-02 12:16:52,633 [INFO]: There are 5000 embeddings of dim 278006
2024-11-02 12:16:52,634 [INFO]: >>> Finished "Reading total number of vectors and dimension" in 0.0181 secs
2024-11-02 12:16:52,634 [INFO]: 	Compute estimated construction time of the index 11/02/2024, 12:16:52
2024-11-02 12:16:52,634 [INFO]: 		-> Train: 16.7 minutes
2024-11-02 12:16:52,635 [INFO]: 		-> Add: 15.5 seconds
2024-11-02 12:16:52,635 [INFO]: 		Total: 16.9 minutes
2024-11-02 12:16:52,635 [INFO]: 	>>> Finished "Compute estimated construction time of the index" in 0.0013 secs
2024-11-02 12:16:52,636 [INFO]: 	Checking that your have enough memory available to create the 

In [15]:
index = faiss.read_index("my_index.faiss")


In [32]:
i=5
print(dev[dev["lang"] == "fr"].iloc[i])
query = dev[dev["lang"] == "fr"].iloc[i]["query"] 

idx = dev[dev["lang"] == "fr"].iloc[i]["positive_docs"]
idx = docs_fr_ids.index(idx)
print(idx)

query_id                                                   q-fr-43
query            identiques, mais il y a une légère différence ...
positive_docs                                           doc-fr-344
negative_docs    ['doc-fr-345', 'doc-fr-346', 'doc-fr-347', 'do...
lang                                                            fr
Name: 206, dtype: object
2360


In [None]:

query = clean_text(query)
if len(preprocess_text(nlp(query))) >0 :
    query = preprocess_text(nlp(query))
query = vectorizer.transform([query])
query = query.toarray()
query = query / np.linalg.norm(query, axis=1)[:, None]

print(query@doc_vectors_2[idx])

distances, indices = index.search(query, 5)

for i in range(5):
    print(docs_fr_ids[indices[0][i]])
    print(docs_fr[indices[0][i]])
    print(distances[0][i])



[nan]


  query = query / np.linalg.norm(query, axis=1)[:, None]
