<a href="https://colab.research.google.com/github/nespar7/Spring_24/blob/main/NLP/Assignments/Assignment_1/NLP_A1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installations

Please reload session after the installations are done and DO NOT run the install block again.

In [None]:
!pip install spacy --force-reinstall
!python -m spacy download en_core_web_sm
!pip install contextualSpellCheck
!pip install scikit-learn
!pip install nltk

In [1]:
# make sure spacy version is 3.7.2
!python -m spacy info

[1m

spaCy version    3.7.2                         
Location         /usr/local/lib/python3.10/dist-packages/spacy
Platform         Linux-6.1.58+-x86_64-with-glibc2.35
Python version   3.10.12                       
Pipelines        en_core_web_sm (3.7.1)        



## Imports

In [2]:
import spacy
import contextualSpellCheck
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk import download as nltk_dw
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk_dw('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Models

In [110]:
nlp = spacy.load("en_core_web_sm")

contextualSpellCheck.add_to_pipe(nlp)

# Removing since initially only spell check is to be done
nlp.remove_pipe('lemmatizer')
nlp.remove_pipe('tagger')
nlp.remove_pipe('ner')

('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7bb7c8fdcf20>)

## Function Definitions

In [95]:
def spell_check(text):
    doc = nlp(text)

    return doc._.performed_spellCheck, doc._.outcome_spellCheck

In [96]:
def correct_spellings(csv_file, out_csv_file, column_name, should_log=False):
    # read a csv file, in each line, see if spellCheck was performed(if text was correctly spelled) and print original and corrected texts
    lines = []

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        lines = list(reader)

        # Index of the column to be spell checked
        column_index = lines[0].index(column_name)

        # For each line, perform spell check
        for i in range(1, len(lines)):
            line = lines[i][column_index]
            performed, outcome = spell_check(line)


            # If spell check was performed, update the line and print the original and corrected texts
            if performed:
                # Update the line
                lines[i][column_index] = outcome

                if should_log:
                    print(line)
                    print(outcome)
                    print()

            if i%100 == 0:
                print(i)

    with open(out_csv_file, 'w') as f:
        writer = csv.writer(f)
        writer.writerows(lines)

    print("Spell check completed")

In [97]:
def pre_process(csv_file, out_csv_file, column_name):
    # read a csv file, in each line, remove characters apart from alphanumerics and whitespaces in the given column
    lines = []

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        lines = list(reader)

        # Index of the column to be pre processed
        column_index = lines[0].index(column_name)

        for i in range(1, len(lines)):
            lines[i][column_index] = ''.join([c for c in lines[i][column_index] if c.isalnum() or c.isspace()])

    # write the pre-processed data to the resulting csv file
    with open(out_csv_file, 'w') as f:
        writer = csv.writer(f)
        writer.writerows(lines)

In [98]:
def get_vocabulary(nlp, csv_file, column_name):
    # Read the spell checked docs csv file and store the documents in a list
    docs = []

    print(nlp.pipeline)

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        lines = [line for line in reader]

        doc_col_index = lines[0].index(column_name)

        for i in range(1, len(lines)):
            docs.append(lines[i][doc_col_index])

        num_docs = len(docs)
        lower_lim = 5
        upper_lim = int(num_docs * 0.85)

        # tokenize the documents using spacy and store the tokens in a vocabulary,
        # remove the tokens that occur less than lower_lim times and more than upper_lim times

        doc_tokenized = []

        # use the model to tokenize the documents
        for doc in docs:
            # process doc using the model passed
            doc = nlp(doc)

            # Select alphabetical tokens and remove stop words
            tokens = [token.text.lower() for token in doc if token.is_alpha and not token.is_stop]

            doc_tokenized.append(tokens)

        # Now that we have the tokenized documents, we can create a vocabulary
        vocabulary = {}

        for doc in doc_tokenized:
            for token in doc:
                if token in vocabulary:
                    vocabulary[token] += 1
                else:
                    vocabulary[token] = 1

        # Remove the tokens that occur less than lower_lim times and more than upper_lim times
        vocabulary = [k for k, v in vocabulary.items() if v >= lower_lim and v <= upper_lim]

        return vocabulary

In [99]:
def get_processed_text(nlp, csv_file, col_name):
    # Read the csv file and return the processed text using the passed model
    lines = []
    docs = []

    print(nlp.pipeline)

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        lines = list(reader)

        col_index = lines[0].index(col_name)

        for i in range(1, len(lines)):
            doc = nlp(lines[i][col_index])
            docs.append(' '.join([token.text.lower() for token in doc]))

    return docs

In [100]:
def get_stemmed_vocabulary(csv_file, col_name):
    # Read the spell checked docs csv file and store the documents in a list
    docs = []

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        lines = [line for line in reader]

        doc_col_index = lines[0].index(col_name)

        for i in range(1, len(lines)):
            docs.append(lines[i][doc_col_index])

    # tokenize the documents using nltk and store the tokens in a vocabulary,
    # remove the tokens that occur less than lower_lim times and more than upper_lim times

    lower_lim = 5
    upper_lim = int(len(docs) * 0.85)

    doc_tokenized = []

    # use nltk to tokenize the documents
    for doc in docs:
        doc = word_tokenize(doc)

        tokens = [token.lower() for token in doc if token.isalpha()]

        # stem the tokens
        stemmed_tokens = [ps.stem(token) for token in tokens]

        doc_tokenized.append(stemmed_tokens)

    vocabulary = {}

    for doc in doc_tokenized:
        for token in doc:
            if token in vocabulary:
                vocabulary[token] += 1
            else:
                vocabulary[token] = 1

    # Remove the tokens that occur less than lower_lim times and more than upper_lim times
    stemmed_vocabulary = [k for k, v in vocabulary.items() if v >= lower_lim and v <= upper_lim]

    return stemmed_vocabulary

In [101]:
def get_stemmed_texts(csv_file, column_name, porter_stemmer):
    # Read the spell checked docs csv file and store the queries in a list
    texts = []

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        lines = [line for line in reader]

        col_index = lines[0].index(column_name)

        for i in range(1, len(lines)):
            texts.append(lines[i][col_index])

    stemmed_texts = []

    # use nltk to tokenize the documents
    for text in texts:
        text = word_tokenize(text)

        tokens = [token.lower() for token in text]

        # stem the tokens
        stemmed_tokens = [porter_stemmer.stem(token) for token in tokens]

        stemmed_texts.append(' '.join(stemmed_tokens))

    return stemmed_texts

In [102]:
def get_lemmatized_vocabulary(nlp, csv_file, column_name):
    # Read the spell checked docs csv file and store the documents in a list
    docs = []

    print(nlp.pipeline)

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        lines = [line for line in reader]

        doc_col_index = lines[0].index(column_name)

        for i in range(1, len(lines)):
            docs.append(lines[i][doc_col_index])

        num_docs = len(docs)
        lower_lim = 5
        upper_lim = int(num_docs * 0.85)

        # tokenize the documents using spacy and store the tokens in a vocabulary,
        # remove the tokens that occur less than lower_lim times and more than upper_lim times

        doc_tokenized = []

        # use the model to tokenize the documents
        for i, doc in enumerate(docs):
            # process doc using the model passed
            doc = nlp(doc)

            # Select alphabetical tokens and remove stop words
            tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

            doc_tokenized.append(tokens)

        # Now that we have the tokenized documents, we can create a vocabulary
        vocabulary = {}

        for doc in doc_tokenized:
            for token in doc:
                if token in vocabulary:
                    vocabulary[token] += 1
                else:
                    vocabulary[token] = 1

        # Remove the tokens that occur less than lower_lim times and more than upper_lim times
        vocabulary = [k for k, v in vocabulary.items() if v >= lower_lim and v <= upper_lim]

        return vocabulary

In [103]:
def get_lemmatized_text(nlp, csv_file, col_name):
    # Read the csv file and return the processed text using the passed model
    lines = []
    docs = []

    print(nlp.pipeline)

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        lines = list(reader)

        col_index = lines[0].index(col_name)

        for i in range(1, len(lines)):
            doc = nlp(lines[i][col_index])
            docs.append(' '.join([token.lemma_ for token in doc]))

    return docs

In [104]:
def get_texts(nlp, csv_file, id_col, text_col):
    # Read the spell checked docs csv file and store the documents in a list
    ids = []
    texts = []

    print(nlp.pipeline)

    with open(csv_file, 'r') as f:
        reader = csv.reader(f)
        lines = [line for line in reader]

        id_col_index = lines[0].index(id_col)
        text_col_index = lines[0].index(text_col)

        for i in range(1, len(lines)):
            ids.append(lines[i][id_col_index])

            doc = nlp(lines[i][text_col_index])
            texts.append(' '.join([token.text.lower() for token in doc]))

    return ids, texts

In [105]:
def get_relevant_docs(qdrels):
    # Get relevant docs list for each query from qdrels.csv file
    relevant_docs = {}

    with open(qdrels, 'r') as f:
        reader = csv.reader(f)
        lines = [line for line in reader]

        for i in range(1, len(lines)):
            line = lines[i]

            query_id = line[1]
            doc_id = line[2]

            if query_id in relevant_docs:
                relevant_docs[query_id].append(doc_id)
            else:
                relevant_docs[query_id] = [doc_id]

    return relevant_docs

In [106]:
def precision_k_score(query_id_idx_map, relevant_docs, top_docs, k):
    # For each query in relevant_docs, calculate true positives in top_docs and return average precision_k score
    precision_k = 0

    for query_id, relevant_docs_list in relevant_docs.items():
        true_positives = 0

        query_idx = query_id_idx_map[query_id]

        for doc in top_docs[query_idx]:
            if doc in relevant_docs_list:
                true_positives += 1

        precision_k += true_positives / k

    return precision_k / len(relevant_docs)

In [107]:
def weigh_vectors(nlp, vocabulary, docs_vector, queries_vector):
    print(nlp.pipeline)

    for i, token in enumerate(vocabulary):
        doc = nlp(token)

        # There is only one word in doc but taking a list to make it easier to understand
        pos = [token.pos_ for token in doc]
        nes = doc.ents

        # If part of speech is tagged as noun multiply that token's value by 2 in all the vectors
        if pos and pos[0] == "NOUN":
            docs_vector[:][i] *= 2
            queries_vector[:][i] *= 2

        # If entities are found multiply by 4
        if nes:
            docs_vector[:][i] *= 4
            queries_vector[:][i] *= 4

    return docs_vector, queries_vector

## Main

The CSV file locations need to be changed as applicable if being run locally.

In [60]:
docs_file = "/content/Query_Doc/docs.csv"
queries_file = "/content/Query_Doc/queries.csv"
pre_processed_docs_file = "/content/Query_Doc/pre_processed_docs.csv"
pre_processed_queries_file = "/content/Query_Doc/pre_processed_queries.csv"
spell_checked_docs_file = "/content/Query_Doc/spell_checked_docs.csv"
spell_checked_queries_file = "/content/Query_Doc/spell_checked_queries.csv"
qdrels_file = "/content/Query_Doc/qdrel.csv"

Pre processing

In [61]:
pre_process(docs_file, pre_processed_docs_file, "doc_text")
pre_process(queries_file, pre_processed_queries_file, "query_text")

Spell correction on the pre processed docs and queries

In [None]:
# Print logs(original and corrected for query doc only)
correct_spellings(pre_processed_queries_file, spell_checked_queries_file, "query_text", True)
correct_spellings(pre_processed_docs_file, spell_checked_docs_file, "doc_text")

What is Atal Pension Yojana What are its benefits
What is the Pension? What are its benefits

Where is starch digested How is it digested
Where is it eaten How is it here

What can India do to support the people suffering from civilian war in Syria
What can I do to support the people suffering from civilian war in India

How do introverts enjoy life
How do they enjoy life

Kindly tell me whole process of admission at vits Vellore for biotechi m a bio student in 12I dont have math there
Kindly tell me whole process of admission at the college for i m a bio student in I donot have math there

What did Theodore Roosevelt mean when he said Black care never sits behind a rider whose pace is fast enough
What did Theodore Roosevelt mean when he said Black care never sits behind a rider whose pace is fast enough

How does Quora look to a moderator
How does one look to a man

Did Tywin sleep with Shae out of pure spite
Did you sleep with her out of pure spite

Why does phase shift take place in

Tokenize the documents and get the tokens forming the vocabulary

In [111]:
# Remove contextualSpellCheck so that it does not run while tokenizing
nlp.remove_pipe('contextual spellchecker')

('contextual spellchecker',
 <contextualSpellCheck.contextualSpellCheck.ContextualSpellCheck at 0x7bb7c911aef0>)

In [112]:
vocabulary = get_vocabulary(nlp, spell_checked_docs_file, "doc_text")

TypeError: get_vocabulary() missing 1 required positional argument: 'column_name'

In [113]:
print(len(vocabulary))

2030


In [116]:
doc_ids, docs = get_texts(nlp, spell_checked_docs_file, 'doc_id', 'doc_text')
query_ids, queries = get_texts(nlp, spell_checked_queries_file, 'query_id', 'query_text')

query_id_idx_map = {query_ids[i]: i for i in range(len(query_ids))}
doc_id_idx_map = {doc_ids[i]: i for i in range(len(doc_ids))}

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7bb7c8f8f220>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7bb7c8fdd070>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7bb7db97d340>)]
[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7bb7c8f8f220>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7bb7c8fdd070>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7bb7db97d340>)]


In [117]:
relevant_docs = get_relevant_docs(qdrels_file)

Vectorize documents and queries, and find cosine similarities for each query with the documents

In [118]:
vectorizer = TfidfVectorizer(vocabulary=vocabulary)

# Create tf idf vectors for the documents and queries
doc_vectors = vectorizer.fit_transform(docs)
query_vectors = vectorizer.transform(queries)

In [119]:
similarities = cosine_similarity(query_vectors, doc_vectors)

In [120]:
# find top 5 and top 10 similar document indices for each query
top_5_indices = np.argsort(similarities, axis=1)[:, -5:]
top_10_indices = np.argsort(similarities, axis=1)[:, -10:]

# for each query, store the top 5 and top 10 similar document ids
top_5_ids = [[doc_ids[i] for i in indices] for indices in top_5_indices]
top_10_ids = [[doc_ids[i] for i in indices] for indices in top_10_indices]
top_ids = [[i[-1]] for i in top_5_ids]

In [121]:
prec_1 = precision_k_score(query_id_idx_map, relevant_docs, top_ids, 1)
prec_5 = precision_k_score(query_id_idx_map, relevant_docs, top_5_ids, 5)
prec_10 = precision_k_score(query_id_idx_map, relevant_docs, top_10_ids, 10)

print(f"Average precision at 1: {round(prec_1 * 100, 2)} %")
print(f"Average precision at 5: {round(prec_5 * 100, 2)} %")
print(f"Average precision at 10: {round(prec_10 * 100, 2)} %")

Average precision at 1: 46.0 %
Average precision at 5: 15.0 %
Average precision at 10: 8.4 %


## Task 2

### Stemming

Using nltk's porter stemmer here since spacy does not provide a stemmer

In [122]:
ps = PorterStemmer()

In [123]:
stemmed_vocabulary = get_stemmed_vocabulary(spell_checked_docs_file, "doc_text")
stemmed_docs = get_stemmed_texts(spell_checked_docs_file, "doc_text", ps)
stemmed_queries = get_stemmed_texts(spell_checked_queries_file, "query_text", ps)

In [124]:
print(len(stemmed_vocabulary))

1971


Vectorize the documents and queries, and find cosine similarities for each query with documents

In [125]:
vectorizer_stemmed = TfidfVectorizer(vocabulary=stemmed_vocabulary)

# Create tf idf vectors for the documents and queries
stemmed_doc_vectors = vectorizer_stemmed.fit_transform(stemmed_docs)
stemmed_query_vectors = vectorizer_stemmed.transform(stemmed_queries)

In [126]:
stemmed_similarities = cosine_similarity(stemmed_query_vectors, stemmed_doc_vectors)

# find top 5 and top 10 similar document indices for each query
stemmed_top_5_indices = np.argsort(stemmed_similarities, axis=1)[:, -5:]
stemmed_top_10_indices = np.argsort(stemmed_similarities, axis=1)[:, -10:]

# for each query, store the top 5 and top 10 similar document ids
stemmed_top_5_ids = [[doc_ids[i] for i in indices] for indices in stemmed_top_5_indices]
stemmed_top_10_ids = [[doc_ids[i] for i in indices] for indices in stemmed_top_10_indices]
stemmed_top_ids = [[ids[-1]] for ids in stemmed_top_5_ids]

Precision Scores

In [127]:
prec_1_stemmed = precision_k_score(query_id_idx_map, relevant_docs, stemmed_top_ids, 1)
prec_5_stemmed = precision_k_score(query_id_idx_map, relevant_docs, stemmed_top_5_ids, 5)
prec_10_stemmed = precision_k_score(query_id_idx_map, relevant_docs, stemmed_top_10_ids, 10)

print(f"Average precision at 1 for stemmed: {round(prec_1_stemmed * 100, 2)} %")
print(f"Average precision at 5 for stemmed: {round(prec_5_stemmed * 100, 2)} %")
print(f"Average precision at 10 for stemmed: {round(prec_10_stemmed * 100, 2)} %")

Average precision at 1 for stemmed: 57.0 %
Average precision at 5 for stemmed: 18.2 %
Average precision at 10 for stemmed: 10.1 %


## Lemmatization

Add lemmatizer back to the nlp pipe

In [128]:
nlp.add_pipe('lemmatizer')
nlp.initialize()

<thinc.optimizers.Optimizer at 0x7bb7db751940>

In [129]:
lemmatized_vocabulary = get_lemmatized_vocabulary(nlp, spell_checked_docs_file, "doc_text")
lemmatized_docs = get_lemmatized_text(nlp, spell_checked_docs_file, "doc_text")
lemmatized_queries = get_lemmatized_text(nlp, spell_checked_queries_file, "query_text")

print(len(lemmatized_vocabulary))

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7bb7c8f8f220>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7bb7c8fdd070>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7bb7db97d340>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7bb7e2b5c0c0>)]


  matches = self.matcher(doc, allow_missing=True, as_spans=False)


[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7bb7c8f8f220>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7bb7c8fdd070>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7bb7db97d340>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7bb7e2b5c0c0>)]
[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7bb7c8f8f220>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7bb7c8fdd070>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7bb7db97d340>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7bb7e2b5c0c0>)]
2030


In [130]:
print(len(lemmatized_vocabulary))

2030


Vectorize the documents and queries, and find cosine similarities for each query with documents

In [131]:
vectorizer_lemmatized = TfidfVectorizer(vocabulary=lemmatized_vocabulary)

# Create tf idf vectors for the documents and queries
lemmatized_doc_vectors = vectorizer_lemmatized.fit_transform(lemmatized_docs)
lemmatized_query_vectors = vectorizer_lemmatized.transform(lemmatized_queries)

In [132]:
lemmatized_similarities = cosine_similarity(lemmatized_query_vectors, lemmatized_doc_vectors)

# find top 5 and top 10 similar document indices for each query
lemmatized_top_5_indices = np.argsort(lemmatized_similarities, axis=1)[:, -5:]
lemmatized_top_10_indices = np.argsort(lemmatized_similarities, axis=1)[:, -10:]

# for each query, store the top 5 and top 10 similar document ids
lemmatized_top_5_ids = [[doc_ids[i] for i in indices] for indices in lemmatized_top_5_indices]
lemmatized_top_10_ids = [[doc_ids[i] for i in indices] for indices in lemmatized_top_10_indices]
lemmatized_top_ids = [[ids[-1]] for ids in lemmatized_top_5_ids]

Precision Scores

In [133]:
prec_1_lemmatized = precision_k_score(query_id_idx_map, relevant_docs, lemmatized_top_ids, 1)
prec_5_lemmatized = precision_k_score(query_id_idx_map, relevant_docs, lemmatized_top_5_ids, 5)
prec_10_lemmatized = precision_k_score(query_id_idx_map, relevant_docs, lemmatized_top_10_ids, 10)

print(f"Average precision at 1 for lemmatized: {round(prec_1_lemmatized * 100, 2)} %")
print(f"Average precision at 5 for lemmatized: {round(prec_5_lemmatized * 100, 2)} %")
print(f"Average precision at 10 for lemmatized: {round(prec_10_lemmatized * 100, 2)} %")

Average precision at 1 for lemmatized: 46.0 %
Average precision at 5 for lemmatized: 15.0 %
Average precision at 10 for lemmatized: 8.4 %


## NER and POS tagging

In [134]:
nlp.add_pipe('ner')
nlp.add_pipe('tagger', source=spacy.load("en_core_web_sm"))
nlp.initialize()

<thinc.optimizers.Optimizer at 0x7bb7e29de7a0>

In [135]:
ner_pos_vocabulary = get_vocabulary(nlp, spell_checked_docs_file, "doc_text")
ner_pos_docs = get_processed_text(nlp, spell_checked_docs_file, "doc_text")
ner_pos_queries = get_processed_text(nlp, spell_checked_queries_file, "query_text")

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7bb7c8f8f220>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7bb7c8fdd070>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7bb7db97d340>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7bb7e2b5c0c0>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x7bb7e2044040>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x7bb7da75d600>)]
[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7bb7c8f8f220>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7bb7c8fdd070>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7bb7db97d340>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7bb7e2b5c0c0>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x7bb7e2044040>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x7bb7da75d600>)]
[('tok2vec', <spacy.pipeline.tok2vec.Tok2V

In [136]:
print(len(ner_pos_vocabulary))

2030


In [137]:
vectorizer_ner_pos = TfidfVectorizer(vocabulary=ner_pos_vocabulary)

# Create tf idf vectors for the documents and queries
ner_pos_doc_vectors = vectorizer_ner_pos.fit_transform(ner_pos_docs)
ner_pos_query_vectors = vectorizer_ner_pos.transform(ner_pos_queries)

In [138]:
ner_pos_doc_vectors, ner_pos_query_vectors = weigh_vectors(nlp, ner_pos_vocabulary, ner_pos_doc_vectors, ner_pos_query_vectors)

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7bb7c8f8f220>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7bb7c8fdd070>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7bb7db97d340>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7bb7e2b5c0c0>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x7bb7e2044040>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x7bb7da75d600>)]


In [139]:
ner_pos_similarities = cosine_similarity(ner_pos_query_vectors, ner_pos_doc_vectors)

# find top 5 and top 10 similar document indices for each query
ner_pos_top_5_indices = np.argsort(ner_pos_similarities, axis=1)[:, -5:]
ner_pos_top_10_indices = np.argsort(ner_pos_similarities, axis=1)[:, -10:]

# for each query, store the top 5 and top 10 similar document ids
ner_pos_top_5_ids = [[doc_ids[i] for i in indices] for indices in ner_pos_top_5_indices]
ner_pos_top_10_ids = [[doc_ids[i] for i in indices] for indices in ner_pos_top_10_indices]
ner_pos_top_ids = [[ids[-1]] for ids in ner_pos_top_5_ids]

In [140]:
prec_1_ner_pos = precision_k_score(query_id_idx_map, relevant_docs, ner_pos_top_ids, 1)
prec_5_ner_pos = precision_k_score(query_id_idx_map, relevant_docs, ner_pos_top_5_ids, 5)
prec_10_ner_pos = precision_k_score(query_id_idx_map, relevant_docs, ner_pos_top_10_ids, 10)

print(f"Average precision at 1 for ner and pos: {round(prec_1_ner_pos * 100, 2)} %")
print(f"Average precision at 5 for ner and pos: {round(prec_5_ner_pos * 100, 2)} %")
print(f"Average precision at 10 for ner and pos: {round(prec_10_ner_pos * 100, 2)} %")

Average precision at 1 for ner and pos: 46.0 %
Average precision at 5 for ner and pos: 15.0 %
Average precision at 10 for ner and pos: 8.4 %
