<a href="https://colab.research.google.com/github/nespar7/Spring_24/blob/main/NLP/Assignments/Assignment_1/NLP_A1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installations

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install contextualSpellCheck
!pip install scikit-learn
!pip install nltk

## Imports

In [None]:
import spacy
import contextualSpellCheck
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk import download as nltk_dw
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Tokenizer data used by word_tokenize: older NLTK releases read 'punkt',
# newer releases look for 'punkt_tab' instead, so fetch both.
nltk_dw('punkt')
nltk_dw('punkt_tab')

### Models

In [None]:
# Load spaCy's small English pipeline; `nlp` is the global pipeline used by spell_check()
# and passed to get_vocabulary() later in the notebook.
nlp = spacy.load("en_core_web_sm")
# Register contextualSpellCheck on the pipeline so that processed docs expose the
# `performed_spellCheck` / `outcome_spellCheck` extension attributes read by spell_check().
contextualSpellCheck.add_to_pipe(nlp)

## Function Definitions

In [None]:
def spell_check(text):
    """Run `text` through the global `nlp` pipeline and report the spell-check result.

    Returns:
        A `(performed, outcome)` tuple read from the processed doc's
        `performed_spellCheck` and `outcome_spellCheck` extension attributes.
    """
    processed = nlp(text)

    performed = processed._.performed_spellCheck
    outcome = processed._.outcome_spellCheck
    return performed, outcome

In [None]:
def correct_spellings(csv_file, out_csv_file, column_name, should_log=False):
    """Spell-correct one column of a CSV file and write the result to a new CSV.

    Every data row's `column_name` value is passed to `spell_check`. When a
    correction was performed, the cell is replaced by the corrected text; all
    other cells (and the header row) are copied unchanged.

    Args:
        csv_file: path of the input CSV; its first row must be a header.
        out_csv_file: path of the CSV to write.
        column_name: header name of the column to spell check.
        should_log: when True, print the original and corrected text of every
            row that was changed.

    Raises:
        IndexError: if the input file has no rows.
        ValueError: if `column_name` is not in the header row.
    """
    # newline='' is what the csv module expects: it keeps newlines embedded in
    # quoted fields intact on read and avoids extra blank rows on write.
    with open(csv_file, 'r', newline='') as f:
        lines = list(csv.reader(f))

    # Index of the column to be spell checked
    column_index = lines[0].index(column_name)

    # The input file is already closed here; spell checking can be slow.
    for i in range(1, len(lines)):
        original = lines[i][column_index]
        performed, outcome = spell_check(original)

        # Only rows where a correction was performed are rewritten
        if performed:
            lines[i][column_index] = outcome

            if should_log:
                print(original)
                print(outcome)
                print()

        # Progress indicator every 100 rows
        if i % 100 == 0:
            print(i)

    with open(out_csv_file, 'w', newline='') as f:
        csv.writer(f).writerows(lines)

    print("Spell check completed")

In [None]:
def pre_process(csv_file, out_csv_file, column_name):
    """Strip everything except alphanumerics and whitespace from one CSV column.

    The header row and all other columns are copied unchanged to `out_csv_file`.

    Args:
        csv_file: path of the input CSV; its first row must be a header.
        out_csv_file: path of the CSV to write.
        column_name: header name of the column to clean.

    Raises:
        IndexError: if the input file has no rows.
        ValueError: if `column_name` is not in the header row.
    """
    # newline='' is what the csv module expects: it keeps newlines embedded in
    # quoted fields intact on read and avoids extra blank rows on write.
    with open(csv_file, 'r', newline='') as f:
        lines = list(csv.reader(f))

    # Index of the column to be pre processed
    column_index = lines[0].index(column_name)

    for row in lines[1:]:
        row[column_index] = ''.join(c for c in row[column_index] if c.isalnum() or c.isspace())

    # write the pre-processed data to the resulting csv file
    with open(out_csv_file, 'w', newline='') as f:
        csv.writer(f).writerows(lines)

    print("Pre procesing completed")

In [None]:
def get_vocabulary(model, csv_file, column_name):
    """Build a frequency-filtered vocabulary from one text column of a CSV file.

    Each data row's `column_name` value is passed to `model`. Tokens that are
    alphabetic and not stop words are lower-cased and counted over the whole
    corpus; tokens counted fewer than 5 times or more than
    `int(0.85 * number_of_documents)` times are dropped.

    Args:
        model: callable that takes a string and returns an iterable of tokens
            exposing `text`, `is_alpha` and `is_stop` (e.g. a spaCy pipeline).
        csv_file: path of the input CSV; its first row must be a header.
        column_name: header name of the column holding the documents.

    Returns:
        List of the kept tokens, in the order they were first seen.

    Raises:
        IndexError: if the input file has no rows.
        ValueError: if `column_name` is not in the header row.
    """
    # newline='' is what the csv module expects, so newlines embedded in
    # quoted fields reach the model unchanged.
    with open(csv_file, 'r', newline='') as f:
        lines = list(csv.reader(f))

    doc_col_index = lines[0].index(column_name)
    docs = [line[doc_col_index] for line in lines[1:]]

    lower_lim = 5
    upper_lim = int(len(docs) * 0.85)

    # NOTE(review): the counts below are total occurrences across the corpus,
    # while upper_lim is derived from the number of documents - confirm that
    # term frequency (not document frequency) is what is intended.
    vocabulary = {}

    for text in docs:
        for token in model(text):
            # Select alphabetical tokens and remove stop words
            if token.is_alpha and not token.is_stop:
                word = token.text.lower()
                vocabulary[word] = vocabulary.get(word, 0) + 1

    # Keep only the tokens whose count lies within [lower_lim, upper_lim]
    return [word for word, count in vocabulary.items() if lower_lim <= count <= upper_lim]

In [None]:
def get_processed_text(model, csv_file, col_name):
    """Apply `model` to every value of one CSV column and return the results.

    Args:
        model: callable applied to each cell's text.
        csv_file: path of the input CSV; its first row must be a header.
        col_name: header name of the column to process.

    Returns:
        List with one `model(text)` result per data row, in file order.
        The header row is not included.

    Raises:
        IndexError: if the input file has no rows.
        ValueError: if `col_name` is not in the header row.
    """
    # newline='' is what the csv module expects for files handed to csv.reader
    with open(csv_file, 'r', newline='') as f:
        lines = list(csv.reader(f))

    col_index = lines[0].index(col_name)

    # Select the column from every data row. The previous `lines[:][col_index]`
    # copied the list of rows and then returned the single row at position
    # `col_index` instead of the processed column.
    return [model(line[col_index]) for line in lines[1:]]

In [None]:
def get_stemmed_vocabulary(csv_file, col_name, porter_stemmer=None):
    """Stem one text column of a CSV file and build a frequency-filtered vocabulary.

    Each document is tokenized with `word_tokenize`; alphabetic tokens are
    lower-cased and stemmed. Stems counted fewer than 5 times or more than
    `int(0.85 * number_of_documents)` times over the whole corpus are dropped
    from the vocabulary (the stemmed documents themselves are not filtered).

    Args:
        csv_file: path of the input CSV; its first row must be a header.
        col_name: header name of the column holding the documents.
        porter_stemmer: object with a `stem(token)` method. Defaults to a new
            `PorterStemmer()`, so the function no longer relies on a global
            stemmer being defined before it is called.

    Returns:
        `(stemmed_vocabulary, stemmed_docs)`: the list of kept stems in
        first-seen order, and one space-joined string of stems per document.

    Raises:
        IndexError: if the input file has no rows.
        ValueError: if `col_name` is not in the header row.
    """
    if porter_stemmer is None:
        porter_stemmer = PorterStemmer()

    # newline='' is what the csv module expects for files handed to csv.reader
    with open(csv_file, 'r', newline='') as f:
        lines = list(csv.reader(f))

    doc_col_index = lines[0].index(col_name)
    docs = [line[doc_col_index] for line in lines[1:]]

    lower_lim = 5
    upper_lim = int(len(docs) * 0.85)

    stemmed_docs = []
    vocabulary = {}

    for doc in docs:
        # Keep alphabetic tokens only, lower-cased, then stem them
        tokens = [token.lower() for token in word_tokenize(doc) if token.isalpha()]
        stemmed_tokens = [porter_stemmer.stem(token) for token in tokens]

        stemmed_docs.append(' '.join(stemmed_tokens))

        for stem in stemmed_tokens:
            vocabulary[stem] = vocabulary.get(stem, 0) + 1

    # Keep only the stems whose count lies within [lower_lim, upper_lim]
    stemmed_vocabulary = [stem for stem, count in vocabulary.items() if lower_lim <= count <= upper_lim]

    return stemmed_vocabulary, stemmed_docs

In [None]:
def stem_queries(csv_file, porter_stemmer, column_name='query_text'):
    """Tokenize and stem the queries stored in one column of a CSV file.

    Each query is tokenized with `word_tokenize`; alphabetic tokens are
    lower-cased, stemmed and joined back with single spaces.

    Args:
        csv_file: path of the queries CSV; its first row must be a header.
        porter_stemmer: object with a `stem(token)` method.
        column_name: header name of the query column (defaults to
            'query_text', the column this function previously hard-coded).

    Returns:
        List with one stemmed query string per data row, in file order.

    Raises:
        IndexError: if the input file has no rows.
        ValueError: if `column_name` is not in the header row.
    """
    # newline='' is what the csv module expects for files handed to csv.reader
    with open(csv_file, 'r', newline='') as f:
        lines = list(csv.reader(f))

    query_col_index = lines[0].index(column_name)
    queries = [line[query_col_index] for line in lines[1:]]

    stemmed_queries = []

    # use nltk to tokenize the queries
    for query in queries:
        tokens = [token.lower() for token in word_tokenize(query) if token.isalpha()]

        # stem the tokens
        stemmed_tokens = [porter_stemmer.stem(token) for token in tokens]

        stemmed_queries.append(' '.join(stemmed_tokens))

    return stemmed_queries

In [None]:
def get_texts(csv_file, id_col, text_col):
    """Read the id column and the text column of a CSV file.

    Args:
        csv_file: path of the input CSV; its first row must be a header.
        id_col: header name of the id column.
        text_col: header name of the text column.

    Returns:
        `(ids, texts)`: two lists with one entry per data row, in file order.

    Raises:
        IndexError: if the input file has no rows.
        ValueError: if either column name is not in the header row.
    """
    # newline='' is what the csv module expects, so newlines embedded in
    # quoted text fields are returned unchanged.
    with open(csv_file, 'r', newline='') as f:
        lines = list(csv.reader(f))

    id_col_index = lines[0].index(id_col)
    text_col_index = lines[0].index(text_col)

    ids = [line[id_col_index] for line in lines[1:]]
    texts = [line[text_col_index] for line in lines[1:]]

    return ids, texts

In [None]:
def get_relevant_docs(qdrels):
    """Group the document ids of a qrels CSV file by query id.

    The first row is skipped as a header. In every other row, column 1 is read
    as the query id and column 2 as the document id.

    Args:
        qdrels: path of the qrels CSV file.

    Returns:
        Dict mapping each query id to the list of its document ids, in file order.

    Raises:
        IndexError: if a data row has fewer than three columns.
    """
    relevant_docs = {}

    # newline='' is what the csv module expects for files handed to csv.reader
    with open(qdrels, 'r', newline='') as f:
        lines = list(csv.reader(f))

    for line in lines[1:]:
        query_id = line[1]
        doc_id = line[2]

        relevant_docs.setdefault(query_id, []).append(doc_id)

    return relevant_docs

In [None]:
def precision_k_score(query_id_idx_map, relevant_docs, top_docs, k):
    """Average precision@k over all queries that have relevance judgements.

    Args:
        query_id_idx_map: dict mapping a query id to its row index in `top_docs`.
        relevant_docs: dict mapping a query id to the list of its relevant doc ids.
        top_docs: per-query lists of retrieved doc ids, indexed by query row.
            Every entry of a query's list is checked, so each list is expected
            to hold the k retrieved documents.
        k: cut-off used as the precision denominator.

    Returns:
        Mean over the queries in `relevant_docs` of (relevant retrieved docs / k),
        or 0.0 when `relevant_docs` is empty.

    Raises:
        KeyError: if a query id in `relevant_docs` is missing from `query_id_idx_map`.
    """
    # Without any judged query there is nothing to average; avoid dividing by zero
    if not relevant_docs:
        return 0.0

    precision_k = 0

    for query_id, relevant_docs_list in relevant_docs.items():
        retrieved = top_docs[query_id_idx_map[query_id]]

        # Retrieved documents that are judged relevant for this query
        true_positives = sum(1 for doc in retrieved if doc in relevant_docs_list)

        precision_k += true_positives / k

    return precision_k / len(relevant_docs)

## Main

Change the CSV file locations below as needed if the notebook is run locally rather than on Colab.

In [None]:
# Folder holding the input CSVs and every intermediate CSV written by this notebook
DATA_DIR = "/content/Query_Doc"

# Raw inputs
docs_file = f"{DATA_DIR}/docs.csv"
queries_file = f"{DATA_DIR}/queries.csv"

# Output of the pre-processing step
pre_processed_docs_file = f"{DATA_DIR}/pre_processed_docs.csv"
pre_processed_queries_file = f"{DATA_DIR}/pre_processed_queries.csv"

# Output of the spell-correction step
spell_checked_docs_file = f"{DATA_DIR}/spell_checked_docs.csv"
spell_checked_queries_file = f"{DATA_DIR}/spell_checked_queries.csv"

# Query/document relevance judgements
qdrels_file = f"{DATA_DIR}/qdrel.csv"

Pre processing

In [None]:
# Keep only alphanumerics and whitespace in the text columns; the cleaned copies go to new CSV files
pre_process(docs_file, pre_processed_docs_file, "doc_text")
pre_process(queries_file, pre_processed_queries_file, "query_text")

Spell correction on the pre processed docs and queries

In [None]:
# Spell-correct the pre-processed files. Original/corrected pairs are printed
# for the queries only (should_log=True); the documents are corrected silently.
correct_spellings(pre_processed_queries_file, spell_checked_queries_file, "query_text", True)
correct_spellings(pre_processed_docs_file, spell_checked_docs_file, "doc_text")

Tokenize the documents and get the tokens forming the vocabulary

In [None]:
# Remove contextualSpellCheck, lemmatizer, ner and tagger from the nlp pipe since we are checking performance after only spell check
for pipe_name in ('contextual spellchecker', 'tagger', 'lemmatizer', 'ner'):
    # Guarded so the cell can be re-run: remove_pipe raises once a component is already gone
    if nlp.has_pipe(pipe_name):
        nlp.remove_pipe(pipe_name)

In [None]:
# Build the frequency-filtered vocabulary from the spell-checked documents
vocabulary = get_vocabulary(nlp, spell_checked_docs_file, "doc_text")

In [None]:
# Inspect the filtered vocabulary and its size
print(vocabulary)
print(len(vocabulary))

In [None]:
# Ids and texts of the spell-checked documents and queries, in file order
doc_ids, docs = get_texts(spell_checked_docs_file, 'doc_id', 'doc_text')
query_ids, queries = get_texts(spell_checked_queries_file, 'query_id', 'query_text')

# Position of every id inside its list, used to find the matching row later on
query_id_idx_map = {query_id: position for position, query_id in enumerate(query_ids)}
doc_id_idx_map = {doc_id: position for position, doc_id in enumerate(doc_ids)}

In [None]:
# Map each query id to the list of document ids listed for it in the qrels file
relevant_docs = get_relevant_docs(qdrels_file)

In [None]:
# Show which row index each query id maps to
print(query_id_idx_map)

Vectorize documents and queries, and find cosine similarities for each query with the documents

In [None]:
# Restrict the TF-IDF features to the filtered vocabulary built above
vectorizer = TfidfVectorizer(vocabulary=vocabulary)

# Create tf idf vectors for the documents and queries
# (the vectorizer is fitted on the documents only; queries are transformed with that fit)
doc_vectors = vectorizer.fit_transform(docs)
query_vectors = vectorizer.transform(queries)

In [None]:
# Cosine similarity of every query vector against every document vector
# (rows follow `queries`, columns follow `docs`)
similarities = cosine_similarity(query_vectors, doc_vectors)

In [None]:
# find top 5 and top 10 similar document indices for each query
top_5_indices = np.argsort(similarities, axis=1)[:, -5:]
top_10_indices = np.argsort(similarities, axis=1)[:, -10:]

# for each query, store the top 5 and top 10 similar document ids
top_5_ids = [[doc_ids[i] for i in indices] for indices in top_5_indices]
top_10_ids = [[doc_ids[i] for i in indices] for indices in top_10_indices]
top_ids = [[i[-1]] for i in top_5_ids]

In [None]:
# Average precision@k over the queries present in the qrels file, for k = 1, 5 and 10
prec_1 = precision_k_score(query_id_idx_map, relevant_docs, top_ids, 1)
prec_5 = precision_k_score(query_id_idx_map, relevant_docs, top_5_ids, 5)
prec_10 = precision_k_score(query_id_idx_map, relevant_docs, top_10_ids, 10)

# Report the scores as percentages rounded to two decimals
print(f"Average precision at 1: {round(prec_1 * 100, 2)} %")
print(f"Average precision at 5: {round(prec_5 * 100, 2)} %")
print(f"Average precision at 10: {round(prec_10 * 100, 2)} %")

## Task 2

### Stemming

Using nltk's porter stemmer here since spacy does not provide a stemmer

In [None]:
# Porter stemmer instance: read as a global by get_stemmed_vocabulary() and passed to stem_queries()
ps = PorterStemmer()

In [None]:
# Stem the spell-checked documents and build the frequency-filtered stem vocabulary
stemmed_vocabulary, stemmed_docs = get_stemmed_vocabulary(spell_checked_docs_file, "doc_text")

# Inspect the stem vocabulary and its size
print(stemmed_vocabulary)
print(len(stemmed_vocabulary))

# Preview the first ten stemmed documents, one per line
print('\n'.join(stemmed_docs[0:10]))

In [None]:
# Stem the spell-checked queries with the same stemmer used for the documents
stemmed_queries = stem_queries(spell_checked_queries_file, ps)

# Preview the first ten stemmed queries, one per line
print('\n'.join(stemmed_queries[0:10]))

Vectorize the documents and queries, and find cosine similarities for each query with documents

In [None]:
# Restrict the TF-IDF features to the filtered stem vocabulary
vectorizer_stemmed = TfidfVectorizer(vocabulary=stemmed_vocabulary)

# Create tf idf vectors for the documents and queries
# (the vectorizer is fitted on the stemmed documents only; queries are transformed with that fit)
stemmed_doc_vectors = vectorizer_stemmed.fit_transform(stemmed_docs)
stemmed_query_vectors = vectorizer_stemmed.transform(stemmed_queries)

In [None]:
# Cosine similarity of every stemmed query against every stemmed document
stemmed_similarities = cosine_similarity(stemmed_query_vectors, stemmed_doc_vectors)

# Rank the documents of every query once. argsort is ascending, so the most
# similar documents sit at the end of each row.
stemmed_ranked_indices = np.argsort(stemmed_similarities, axis=1)
stemmed_top_5_indices = stemmed_ranked_indices[:, -5:]
stemmed_top_10_indices = stemmed_ranked_indices[:, -10:]

# Translate the ranked indices of every query back into document ids
stemmed_top_5_ids = [[doc_ids[idx] for idx in row] for row in stemmed_top_5_indices]
stemmed_top_10_ids = [[doc_ids[idx] for idx in row] for row in stemmed_top_10_indices]
# The last entry of each top-5 list is the single most similar document
stemmed_top_ids = [[best_five[-1]] for best_five in stemmed_top_5_ids]

Precision Scores

In [None]:
# Average precision@k for the stemmed representation, for k = 1, 5 and 10
prec_1_stemmed = precision_k_score(query_id_idx_map, relevant_docs, stemmed_top_ids, 1)
prec_5_stemmed = precision_k_score(query_id_idx_map, relevant_docs, stemmed_top_5_ids, 5)
prec_10_stemmed = precision_k_score(query_id_idx_map, relevant_docs, stemmed_top_10_ids, 10)

# Report the scores as percentages rounded to two decimals
print(f"Average precision at 1 for stemmed: {round(prec_1_stemmed * 100, 2)} %")
print(f"Average precision at 5 for stemmed: {round(prec_5_stemmed * 100, 2)} %")
print(f"Average precision at 10 for stemmed: {round(prec_10_stemmed * 100, 2)} %")

### Lemmatization

Add lemmatizer back to the nlp pipe

In [None]:
# Re-add a lemmatizer component to the pipeline (the original one was removed earlier in this notebook)
# NOTE(review): the 'tagger' pipe was removed as well - confirm the lemmatizer still
# receives the token information it needs to produce useful lemmas.
lemmatizer = nlp.add_pipe('lemmatizer')
# NOTE(review): presumably loads the lemmatizer's lookup tables - confirm they are available in this environment
lemmatizer.initialize()

In [None]:
# Both helpers call `model(text)` with a raw string. A pipeline component such as
# `lemmatizer` works on already-tokenized Doc objects, so the full `nlp` pipeline
# (which now contains the lemmatizer) is passed instead of the bare component.
# NOTE(review): get_vocabulary() collects `token.text`, so this vocabulary does not
# reflect the lemmas yet - confirm whether `token.lemma_` should be used there.
lemmatized_vocabulary = get_vocabulary(nlp, spell_checked_docs_file, "doc_text")
lemmatized_docs = get_processed_text(nlp, spell_checked_docs_file, "doc_text")
lemmatized_queries = get_processed_text(nlp, spell_checked_queries_file, "query_text")

# Preview the first ten entries of each result
print(lemmatized_docs[0:10])
print(lemmatized_queries[0:10])
