## Installations

In [None]:
# Notebook shell commands: install spaCy, download the en_core_web_sm model
# loaded further down, and install the contextualSpellCheck package.
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install contextualSpellCheck

## Imports

import csv
from pathlib import Path

import contextualSpellCheck
import spacy

## Models

# Load the spaCy pipeline once at module level; spell_check() reuses this
# `nlp` object for every call.
nlp = spacy.load("en_core_web_sm")
# Attach contextualSpellCheck to the pipeline so that processed docs carry the
# `performed_spellCheck` / `outcome_spellCheck` extensions read by spell_check().
contextualSpellCheck.add_to_pipe(nlp)

## Function Definitions

In [None]:
def spell_check(text):
    """Run ``text`` through the module-level ``nlp`` pipeline.

    Returns:
        A ``(performed, outcome)`` tuple taken from the processed doc's
        ``performed_spellCheck`` and ``outcome_spellCheck`` extension
        attributes, in that order.
    """
    extensions = nlp(text)._
    performed = extensions.performed_spellCheck
    outcome = extensions.outcome_spellCheck
    return performed, outcome

In [None]:
def correct_spellings(csv_file, column_name, should_log=False):
    """Spell check one column of a CSV file and write the result to a new file.

    The first row is treated as the header and is used to locate
    ``column_name``. Every following row has that column passed through
    ``spell_check``; cells for which a correction was performed are replaced
    by the corrected text, all other cells are left untouched. The rows are
    then written to ``"spell_checked_" + <file name>`` in the same directory
    as ``csv_file``.

    Args:
        csv_file: Path of the CSV file to read.
        column_name: Header of the column to spell check.
        should_log: When true, print the original and the corrected text of
            every cell that was corrected.

    Raises:
        IndexError: If the file has no rows, or a data row is shorter than
            the position of ``column_name`` in the header.
        ValueError: If ``column_name`` is not present in the header row.
    """
    source_path = Path(csv_file)

    # The csv module expects files opened with newline="" so that it can do
    # its own newline handling (embedded newlines, no doubled "\r" on Windows).
    with open(source_path, "r", newline="") as f:
        rows = list(csv.reader(f))

    # Index of the column to be spell checked
    column_index = rows[0].index(column_name)

    # Skip the header row; rows are mutated in place.
    for row in rows[1:]:
        original = row[column_index]
        performed, outcome = spell_check(original)

        # Only overwrite the cell when a correction was actually made.
        # contextualSpellCheck documents outcome_spellCheck as defaulting to
        # an empty string when nothing was corrected, so assigning it
        # unconditionally would blank every correctly spelled cell.
        if not performed:
            continue

        if should_log:
            print(original)
            print(outcome)
            print()

        row[column_index] = outcome

    # Same directory, "spell_checked_" prefix. with_name() also handles a bare
    # file name (no directory part), which naive '/'-splitting would turn
    # into a path in the filesystem root.
    target_path = source_path.with_name("spell_checked_" + source_path.name)

    with open(target_path, "w", newline="") as f:
        csv.writer(f).writerows(rows)

    print("Spell check completed")

In [None]:
def pre_process(csv_file, column_name):
    """Strip non-alphanumeric, non-whitespace characters from one CSV column.

    The first row is treated as the header and is used to locate
    ``column_name``. In every following row, that column keeps only the
    characters for which ``str.isalnum()`` or ``str.isspace()`` is true. The
    rows are then written to ``"pre_processed_" + <file name>`` in the same
    directory as ``csv_file``.

    Args:
        csv_file: Path of the CSV file to read.
        column_name: Header of the column to clean.

    Raises:
        IndexError: If the file has no rows, or a data row is shorter than
            the position of ``column_name`` in the header.
        ValueError: If ``column_name`` is not present in the header row.
    """
    source_path = Path(csv_file)

    # The csv module expects files opened with newline="" so that it can do
    # its own newline handling (embedded newlines, no doubled "\r" on Windows).
    with open(source_path, "r", newline="") as f:
        rows = list(csv.reader(f))

    # Index of the column to be pre processed
    column_index = rows[0].index(column_name)

    # Skip the header row; rows are mutated in place.
    for row in rows[1:]:
        row[column_index] = "".join(
            ch for ch in row[column_index] if ch.isalnum() or ch.isspace()
        )

    # Same directory, "pre_processed_" prefix. with_name() also handles a bare
    # file name (no directory part), which naive '/'-splitting would turn
    # into a path in the filesystem root.
    target_path = source_path.with_name("pre_processed_" + source_path.name)

    with open(target_path, "w", newline="") as f:
        csv.writer(f).writerows(rows)

    print("Pre procesing completed")

## Main

Pre-process the CSV files for docs and queries, then spell-check the pre-processed output.

If you run this notebook locally, change the CSV file paths below to match your environment.

In [None]:
# Input CSVs, each followed by the "pre_processed_" file that pre_process()
# writes next to it.
docs_file = "/content/Query_Doc/docs.csv"
pre_processed_docs_file = "/content/Query_Doc/pre_processed_docs.csv"
queries_file = "/content/Query_Doc/queries.csv"
pre_processed_queries_file = "/content/Query_Doc/pre_processed_queries.csv"

# Step 1: clean the text column of each input file.
pre_process(csv_file=docs_file, column_name="doc_text")
pre_process(csv_file=queries_file, column_name="query_text")

# Step 2: spell check the cleaned files; corrections are printed for queries only.
correct_spellings(csv_file=pre_processed_docs_file, column_name="doc_text")
correct_spellings(
    csv_file=pre_processed_queries_file,
    column_name="query_text",
    should_log=True,
)