# Constructing Positional Index

In [29]:
pip install python-docx pypdf2 pandas




In [38]:
import os
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from PyPDF2 import PdfReader
from docx import Document
import pandas as pd

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]  # Remove stopwords and punctuation
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]  # Lemmatization
    return lemmatized_tokens

def read_txt(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def read_pdf(file_path):
    text = ""
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        for page in reader.pages:
            text += page.extract_text()
    return text

def read_docx(file_path):
    doc = Document(file_path)
    return "\n".join([para.text for para in doc.paragraphs])

def read_xlsx(file_path):
    df = pd.read_excel(file_path, sheet_name=None)
    text = ""
    for sheet_name, sheet_df in df.items():
        text += sheet_df.to_string(index=False, header=False)
    return text

def preprocess_document(doc_path):
    _, ext = os.path.splitext(doc_path)
    if ext == '.txt':
        text = read_txt(doc_path)
    elif ext == '.pdf':
        text = read_pdf(doc_path)
    elif ext == '.docx':
        text = read_docx(doc_path)
    elif ext == '.xlsx':
        text = read_xlsx(doc_path)
    else:
        raise ValueError(f"Unsupported file type: {ext}")
    return preprocess_text(text)

def preprocess_documents(documents_path):
    preprocessed_docs = {}
    for filename in os.listdir(documents_path):
        if filename.endswith(('.txt', '.pdf', '.docx', '.xlsx')):
            doc_path = os.path.join(documents_path, filename)
            doc_id = os.path.basename(doc_path)
            preprocessed_docs[doc_id] = preprocess_document(doc_path)
    return preprocessed_docs

# Example usage
documents_path = '/content/Documents'  # Update this path to your directory
preprocessed_docs = preprocess_documents(documents_path)

for doc_id, tokens in preprocessed_docs.items():
    print(f"Preprocessed {doc_id}: {tokens}")

def construct_inverted_index(documents_path):
    inverted_index = {}
    doc_id_mapping = {}
    doc_id_counter = 0

    for filename in os.listdir(documents_path):
        if filename.endswith(('.txt', '.pdf', '.docx', '.xlsx')):
            doc_id_counter += 1
            doc_id = f"doc_{doc_id_counter}"
            doc_id_mapping[doc_id] = filename
            doc_path = os.path.join(documents_path, filename)
            tokens = preprocess_document(doc_path)

            for position, token in enumerate(tokens):
                if token not in inverted_index:
                    inverted_index[token] = {}
                if doc_id not in inverted_index[token]:
                    inverted_index[token][doc_id] = []
                inverted_index[token][doc_id].append(position)

    return inverted_index, doc_id_mapping

# Example usage
documents_path = '/content/Documents'  # Update this path to your directory
inverted_index, doc_id_mapping = construct_inverted_index(documents_path)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Preprocessed dd.pdf: ['big', 'data', 'system', 'session', '7-distributed', 'programming', 'janardhanan', 'p', 'janardhanan.ps', 'wilp.bits', '-pilani.ac.intopics', 'today', '•top', 'design', '•types', 'parallelism', '•mapreduce', 'programming', 'model', '•see', 'map', 'reduce', 'program', 'work', 'using', 'hadoop', '•iterative', 'mapreduce', '•hands', 'demo', 'k', '-means', 'clustering', 'using', 'iterative', 'mapreduce', '2top', 'design', '-sequential', 'context', '•in', 'context', 'sequential', 'program', '•divide', 'conquer', '•it', 'easier', 'divide', 'problem', 'sub', '-problems', 'execute', 'one', 'one', '•a', 'sub', '-problem', 'definition', 'may', 'left', 'programmer', 'sequential', 'programming', 'context', '3f1', 'f2', 'f3', 'f4', 'main', 'f5', 'top', 'design', '-parallel', 'context', '•in', 'context', 'parallel', 'program', 'w', 'e', 'decompose', 'problem', 'sub', '-problems', 'anyway', 'programmer', 'chooses', '•need', 'think', '•each', 'sub', '-problem', 'need', 'assigned'

# Boolean Query Function

In [39]:
def boolean_query(query, inverted_index, doc_id_mapping):
    query = query.lower()
    query_tokens = word_tokenize(query)
    result = None
    all_docs = set(doc_id_mapping.keys())

    i = 0
    while i < len(query_tokens):
        token = query_tokens[i]

        if token == 'not':
            i += 1
            next_token = query_tokens[i] if i < len(query_tokens) else None
            if next_token:
                result = result.difference(set(inverted_index.get(next_token, {}).keys())) if result is not None else all_docs.difference(set(inverted_index.get(next_token, {}).keys()))
        elif token in ('and', 'or'):
            i += 1
            continue
        else:
            if result is None:
                result = set(inverted_index.get(token, {}).keys())
            elif query_tokens[i-1] == 'and':
                result = result.intersection(set(inverted_index.get(token, {}).keys()))
            elif query_tokens[i-1] == 'or':
                result = result.union(set(inverted_index.get(token, {}).keys()))

        i += 1

    return list(result) if result is not None else []

# Query Function

In [40]:
# Construct inverted index
documents_path = '/content/Documents'
inverted_index, doc_id_mapping = construct_inverted_index(documents_path)
print (doc_id_mapping)

# Perform boolean queries
query1 = "context AND program"
query2 = "Health OR activity"
query3 = "NOT health"

result1 = boolean_query(query1, inverted_index, doc_id_mapping)
result2 = boolean_query(query2, inverted_index, doc_id_mapping)
result3 = boolean_query(query3, inverted_index, doc_id_mapping)

print(f"Results for query '{query1}': {[doc_id_mapping[doc_id] for doc_id in result1]}")
print(f"Results for query '{query2}': {[doc_id_mapping[doc_id] for doc_id in result2]}")
print(f"Results for query '{query3}': {[doc_id_mapping[doc_id] for doc_id in result3]}")


{'doc_1': 'dd.pdf', 'doc_2': 'ff.xlsx', 'doc_3': 'Upcoming 5G.docx', 'doc_4': 'cloud.txt'}
Results for query 'context AND program': ['dd.pdf']
Results for query 'Health OR activity': ['ff.xlsx', 'Upcoming 5G.docx', 'dd.pdf']
Results for query 'NOT health': ['cloud.txt', 'dd.pdf']


# Implementation and analysis of Context sensitive spell correction algorithm

In [41]:
from collections import defaultdict

def preprocess_text(text):
    """
    Preprocesses the input text: tokenization, lowercasing, removing stopwords and punctuation.
    Returns a list of preprocessed tokens.
    """
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]
    return tokens

def is_misspelled(word, vocabulary):
    """
    Checks if a word is likely misspelled based on vocabulary.
    """
    return word.lower() not in vocabulary

def generate_candidates(word, vocabulary):
    """
    Generates candidate corrections for a misspelled word.
    Uses edit distance and dictionary lookup.
    """
    # Generate candidates using edit distance
    candidates = []
    for vocab_word in vocabulary:
        if nltk.edit_distance(word, vocab_word) <= 1:
            candidates.append(vocab_word)

    # Add the original word as a candidate
    candidates.append(word)

    return candidates

def rank_candidates(original_word, candidates, context_tokens):
    """
    Ranks candidate corrections based on context.
    Enhanced ranking by considering POS tags and edit distance.
    """
    candidate_scores = defaultdict(int)

    # Count frequency of each candidate in the context tokens
    for candidate in candidates:
        candidate_scores[candidate] += context_tokens.count(candidate)

    # Rank candidates by frequency (higher frequency is better)
    ranked_candidates = sorted(candidates, key=lambda x: candidate_scores[x], reverse=True)

    # Return the best candidate (first in the list)
    return ranked_candidates[0]

def context_sensitive_spell_correction(text, vocabulary):
    """
    Performs context-sensitive spell correction on the input text.
    Returns the corrected text.
    """
    # Preprocess the text
    tokens = preprocess_text(text)

    # POS tagging
    tagged_tokens = nltk.pos_tag(tokens)

    # Context-sensitive correction logic goes here
    corrected_text = []
    for i, (token, tag) in enumerate(tagged_tokens):
        # Check if the token is likely misspelled
        if is_misspelled(token, vocabulary):
            # Generate candidate corrections
            candidates = generate_candidates(token, vocabulary)

            # Rank candidates based on context (for simplicity, consider only surrounding tokens)
            context_tokens = tokens[max(0, i-2):i] + tokens[i+1:min(len(tokens), i+3)]
            best_candidate = rank_candidates(token, candidates, context_tokens)

            # Replace token with best candidate
            corrected_text.append(best_candidate)
        else:
            corrected_text.append(token)

    # Join tokens back into corrected text
    corrected_text = ' '.join(corrected_text)
    return corrected_text

# Example vocabulary (can be expanded with a larger corpus)
vocabulary = {'this', 'business', 'analysis', 'important', 'for', 'making', 'decisions'}

# Example usage
input_text = "Ths busi analysi is importnt for makin decisins."
corrected_text = context_sensitive_spell_correction(input_text, vocabulary)
print("Original Text:", input_text)
print("Corrected Text:", corrected_text)


Original Text: Ths busi analysi is importnt for makin decisins.
Corrected Text: this busi analysis important making decisions
