In [26]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


QUESTION 1 DATA PREPROCESSING

In [21]:
import os
import string
import nltk
import random
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Ensure NLTK resources are downloaded (do this once)
nltk.download('punkt')
nltk.download('stopwords')

# Path to the dataset folder on Google Drive
dataset_path = '/content/drive/My Drive/IR ASSIGNMENT/text_files'
preprocessed_path = '/content/drive/My Drive/IR ASSIGNMENT/preprocessed_text_files'

# Create a directory for preprocessed files if it doesn't exist
if not os.path.exists(preprocessed_path):
    os.makedirs(preprocessed_path)

# Function to preprocess text
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Remove punctuation
    tokens = [word for word in tokens if word.isalpha()]
    return ' '.join(tokens)

# Preprocess all files
all_files = os.listdir(dataset_path)
preprocessed_files = []

for filename in all_files:
    file_path = os.path.join(dataset_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        original_text = file.read()

        preprocessed_text = preprocess_text(original_text)

        # Save the preprocessed text
        preprocessed_file_path = os.path.join(preprocessed_path, filename)
        with open(preprocessed_file_path, 'w', encoding='utf-8') as preprocessed_file:
            preprocessed_file.write(preprocessed_text)
            preprocessed_files.append(preprocessed_file_path)

print("Preprocessing complete for all files.")

# Randomly select 5 preprocessed files and print their contents
selected_files = random.sample(preprocessed_files, 5)

for file_path in selected_files:
    filename = os.path.basename(file_path)
    with open(file_path, 'r', encoding='utf-8') as file:
        preprocessed_text = file.read()
        print(f"Preprocessed text of {filename}:\n{preprocessed_text}\n")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preprocessing complete for all files.
Preprocessed text of file194.txt:
superb feel sound tips wah success rock pedal way forward match volume knob bass crank q love life seriously range pedal hits q amazing love expressive tone get going think retains low end adds creamy wah highs mids listen claim lose lows put bluntly needed board used delays octave overdrives clean never found sound like could use buy

Preprocessed text of file854.txt:
great les paul like guitar novice player minimum setup needed string buzzed little everyone loves look fun play affordable price

Preprocessed text of file329.txt:
design tool great overall like flexibility job well going hear less sibilance vocals recorded definitely recommend every artist mixing engineer get clean quality recording cheap price big problem see pop filter studios days worry expensive advanced time get job done equipment keep project wo let

Preprocessed text of file399.txt:
guitar amazing value usa pups chunky neck fast fretboard rou

QUESTION 2 Unigram Inverted Index

In [31]:
import os
import string
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_document(text):
    stop_words_set = set(stopwords.words('english'))
    lowercase_text = text.lower()
    no_punctuation_text = lowercase_text.translate(str.maketrans('', '', string.punctuation))
    tokenized_words = word_tokenize(no_punctuation_text)
    filtered_words = [word for word in tokenized_words if word not in stop_words_set]
    return filtered_words

def build_inverted_index(directory_path):
    index = {}
    for file_name in os.listdir(directory_path):
        if file_name.endswith(".txt"):
            full_path = os.path.join(directory_path, file_name)
            with open(full_path, 'r', encoding='utf-8') as document:
                document_content = document.read()
                document_words = preprocess_document(document_content)
                for word in document_words:
                    if word not in index:
                        index[word] = set()
                    index[word].add(file_name)
    return index

def perform_query_operation(doc_set_1, doc_set_2, op):
    if op == 'AND':
        return doc_set_1.intersection(doc_set_2)
    elif op == 'OR':
        return doc_set_1.union(doc_set_2)
    elif op == 'AND NOT':
        return doc_set_1 - doc_set_2
    elif op == 'OR NOT':
        return doc_set_1 - doc_set_2
    else:
        raise ValueError(f"Unknown operation: {op}")

def persist_inverted_index(index, path_to_save):
    with open(path_to_save, 'wb') as index_file:
        pickle.dump(index, index_file)

def retrieve_inverted_index(path_to_load):
    with open(path_to_load, 'rb') as index_file:
        return pickle.load(index_file)

def handle_query(query_tokens, query_ops, index):
    query_result = set()
    if query_tokens:
        initial_docs = index.get(query_tokens[0], set())
        for operation_index, operation in enumerate(query_ops):
            if operation_index < len(query_tokens) - 1:
                subsequent_docs = index.get(query_tokens[operation_index + 1], set())
                initial_docs = perform_query_operation(initial_docs, subsequent_docs, operation.strip())
        query_result = initial_docs
    return query_result

def query_system():
    index_path = '/content/drive/My Drive/IR ASSIGNMENT/inverted_index.pkl'
    if os.path.exists(index_path):
        doc_index = retrieve_inverted_index(index_path)
    else:
        docs_directory = '/content/drive/My Drive/IR ASSIGNMENT/preprocessed_text_files'
        doc_index = build_inverted_index(docs_directory)
        persist_inverted_index(doc_index, index_path)

    query_count = int(input("Enter the number of Queries: "))
    for i in range(1, query_count + 1):
        user_query = input(f"Enter query {i}: ")
        query_operations = input("Enter operations separated by comma for Query: ").split(',')
        query_words = preprocess_document(user_query)
        result_docs = handle_query(query_words, query_operations, doc_index)

        # Formatting and printing the query and results
        formatted_query = ' '.join([f"{query_words[j]} {op}" for j, op in enumerate(query_operations)] + [query_words[-1]])
        print(f"Query {i}: {formatted_query}")
        print(f"Number of documents retrieved for Query {i}: {len(result_docs)}")
        if len(result_docs) > 0:
            print(f"Names of the documents retrieved for Query {i}: {', '.join(sorted(result_docs))}")
        else:
            print("No documents retrieved.")

query_system()



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter the number of Queries: 2
Enter query 1: Car bag in a canister
Enter operations separated by comma for Query: OR, AND NOT
Query 1: car OR bag  AND NOT canister
Number of documents retrieved for Query 1: 31
Names of the documents retrieved for Query 1: file118.txt, file166.txt, file174.txt, file264.txt, file3.txt, file313.txt, file363.txt, file404.txt, file459.txt, file466.txt, file542.txt, file573.txt, file665.txt, file682.txt, file686.txt, file698.txt, file699.txt, file73.txt, file738.txt, file746.txt, file780.txt, file797.txt, file860.txt, file863.txt, file864.txt, file886.txt, file892.txt, file930.txt, file942.txt, file956.txt, file981.txt
Enter query 2: Coffee brewing techniques in cookbook
Enter operations separated by comma for Query: AND, OR NOT, OR
Query 2: coffee AND brewing  OR NOT techniques  OR cookbook
Number of documents retrieved for Query 2: 0
No documents retrieved.


Question 3

In [38]:
import os
import pickle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import nltk

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_document(document_text):
    stopwords_list = set(stopwords.words('english'))
    lowercase_text = document_text.lower()
    no_punctuation_text = lowercase_text.translate(str.maketrans('', '', string.punctuation))
    word_tokens = word_tokenize(no_punctuation_text)
    filtered_tokens = [word for word in word_tokens if word not in stopwords_list]
    return filtered_tokens

def persist_index(index, storage_path):
    with open(storage_path, 'wb') as index_file:
        pickle.dump(index, index_file)

def retrieve_index(index_path):
    with open(index_path, 'rb') as index_file:
        return pickle.load(index_file)

def handle_phrase_query(phrase, index):
    query_tokens = preprocess_document(phrase)
    if not query_tokens:
        return 0, []

    documents_with_phrase = set(index.get(query_tokens[0], {}).keys())
    for token in query_tokens[1:]:
        documents_with_phrase &= set(index.get(token, {}).keys())

    matched_documents = []
    for document in documents_with_phrase:
        token_positions = [index[token][document] for token in query_tokens if document in index[token]]
        for initial_position in token_positions[0]:
            if all((initial_position + offset in token_positions[offset]) for offset in range(1, len(query_tokens))):
                matched_documents.append(document)
                break

    return len(matched_documents), matched_documents

def build_positional_index(preprocessed_dir):
    positional_index = {}
    for file_name in os.listdir(preprocessed_dir):
        if file_name.endswith(".txt"):
            document_path = os.path.join(preprocessed_dir, file_name)
            with open(document_path, 'r', encoding='utf-8') as document_file:
                document_content = document_file.read()
                document_tokens = preprocess_document(document_content)
                for position, token in enumerate(document_tokens):
                    if token not in positional_index:
                        positional_index[token] = {}
                    if file_name not in positional_index[token]:
                        positional_index[token][file_name] = []
                    positional_index[token][file_name].append(position)
    return positional_index


def run_query_system():
    index_file_path = '/content/drive/My Drive/IR ASSIGNMENT/positional_index.pkl'
    documents_directory = '/content/drive/My Drive/IR ASSIGNMENT/preprocessed_text_files'

    if os.path.exists(index_file_path):
        document_index = retrieve_index(index_file_path)
    else:
        document_index = build_positional_index(documents_directory)
        persist_index(document_index, index_file_path)

    query_count = int(input("Enter the number of queries: "))
    for query_num in range(query_count):
        search_phrase = input(f"Enter phrase query {query_num+1}: ")
        documents_count, documents_list = handle_phrase_query(search_phrase, document_index)

        print(f"Number of documents retrieved for query {query_num+1} using positional index: {documents_count}")
        if documents_count > 0:
            print(f"Names of documents retrieved for query {query_num+1} using positional index: {', '.join(documents_list)}")
        else:
            print("No documents retrieved using positional index.")

if __name__ == "__main__":
    run_query_system()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter the number of queries: 3
Enter phrase query 1: it is a good in front for poutch
Number of documents retrieved for query 1 using positional index: 0
No documents retrieved using positional index.
Enter phrase query 2: it is good in reliable for fit
Number of documents retrieved for query 2 using positional index: 1
Names of documents retrieved for query 2 using positional index: file9.txt
Enter phrase query 3: it is a fit front poutch
Number of documents retrieved for query 3 using positional index: 1
Names of documents retrieved for query 3 using positional index: file9.txt
