***ANSWER 1***

# New Section

In [3]:
!ls -la /content/drive


total 12
drwxr-xr-x 3 root root 4096 Feb  9 14:08 .
drwxr-xr-x 1 root root 4096 Feb  9 14:08 ..
drwxr-xr-x 3 root root 4096 Feb  9 14:08 MyDrive


In [1]:
from google.colab import drive
drive.mount('/content/drive')
drive.mount('/content/drive', force_remount=True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Mounted at /content/drive


In [5]:
import os
import random
from nltk import download
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Configuration for directories
src_dir = '/content/drive/My Drive/IR/text_files'
dest_dir = '/content/drive/MyDrive/IR/processed_files'

# Ensure necessary NLTK resources
def setup_nltk():
    download('punkt')
    download('stopwords')

# Clean and transform text
def transform_content(raw_text):
    # Convert to lowercase
    lower_text = raw_text.lower()
    # Tokenize
    words = word_tokenize(lower_text)
    # Filter out stopwords and punctuation
    filtered_words = [w for w in words if w not in set(stopwords.words('english')) | set(string.punctuation)]

    # Combine back to string
    return ' '.join(filtered_words)

# Function to print file content
def print_file_content(file_path, label):
    """Print the entire content of a file with a label."""
    with open(file_path, 'r') as file:
        content = file.read()
    print(f"--- {label} ---\n{content}")

# Process individual files
def handle_file(input_path, output_path):
    with open(input_path, 'r') as input_file:
        content = input_file.read()
        cleaned_content = transform_content(content)

    with open(output_path, 'w') as output_file:
        output_file.write(cleaned_content)

    # Print file content before and after processing
    print_file_content(input_path, "Before Processing")
    print_file_content(output_path, "After Processing")
    print("\n" + "="*60 + "\n")  # Add separator

# Main function to iterate through all files
def process_files(folder_source, folder_destination):
    setup_nltk()
    files = [filename for filename in os.listdir(folder_source) if filename.endswith('.txt')]
    selected_files = random.sample(files, min(len(files), 5))  # Select 5 random files

    for filename in selected_files:
        src_file_path = os.path.join(folder_source, filename)
        dest_file_path = os.path.join(folder_destination, filename[:-4] + '_cleaned.txt')
        handle_file(src_file_path, dest_file_path)

# Execute processing
if __name__ == "__main__":
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    process_files(src_dir, dest_dir)


--- Before Processing ---
I'm a long time musician, and a long time user of Ernie Ball Strings (on electric guitars). Unfortunately, I can't endorse these strings, even though they sound great and the light gauge saves my fingers. The problem is that they break, in my humble opinion, excessively easily. I put a set on and broke a g string within 2 days of moderate use. It snapped near the saddle, so I didn't think much of it. I replaced the set, and within a few more days I noticed that the wrapping on the g string was broken and becoming unraveled near my third fret. I bought 7 sets, so I still have 5 more sets to go through, but honestly I'll probably go back to elixirs when these are gone, since they seem to last longer.

*update*
I purchased these strings on 12/19. It' now 1/18, and I've broken 3 strings from 3 different packs.
At this point, I really hate these strings.
--- After Processing ---
'm long time musician long time user ernie ball strings electric guitars unfortunately 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


***ANSWER 2***

In [25]:
import os
import pickle
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

class QueryProcessor:
    def __init__(self, index_obj):
        self.index = index_obj

    def process_query(self, terms, operators):
        if not terms or not operators or not self.index:
            return set()

        results = set(self.index.get(terms[0], []))
        for i, operator in enumerate(operators):
            if i + 1 < len(terms):
                next_terms = terms[i + 1]
                postings = self.index.get(next_terms, [])
                if operator == 'AND':
                    results &= set(postings)
                elif operator == 'OR':
                    results |= set(postings)
                elif operator == 'AND NOT':
                    results -= set(postings)
                elif operator == 'OR NOT':
                    results = results.union(set(self.index.keys()) - postings)
        return results

    def execute_queries(self, num_queries, queries):
        print("\n" + "="*100 + "\n")  # Add separator

        print("b. Output:")
        for i in range(num_queries):
            query_input = queries[i][0]
            operator_input = queries[i][1]
            terms = preprocess_query(query_input)
            operators = operator_input.split(',')

            if len(terms) != len(operators) + 1:
                print(f"Error: Number of operations should be one less than the number of terms in the query inputted.")
                continue

            query_display = ' '.join([f"{term} {op}" for term, op in zip(terms[:-1], operators)])
            query_display += f" {terms[-1]}"  # Add the last term separately

            results = self.process_query(terms, operators)

            print(f"Query{i+1}: {query_display}")
            print(f"Number of documents retrieved for query {i+1}: {len(results)}")
            print(f"Names of the documents retrieved for query {i+1}: {', '.join(results) if results else 'None'}")



class InvertedIndex:
    def __init__(self, data_dir):
        self.data_dir = data_dir
        self.idx = self.build_idx()

    def build_idx(self):
        idx = {}
        for file in os.listdir(self.data_dir):
            if file.endswith(".txt"):
                path = os.path.join(self.data_dir, file)
                with open(path, 'r', encoding='utf-8') as content:
                    for term in content.read().split():
                        if term not in idx:
                            idx[term] = [file]
                        elif file not in idx[term]:
                            idx[term].append(file)
        return idx

    def save_index(self, filepath):
        with open(filepath, 'wb') as file:
            pickle.dump(self.idx, file)

    @staticmethod
    def load_index(filepath):
        with open(filepath, 'rb') as file:
            return pickle.load(file)


def preprocess_query(text):
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
    return tokens


def main():
    nltk.download('punkt')
    nltk.download('stopwords')

    src_dir = '/content/drive/My Drive/IR/text_files'
    dest_dir = '/content/drive/MyDrive/IR/processed_files'

    index = InvertedIndex(dest_dir)
    index_file = 'index_file.pkl'
    index.save_index(index_file)

    loaded_index = InvertedIndex.load_index(index_file)

    processor = QueryProcessor(loaded_index)

    print("a. Input:")
    num_queries = int(input())
    queries = []
    for i in range(num_queries):
        query_input = input( )
        operator_input = input()
        queries.append((query_input, operator_input))


    processor.execute_queries(num_queries, queries)



if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


a. Input:
2
Car bag in a canister
OR, AND NOT
Coffee brewing techniques in cookbook
AND, OR NOT, OR


b. Output:
Query1: car OR bag  AND NOT canister
Number of documents retrieved for query 1: 31
Names of the documents retrieved for query 1: file930_cleaned.txt, file404_cleaned.txt, file118_cleaned.txt, file459_cleaned.txt, file860_cleaned.txt, file864_cleaned.txt, file313_cleaned.txt, file738_cleaned.txt, file73_cleaned.txt, file797_cleaned.txt, file166_cleaned.txt, file746_cleaned.txt, file466_cleaned.txt, file264_cleaned.txt, file942_cleaned.txt, file699_cleaned.txt, file981_cleaned.txt, file886_cleaned.txt, file698_cleaned.txt, file686_cleaned.txt, file174_cleaned.txt, file3_cleaned.txt, file542_cleaned.txt, file863_cleaned.txt, file573_cleaned.txt, file682_cleaned.txt, file956_cleaned.txt, file363_cleaned.txt, file780_cleaned.txt, file665_cleaned.txt, file892_cleaned.txt
Query2: coffee AND brewing  OR NOT techniques  OR cookbook
Number of documents retrieved for query 2: 0
Names o

***ANSWER 3**

In [15]:
import os
import pickle

class QueryPositionalIndex:
    def __init__(self, directory_path):
        self.dir_path = directory_path
        self.idx = self.build_index()

    def build_index(self):
        idx = {}
        for filename in os.listdir(self.dir_path):
            if filename.endswith(".txt"):
                file_path = os.path.join(self.dir_path, filename)
                with open(file_path, 'r', encoding='utf-8') as content:
                    for position, term in enumerate(content.read().split()):
                        postings = idx.setdefault(term, {})
                        positions = postings.setdefault(filename, [])
                        positions.append(position)
        return idx

    def save(self, file_path):
        with open(file_path, 'wb') as file:
            pickle.dump(self.idx, file)

    @staticmethod
    def load(file_path):
        with open(file_path, 'rb') as file:
            return pickle.load(file)

class PhraseQueryProcessor:
    def __init__(self, index):
        self.index = index

    def find_docs(self, phrase):
        terms = preprocess_text(phrase)
        if not terms:
            return set()

        postings_lists = [self.index.get(term, {}) for term in terms]
        if not all(postings_lists):
            return set()

        common_docs = set(postings_lists[0].keys())
        for postings in postings_lists:
            common_docs &= postings.keys()

        return common_docs

# Redundant function for preprocessing text
def preprocess_text(text):
    return text.lower().split()

# Initialize and save positional index
processed_files_folder = '/content/drive/My Drive/IR/processed_files'
pos_index = QueryPositionalIndex(processed_files_folder)

index_file = 'positional_index.pkl'
pos_index.save(index_file)
loaded_index = QueryPositionalIndex.load(index_file)

processor = PhraseQueryProcessor(loaded_index)

# Read number of queries
print("a. Input")
num_queries = int(input("Enter the number of queries: "))
queries = []
for i in range(num_queries):
    query_input = input(f"Enter phrase query {i+1}: ")
    queries.append(query_input)
print("\n")

# Process queries
print("b. Output")
for i, query_input in enumerate(queries):
    result_docs = processor.find_docs(query_input)

    print(f"Number of documents retrieved for query {i+1} using positional index: {len(result_docs)}")
    print(f"Names of documents retrieved for query {i+1} using positional index: {', '.join(result_docs) if result_docs else 'None'}")
    print("\n")


a. Input
Enter the number of queries: 2
Enter phrase query 1: Car bag in a canister
Enter phrase query 2: Coffee brewing techniques in cookbook


b. Output
Number of documents retrieved for query 1 using positional index: 0
Names of documents retrieved for query 1 using positional index: None


Number of documents retrieved for query 2 using positional index: 0
Names of documents retrieved for query 2 using positional index: None


