In [10]:
import csv
import os
import re
import string
from collections import defaultdict

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

In [13]:
# Folder that holds the crawled page files to index.
# NOTE(review): this is an absolute, machine-specific path, while the meta-file
# cell reads from the relative 'crawled_pages' folder — confirm they are the same.
folder_path = r"E:\WIR CS-479\Assignments 2\crawled_pages"

# os.listdir returns entries in arbitrary order; sorting makes the order in
# which pages are indexed (and therefore the CSV row order) reproducible.
files = sorted(os.listdir(folder_path))

# Maps each term to {'frequency': count, 'posting_list': set of document IDs}.
# Entries are created on first access with a zero count and an empty set.
inverted_index = defaultdict(lambda: {'frequency': 0, 'posting_list': set()})

# Shared by preprocess_text: English stop words and the stemmer applied to tokens.
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

In [12]:
def preprocess_text(text):
    """Turn raw page text into an ordered list of unique stemmed terms.

    Steps: lowercase, strip digits, strip ASCII punctuation, tokenize, drop
    stop words and any token that is not purely alphabetic, stem, then remove
    duplicates while keeping first-seen order.

    Relies on the module-level `stop_words` set and `stemmer` object, and on
    the `string` module being imported in the notebook's import cell.

    Args:
        text: The text to process.

    Returns:
        A list of distinct stemmed tokens in order of first appearance.
    """
    # Convert string to lowercase
    text = text.lower()
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words and perform stemming. The text is already lowercase,
    # so tokens are compared to the stop-word set directly; fullmatch keeps
    # only tokens made up entirely of ASCII letters.
    meaningful_tokens = [
        stemmer.stem(token)
        for token in tokens
        if token not in stop_words and re.fullmatch(r'[a-zA-Z]+', token)
    ]
    # Remove duplicates while preserving order
    return list(dict.fromkeys(meaningful_tokens))

In [11]:
def update_index(doc_id, tokens):
    """Record the given tokens for one document in the global inverted index.

    For every item in `tokens`, the term's 'frequency' is incremented by one
    and `doc_id` is added to the term's 'posting_list' set.

    Args:
        doc_id: Identifier of the document the tokens came from.
        tokens: Iterable of terms to register.
    """
    for term in tokens:
        entry = inverted_index[term]
        entry['frequency'] += 1
        entry['posting_list'].add(doc_id)

def _posting_sort_key(doc_id):
    """Sort key: decimal IDs in numeric order first, anything else lexically after."""
    text = str(doc_id)
    if text.isdecimal():
        return (0, int(text), text)
    return (1, 0, text)


def write_to_csv(file_path):
    """Write the global inverted index to a CSV file.

    The file has the columns 'Keyword', 'Frequency' and 'Posting List'; the
    posting list is written as hyphen-separated document IDs.

    Posting lists are stored as sets, whose iteration order is not stable
    between runs, so the IDs are sorted before joining to keep the output
    reproducible.

    Args:
        file_path: Destination path of the CSV file (overwritten if present).
    """
    with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Keyword', 'Frequency', 'Posting List']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for keyword, info in inverted_index.items():
            ordered_ids = sorted(info['posting_list'], key=_posting_sort_key)
            writer.writerow({
                'Keyword': keyword,
                'Frequency': info['frequency'],
                'Posting List': '-'.join(map(str, ordered_ids)),
            })

In [7]:
# Build the inverted index from every crawled page, then persist it to CSV.
# Start from an empty index so re-running this cell does not double-count
# frequencies accumulated by a previous run.
inverted_index.clear()

for file_name in files:
    file_path = os.path.join(folder_path, file_name)
    if not os.path.isfile(file_path):
        # Directory entries cannot be read as pages.
        continue

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Extract unique ID and body text; a page lacking either marker is
    # reported and skipped instead of aborting the whole run.
    id_match = re.search(r'ID: (\d+)', content)
    body_match = re.search(r'Body:\s+(.*)', content, re.DOTALL)
    if id_match is None or body_match is None:
        print(f"Skipping {file_name}: missing 'ID:' or 'Body:' section")
        continue

    doc_id = id_match.group(1)
    body_text = body_match.group(1)
    # Preprocess text
    tokens = preprocess_text(body_text)
    # Update inverted index
    update_index(doc_id, tokens)

# Write inverted index to CSV
write_to_csv('inverted_index.csv')

In [17]:
# Lookup table: keyword -> list of document IDs (as strings) from the index CSV.
keyword_docs = {}

# Populate the table from the inverted index CSV file.
with open('inverted_index.csv', 'r', newline='', encoding='utf-8') as index_file:
    for record in csv.DictReader(index_file):
        # The 'Posting List' column holds hyphen-separated document IDs.
        keyword_docs[record['Keyword']] = record['Posting List'].split('-')

In [20]:
# Keyword normalisation helper
def preprocess_keyword(keyword):
    """Return `keyword` lowercased with every run of digits removed."""
    return re.sub(r'\d+', '', keyword.lower())

In [21]:
# Build meta_file.csv: one row per crawled page with its ID, URI, title,
# a short description and the index keywords associated with the page.
# NOTE(review): pages are read from the relative 'crawled_pages' folder here,
# while the indexing cell reads from `folder_path` — confirm both are the same.

# Normalise every keyword and turn its document IDs into a set once, up front,
# so the per-page matching below does not repeat that work for each page.
keyword_lookup = [
    (keyword, preprocess_keyword(keyword), set(doc_ids))
    for keyword, doc_ids in keyword_docs.items()
]

with open('meta_file.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['ID', 'URI', 'Title', 'Description', 'Keywords'])

    # Sorted so the row order is reproducible (os.listdir order is arbitrary).
    for filename in sorted(os.listdir('crawled_pages')):
        if not filename.endswith('.txt'):
            continue

        with open(os.path.join('crawled_pages', filename), 'r', encoding='utf-8') as page_file:
            lines = page_file.readlines()

        file_id = None
        uri = None
        title = None
        body = ''

        # Header fields are recognised by their line prefix; everything from
        # the first 'Body:' line onwards is the page body.
        for line_no, line in enumerate(lines):
            if line.startswith('ID:'):
                file_id = line.partition(':')[2].strip()
            elif line.startswith('URL:'):
                uri = line.partition(':')[2].strip()
            elif line.startswith('Title:'):
                title = line.partition(':')[2].strip()
            elif line.startswith('Body:'):
                # Keep any text that follows 'Body:' on the same line, matching
                # what the indexing cell's regex captures.
                first_body_line = line.partition(':')[2].strip()
                body_lines = lines[line_no + 1:]
                if first_body_line:
                    body_lines = [first_body_line] + body_lines
                body = ' '.join(body_lines)
                break

        # Summary of body - using first two sentences
        sentences = re.split(r'[.!?]', body)
        description = ' '.join(sentences[:2])

        # A keyword belongs to the page when the page ID is in its posting
        # list, or when the normalised keyword occurs as a substring of the
        # lowercased title or description. A missing title is treated as empty.
        title_lc = (title or '').lower()
        description_lc = description.lower()
        webpage_keywords = [
            keyword
            for keyword, needle, doc_id_set in keyword_lookup
            if file_id in doc_id_set or needle in title_lc or needle in description_lc
        ]

        # Write to meta file
        writer.writerow([file_id, uri, title, description, ', '.join(webpage_keywords)])

print("Meta file created successfully.")


Meta file created successfully.


In [3]:
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

def preprocess_text(text):
    """Normalise text into an ordered list of unique stemmed-then-lemmatized tokens.

    The text is lowercased, stripped of digits and ASCII punctuation, and
    tokenized. Stop words and tokens that are not purely alphabetic are
    dropped; each remaining token is Porter-stemmed and the stem is passed
    through the WordNet lemmatizer. Duplicates are removed, keeping the first
    occurrence.

    Args:
        text: The text to process.

    Returns:
        A list of distinct processed tokens in order of first appearance.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    cleaned = without_digits.translate(str.maketrans('', '', string.punctuation))

    words = word_tokenize(cleaned)

    # Filter first: discard stop words and anything that is not all letters.
    stop_words = set(stopwords.words('english'))
    kept = []
    for word in words:
        if word.lower() in stop_words or not re.match(r'^[a-zA-Z]+$', word):
            continue
        kept.append(word)

    # Stem, then lemmatize the stem; dict keys de-duplicate in insertion order.
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    unique_terms = {}
    for word in kept:
        unique_terms.setdefault(lemmatizer.lemmatize(stemmer.stem(word)), None)
    return list(unique_terms)

# Example usage: run preprocess_text on a sample sentence and print the
# resulting token list.
text = "This is an example sentence with various words like running, swimming, and jumped."
preprocessed_tokens = preprocess_text(text)
print(preprocessed_tokens)


['exampl', 'sentenc', 'variou', 'word', 'like', 'run', 'swim', 'jump']


In [6]:
from nltk.corpus import wordnet
from autocorrect import Speller

# Initialize the English spell checker; find_meaningful_alternative calls it
# as `spell(word)` to obtain a corrected spelling.
spell = Speller(lang='en')

def replace_meaningless_words(tokens):
    """Replace tokens unknown to WordNet with a spell-corrected alternative.

    A token with at least one WordNet synset is kept unchanged. Otherwise
    `find_meaningful_alternative` is consulted; when it yields nothing, the
    original token is kept.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list with one entry per input token, in the same order.
    """
    def _resolve(candidate):
        # Known to WordNet: treat as meaningful and keep it.
        if wordnet.synsets(candidate):
            return candidate
        # Unknown: use the suggested alternative, or fall back to the token.
        return find_meaningful_alternative(candidate) or candidate

    return [_resolve(candidate) for candidate in tokens]

def find_meaningful_alternative(word):
    """Suggest a spell-corrected replacement for `word`, if a usable one exists.

    Args:
        word: The token to correct.

    Returns:
        The spell checker's correction when it differs from `word` and has at
        least one WordNet synset; otherwise None.
    """
    suggestion = spell(word)
    # Reject a no-op correction, and any correction WordNet does not know.
    if suggestion == word or not wordnet.synsets(suggestion):
        return None
    return suggestion

# Example usage: pass a hand-written token list (including the truncated
# stems 'exampl' and 'sentenc') through replace_meaningless_words and print
# the result.
preprocessed_tokens = ['exampl', 'sentenc', 'various', 'word', 'like', 'run', 'swim', 'jump']
meaningful_tokens = replace_meaningless_words(preprocessed_tokens)
print(meaningful_tokens)


['example', 'sentence', 'various', 'word', 'like', 'run', 'swim', 'jump']


In [None]:
import pandas as pd
from nltk.corpus import wordnet
from autocorrect import Speller

# Initialize the English spell checker; find_meaningful_alternative calls it
# as `spell(word)` to obtain a corrected spelling.
spell = Speller(lang='en')

def replace_meaningless_words(tokens):
    """Replace tokens unknown to WordNet with a spell-corrected alternative.

    A token with at least one WordNet synset is kept unchanged. Otherwise
    `find_meaningful_alternative` is consulted; when it yields nothing, the
    original token is kept.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list with one entry per input token, in the same order.
    """
    def _resolve(candidate):
        # Known to WordNet: treat as meaningful and keep it.
        if wordnet.synsets(candidate):
            return candidate
        # Unknown: use the suggested alternative, or fall back to the token.
        return find_meaningful_alternative(candidate) or candidate

    return [_resolve(candidate) for candidate in tokens]

def find_meaningful_alternative(word):
    """Suggest a spell-corrected replacement for `word`, if a usable one exists.

    Args:
        word: The token to correct.

    Returns:
        The spell checker's correction when it differs from `word` and has at
        least one WordNet synset; otherwise None.
    """
    suggestion = spell(word)
    # Reject a no-op correction, and any correction WordNet does not know.
    if suggestion == word or not wordnet.synsets(suggestion):
        return None
    return suggestion

# Read input Excel file; it is expected to contain a 'Keyword' column.
input_file = "Book1.xlsx"  # Modify this with your file name
df = pd.read_excel(input_file)

# Convert every value in the 'Keyword' column (e.g. booleans) to a string.
# Blank cells are filled with '' first: astype(str) alone would turn a missing
# value into the literal text 'nan', which would then be processed as a keyword.
df['Keyword'] = df['Keyword'].fillna('').astype(str)

# Split each keyword cell on whitespace and replace meaningless words; the
# new column holds one list of tokens per row (empty for blank cells).
df['Processed Keywords'] = df['Keyword'].apply(lambda x: replace_meaningless_words(x.split()))

# Write processed keywords to a new Excel file
output_file = "processed_keywords.xlsx"
df.to_excel(output_file, index=False)

print("Processed keywords have been written to", output_file)
