Preprocessing the data

In [None]:
# All preprocessing

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer  

# English stopwords from the NLTK corpus, stored in a set; clean_text checks
# every word of the cleaned text for membership in it.
# NOTE(review): stopwords.words() needs the NLTK 'stopwords' corpus to be
# available locally -- confirm it has been downloaded in this environment.
stop_words = set(stopwords.words('english'))

# Snowball stemmer configured for English; stem_text applies it to each token.
stemmer = SnowballStemmer("english")

def clean_text(text):
    """Normalise one raw text value into a string of stemmed tokens.

    The steps run in this order: lowercase, expand contractions, delete
    characters that are neither word characters nor whitespace, delete digit
    runs, collapse whitespace, drop stopwords, stem.

    Args:
        text: The value to clean. Only ``str`` values are processed.

    Returns:
        The cleaned text with tokens separated by single spaces, or ``''``
        when ``text`` is not a string.
    """
    # Anything that is not a string is mapped to an empty result.
    if not isinstance(text, str):
        return ''

    # Contractions are expanded on the lowercased text, before punctuation
    # removal strips the apostrophes they rely on.
    lowered = expand_contractions(text.lower())

    # Delete punctuation/special characters, then numeric runs.
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    without_digits = re.sub(r'\d+', '', without_punctuation)

    # Squeeze whitespace runs to one space and trim both ends.
    normalised = re.sub(r'\s+', ' ', without_digits).strip()

    # Keep only the words that are not in the stopword set.
    kept_words = [word for word in normalised.split() if word not in stop_words]

    # Stem what is left with the Snowball stemmer.
    return stem_text(' '.join(kept_words))

# Contraction fragments (lowercase) mapped to their expansions. The two
# whole-word entries exist because the generic "n't" rule would otherwise
# turn "can't" into "ca not" and "won't" into "wo not".
_CONTRACTIONS = {
    "can't": "cannot",
    "won't": "will not",
    "n't": " not",
    "'ll": " will",
    "'ve": " have",
    "'re": " are",
    "'s": " is",
    "'d": " would",
    "'m": " am",
}

# Compiled once at import time instead of on every call. Keys are escaped and
# tried longest-first so the whole-word entries win over the "n't" suffix.
# IGNORECASE makes "Can't" / "WON'T" match; ASCII restricts case folding to
# ASCII letters so every match lowercases to one of the keys above.
_CONTRACTIONS_RE = re.compile(
    '|'.join(
        re.escape(key)
        for key in sorted(_CONTRACTIONS, key=len, reverse=True)
    ),
    re.IGNORECASE | re.ASCII,
)


def expand_contractions(text):
    """Replace common English contractions in ``text`` with their expansions.

    Matching is case-insensitive, so "Can't" becomes "cannot" rather than
    "Ca not". The replacement text itself is always lowercase.

    Args:
        text: The string to process.

    Returns:
        ``text`` with every recognised contraction fragment expanded. Text
        that contains no contraction is returned unchanged.
    """
    def expand_match(contraction):
        fragment = contraction.group(0)
        # Look up by lowercase form; fall back to the matched text itself so
        # the callback never returns None.
        return _CONTRACTIONS.get(fragment.lower(), fragment)

    return _CONTRACTIONS_RE.sub(expand_match, text)

def stem_text(text):
    """Tokenize ``text`` and return its stemmed tokens joined by single spaces.

    Tokens come from ``word_tokenize``; each one is passed through the
    module-level Snowball ``stemmer``.
    """
    return ' '.join(stemmer.stem(piece) for piece in word_tokenize(text))

try:
    # Number of CSV rows pulled into memory per iteration.
    chunk_size = 10000
    chunks_list = []

    # Stream the input file chunk by chunk instead of loading it all at once.
    for chunk in pd.read_csv(
        r'enwiki-20170820.csv',
        chunksize=chunk_size,
        low_memory=False,
        encoding='utf-8',
    ):
        # Within this chunk, keep only the first row for each ARTICLE_ID.
        chunk = chunk.drop_duplicates(subset='ARTICLE_ID', keep='first')

        # Clean the section text row by row.
        chunk['SECTION_TEXT'] = chunk['SECTION_TEXT'].apply(clean_text)

        # Discard rows where ARTICLE_ID equals TITLE.
        chunk = chunk[chunk['ARTICLE_ID'].ne(chunk['TITLE'])]

        chunks_list.append(chunk)

    # Stitch the chunks together; the per-chunk pass cannot see an ARTICLE_ID
    # repeated in a different chunk, so de-duplicate once more over the
    # combined frame.
    df = pd.concat(chunks_list, ignore_index=True)
    df = df.drop_duplicates(subset='ARTICLE_ID', keep='first')

    # Persist the preprocessed data for the later cells.
    df.to_csv('preprocessed_data.csv', index=False)

    print(df)

except Exception as e:
    print(f"Error processing CSV file: {e}")


In [None]:
#sorted vocabulary
try:
    df = pd.read_csv('preprocessed_data.csv')

    def tokenize_text(text):
        """Return the tokens of ``text``; any non-string value gives an empty list."""
        if not isinstance(text, str):
            return []
        return word_tokenize(text)

    # Tokenize only the 'SECTION_TEXT' column.
    df['TOKENIZED_SECTION_TEXT'] = df['SECTION_TEXT'].apply(tokenize_text)

    # Gather every distinct token seen across all rows.
    vocabulary = {
        token
        for row_tokens in df['TOKENIZED_SECTION_TEXT']
        for token in row_tokens
    }

    # Keep only non-empty entries whose first character is alphabetic; the
    # slice makes an empty string fail the test without a separate check.
    filtered_vocabulary = [word for word in vocabulary if word[:1].isalpha()]
    sorted_vocabulary = sorted(filtered_vocabulary)

    print("\nSorted Vocabulary:", sorted_vocabulary)

except Exception as e:
    print(f"Error processing DataFrame: {e}")

In [None]:
# INDEXING

# Pair each word with its zero-based position in the sorted vocabulary.
indexed_vocabulary = list(enumerate(sorted_vocabulary))

print("\nIndexed and Sorted Vocabulary:", indexed_vocabulary)