In [1]:
!pip install tqdm
!pip install nltk



In [2]:
import json
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

# Download necessary NLTK data.
# 'punkt_tab' is fetched in addition to 'punkt' because recent NLTK releases
# load the tokenizer model from 'punkt_tab' when nltk.word_tokenize is called;
# without it a fresh environment raises LookupError at tokenization time.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Shared NLP resources: built once here and reused by every call that cleans text.
lemmatizer = WordNetLemmatizer()
# A set gives constant-time membership checks when filtering tokens.
stop_words = {*stopwords.words('english')}

In [4]:
def clean_and_preprocess_text(text):
    """Normalize a piece of text into a space-separated string of lemmas.

    Steps, in order:
      1. Lowercase the input.
      2. Delete every character that is not an ASCII letter or whitespace.
         Digits and punctuation are removed outright (not replaced by a
         space), so letters on either side of them end up adjacent.
      3. Tokenize with ``nltk.word_tokenize``.
      4. Drop tokens found in the module-level ``stop_words`` set.
      5. Lemmatize the remaining tokens with the module-level ``lemmatizer``.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving lemmas joined by single spaces (possibly an empty string).
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    kept_lemmas = []
    for token in nltk.word_tokenize(letters_only):
        if token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_lemmas)

def preprocess_squad_data(filepath):
    """Load a SQuAD-style JSON file and return cleaned context/question/answer records.

    The file is expected to have the layout
    ``{'data': [{'paragraphs': [{'context': ..., 'qas': [...]}]}]}`` where each
    ``qas`` item carries a ``'question'``, an ``'answers'`` list of dicts with a
    ``'text'`` field, and optionally an ``'is_impossible'`` flag.

    Questions flagged as impossible, and questions without any answer, are
    skipped. For every remaining question only the first listed answer is used.

    Args:
        filepath: Path to the SQuAD JSON file.

    Returns:
        A list of dicts with the keys ``'context'``, ``'question'`` and
        ``'answer'``, each holding text processed by
        ``clean_and_preprocess_text``.
    """
    # JSON text is UTF-8 by specification; do not rely on the platform's
    # default encoding when reading it.
    with open(filepath, 'r', encoding='utf-8') as f:
        squad_data = json.load(f)

    preprocessed_data = []

    for entry in tqdm(squad_data['data']):
        for paragraph in entry['paragraphs']:
            # Clean the context once per paragraph; it is shared by all its questions.
            context_cleaned = clean_and_preprocess_text(paragraph['context'])

            for qa in paragraph['qas']:
                # Check the flag before doing any cleaning work, and treat a
                # missing flag as "answerable" instead of raising KeyError.
                if qa.get('is_impossible', False):
                    continue

                # An answerable question with no answers would otherwise raise
                # IndexError on the [0] lookup below; skip it instead.
                answers = qa.get('answers') or []
                if not answers:
                    continue

                question_cleaned = clean_and_preprocess_text(qa['question'])
                answer_cleaned = clean_and_preprocess_text(answers[0]['text'])

                preprocessed_data.append({
                    'context': context_cleaned,
                    'question': question_cleaned,
                    'answer': answer_cleaned
                })

    return preprocessed_data


In [5]:
def main(squad_filepath='/content/train-v2.0.json',
         output_filepath='preprocessed_squad_data.json'):
    """Preprocess a SQuAD file and write the result to disk as JSON.

    Args:
        squad_filepath: Path of the SQuAD JSON file to read. Defaults to the
            location previously hard-coded in this function.
        output_filepath: Path of the JSON file to write. Defaults to
            ``'preprocessed_squad_data.json'`` in the current working directory.

    Returns:
        None. The records are written to ``output_filepath`` and a completion
        message is printed.
    """
    preprocessed_data = preprocess_squad_data(squad_filepath)

    with open(output_filepath, 'w', encoding='utf-8') as f:
        json.dump(preprocessed_data, f, indent=4)

    print("Preprocessing of SQuAD 2.0 dataset completed!")


In [6]:
# Run the preprocessing pipeline when this code is executed directly
# (as a script or as a notebook cell), but not when it is imported.
if __name__ == '__main__':
    main()

100%|██████████| 442/442 [00:59<00:00,  7.48it/s]


Preprocessing of SQuAD 2.0 dataset completed!
