In [1]:
import pandas as pd
import string
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker

In [2]:
# Download the NLTK data packages this notebook relies on. Each call is a
# no-op when the package is already present and up to date locally.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# Shared helpers built once and reused by every preprocess_text() call.
spell = SpellChecker()                         # membership test + spelling correction
lemmatizer = WordNetLemmatizer()               # reduces tokens to their lemma
stop_words = set(stopwords.words('english'))   # set for fast "in" checks

In [4]:
def process_data(data_set):
    """Clean a raw DataFrame and preprocess its 'Discussion' text column.

    Steps:
      1. Drop every row that has a missing value in any column.
      2. Replace each 'Discussion' value with ``preprocess_text(value)``.
      3. Drop the sample-identifier column.

    The identifier column is matched after stripping surrounding whitespace
    from the column name, so both ``'SampleID'`` and a header that carries a
    stray leading tab (as the original code expected) are removed. If no such
    column exists, nothing is dropped.

    Args:
        data_set (pandas.DataFrame): input frame containing a 'Discussion'
            column of strings.

    Returns:
        pandas.DataFrame: a new frame; the input frame is left unmodified.

    Raises:
        KeyError: if the frame has no 'Discussion' column.
    """
    # Work on an explicit copy so the column assignment below neither
    # triggers SettingWithCopyWarning nor touches the caller's frame.
    cleaned = data_set.dropna().copy()

    # Apply the NLP preprocessing to every remaining discussion text.
    cleaned['Discussion'] = cleaned['Discussion'].map(preprocess_text)

    # Tolerate whitespace noise (e.g. a leading tab) in the ID column header.
    id_columns = [col for col in cleaned.columns if str(col).strip() == 'SampleID']
    return cleaned.drop(columns=id_columns)




In [5]:

def preprocess_text(text):
    """Normalize a raw text string into space-joined, lemmatized tokens.

    Processing order:
      1. Lowercase and collapse runs of whitespace.
      2. Expand the contractions "don't" and "i'm".
      3. Strip URLs and e-mail addresses.
      4. Remove ASCII punctuation.
      5. Tokenize, spell-correct unknown tokens, drop English stop words,
         and lemmatize what is left.

    Relies on the module-level ``spell``, ``stop_words`` and ``lemmatizer``
    objects.

    Args:
        text (str): the raw text to clean.

    Returns:
        str: the surviving tokens joined by single spaces (may be empty).
    """
    # Convert to lowercase
    text = text.lower()
    # Normalize spaces
    text = re.sub(r'\s+', ' ', text.strip())
    # Expand contractions
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"i'm", "i am", text)
    # Remove URLs and emails (must happen before punctuation is stripped,
    # otherwise the patterns would no longer match)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r'\S+@\S+', '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize text
    tokens = word_tokenize(text)
    # Correct spelling. SpellChecker.correction() returns None when it has no
    # candidate for a token; fall back to the original token in that case so
    # a None never reaches the lemmatizer or the final join.
    corrected_tokens = []
    for word in tokens:
        if word in spell:
            corrected_tokens.append(word)
        else:
            corrected_tokens.append(spell.correction(word) or word)
    # Remove stop words
    filtered_tokens = [word for word in corrected_tokens if word not in stop_words]
    # Lemmatize tokens
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    # Join tokens back into a single string
    return ' '.join(lemmatized_tokens)