In [12]:
import functools
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below, in the same order as before:
# the POS tagger model, the Punkt tokenizer data, the stop-word lists
# and the WordNet corpus.
for resource_name in (
    'averaged_perceptron_tagger',
    'punkt',
    'stopwords',
    'wordnet',
):
    nltk.download(resource_name)

# Single lemmatizer instance shared by every preprocessing call
lemmatizer = WordNetLemmatizer()

# Read the CSV into a DataFrame and discard every row that contains a null
df = pd.read_csv("/content/updated_cleaned_preprocessed_data.csv").dropna()

# Translation table that deletes every ASCII punctuation character. It is
# the same for every call, so it is built once instead of once per row.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# First letter of the tag returned by nltk.pos_tag -> POS code passed to the
# lemmatizer. Any tag not listed here falls back to 'a'.
_LEMMA_POS_BY_TAG_INITIAL = {'N': 'n', 'V': 'v', 'R': 'r'}


@functools.lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Strip punctuation, drop English stop words and lemmatize the rest.

    Args:
        text: The raw text to clean. ``None`` and NaN are treated as empty
            text; any other non-string value is converted with ``str()``.

    Returns:
        The lower-cased lemmas of the remaining tokens joined by single
        spaces, or ``''`` when nothing is left.
    """
    # Missing cells would otherwise fail on ``.translate``; treat them as empty.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # NOTE: punctuation is deleted *before* tokenizing, so apostrophes inside
    # contractions disappear as well ("don't" becomes "dont").
    tokens = word_tokenize(text.translate(_PUNCTUATION_TABLE))

    stop_words = _english_stop_words()
    lemmas = []
    for word, tag in nltk.pos_tag(tokens):
        lowered = word.lower()
        if lowered in stop_words:
            continue
        pos = _LEMMA_POS_BY_TAG_INITIAL.get(tag[:1], 'a')
        lemmas.append(lemmatizer.lemmatize(lowered, pos=pos))

    # Join the lemmatized tokens back into a string
    return ' '.join(lemmas)

# Apply the text preprocessing function to the text column. The column name
# is defined once so the lookup and the error message cannot drift apart
# (the message previously named a column 'text' that was never checked).
text_column = 'SECTION_TEXT'
if text_column in df.columns:
    df[text_column] = df[text_column].apply(preprocess_text)
else:
    print(f"Column '{text_column}' not found in the DataFrame.")

# Write the data to a new CSV file, without the index column. This runs
# whether or not the text column was found.
df.to_csv("processed_data.csv", index=False)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
