In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from textblob import TextBlob
import re

In [None]:
!pip install textblob



In [None]:
# Fetch the NLTK resources this notebook downloads up front.
# nltk.download() returns True/False per package; the cell displays True only
# if every download succeeded (previously only the last call's result was
# shown, which could hide a failed 'punkt' or 'stopwords' download).
# A list (not a generator) is used so all packages are attempted even if an
# earlier one fails.
NLTK_RESOURCES = ('punkt', 'stopwords', 'wordnet')
all([nltk.download(resource) for resource in NLTK_RESOURCES])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Read the merged comments file (UTF-8 encoded) into a DataFrame.
csv_file = 'merged_comments.csv'
data = pd.read_csv(
    csv_file,
    encoding='utf-8',
)

In [None]:
# Build the text-processing helpers once so every comment reuses the same
# tokenizer, lemmatizer and stemmer instances.
tweet_tokenizer, lemmatizer, stemmer = (
    TweetTokenizer(),
    WordNetLemmatizer(),
    PorterStemmer(),
)

In [None]:
# Make sure the stopword corpus is available, then collect the English list
# into a set for membership tests during pre-processing.
nltk.download('stopwords')
english_stopwords = {word for word in stopwords.words('english')}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Holds one cleaned string per comment; populated by the pre-processing cell.
processed_comments = list()

In [None]:
# Pre-process each comment in data['Text'] and write the result to a new CSV.

# Remembers the correction for every distinct token so the same TextBlob
# lookup is not repeated for each occurrence of a word.
_spelling_cache = {}


def _correct_spelling(token):
    """Return TextBlob's spelling correction for a single token (memoised)."""
    if token not in _spelling_cache:
        _spelling_cache[token] = TextBlob(token).correct().raw
    return _spelling_cache[token]


def _preprocess_comment(comment):
    """Clean one raw comment and return it as a space-separated string.

    Pipeline: strip URLs and u/ user names -> tokenize -> keep lowercase
    alphabetic tokens -> correct spelling -> drop English stopwords ->
    lemmatize -> stem.

    Non-string input (e.g. the NaN pandas uses for an empty cell) is handled
    instead of raising: missing values yield '' so the output keeps one row
    per input row.
    """
    if not isinstance(comment, str):
        comment = '' if pd.isna(comment) else str(comment)

    # Remove URLs
    comment = re.sub(r'http\S+', '', comment)

    # Remove user names (assuming they start with u/)
    comment = re.sub(r'u/\w+', '', comment)

    # Tokenization using tweet tokenizer
    tokens = tweet_tokenizer.tokenize(comment)

    # Drop punctuation/numbers/emoji and lowercase what is left
    tokens = [token.lower() for token in tokens if token.isalpha()]

    # Spelling correction runs on whole words, before stemming truncates
    # them, and its result is what flows through the rest of the pipeline.
    tokens = [_correct_spelling(token) for token in tokens]

    # Remove English stopwords. This has to happen before lemmatizing and
    # stemming: those steps rewrite words (e.g. "this" -> "thi") so they
    # would no longer match the stopword list.
    tokens = [token for token in tokens if token not in english_stopwords]

    # Lemmatization followed by stemming
    tokens = [stemmer.stem(lemmatizer.lemmatize(token)) for token in tokens]

    # Join tokens back into a processed comment
    return ' '.join(tokens)


# Rebuilt from scratch on every run, so re-executing this cell never appends
# a second copy of the results.
processed_comments = [_preprocess_comment(comment) for comment in data['Text']]

# Add processed comments to a new DataFrame
processed_data = pd.DataFrame({'Processed_Comment': processed_comments})

# Save the pre-processed data to a new CSV file
preprocessed_csv_file = "preprocessed_comments.csv"
processed_data.to_csv(preprocessed_csv_file, index=False)