In [20]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter
# Fetch the NLTK data packages this notebook relies on: the tokenizer models,
# the stop-word lists and WordNet. Packages that are already present are
# simply reported as up-to-date.
for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

# Print full column contents instead of truncating long review strings.
pd.set_option('display.max_colwidth', None)

# English stop words, held in a set for fast membership tests during filtering.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [8]:
# Load the raw reviews; the printed frame shows the columns Review_ID and
# Review_Text.
# NOTE(review): 'latin-1' maps every possible byte to a character, so decoding
# can never fail -- a file saved in another encoding would load silently with
# wrong characters. Confirm this is the file's real encoding.
df = pd.read_csv('product_reviews.csv', encoding='latin-1')

# Header, the complete frame as plain text, then blank lines as a separator.
print("Original Dataset:", df.to_string(), "\n", sep="\n")

Original Dataset:
    Review_ID                                                                 Review_Text
0           1                    "The product is GREAT! Loved it, but its a bit pricey."
1           2                        "Worst product ever!! Wouldnt recommend to anyone."
2           3                 "Satisfactory quality, works as expected, no major issues."
3           4                          "Amazing product, I would buy it again and again!"
4           5                           "The delivery was slow, but the product is good."
5           6                "Horrible experience, the product broke after just one use."
6           7                       "Great value for the price! Definitely worth buying."
7           8                    "The product didnt meet my expectations, returning it."
8           9  "Im satisfied with the purchase, but there are better options available."
9          10       "Superb product! Excellent build quality and great customer se

In [14]:
# Case-fold every review so that e.g. "GREAT" and "great" become the same
# string before tokenization.
lowercase_texts = [review.lower() for review in df['Review_Text']]
df['lowercase_text'] = lowercase_texts

# Header, the full column as plain text, then blank lines as a separator.
print("After Converting to Lowercase:", df['lowercase_text'].to_string(), "\n", sep="\n")

After Converting to Lowercase:
0                       "the product is great! loved it, but its a bit pricey."
1                           "worst product ever!! wouldnt recommend to anyone."
2                    "satisfactory quality, works as expected, no major issues."
3                             "amazing product, i would buy it again and again!"
4                              "the delivery was slow, but the product is good."
5                   "horrible experience, the product broke after just one use."
6                          "great value for the price! definitely worth buying."
7                       "the product didnt meet my expectations, returning it."
8     "im satisfied with the purchase, but there are better options available."
9          "superb product! excellent build quality and great customer service."
10        "the product is just okay, nothing special, but it gets the job done."
11                    "fast delivery and product as described. would buy again

In [26]:
# Remove punctuation, special characters and digits from each lower-cased review.
#
# "[^\w\s]" alone is not enough: "\w" matches the underscore (and digits) as
# well as letters, so underscores would be left in the text. The pattern
# therefore lists "_" and "\d" explicitly. Matching characters are deleted
# (not replaced by a space), exactly as before. The pattern is compiled once
# rather than re-parsed for every row.
_UNWANTED_CHARS = re.compile(r'[^\w\s]|[\d_]')

cleaned_texts = [_UNWANTED_CHARS.sub('', text) for text in df['lowercase_text']]
df['cleaned_text'] = cleaned_texts

print("After Removing Punctuation, Numbers, and Special Characters:")
print(df['cleaned_text'])
print("\n")

After Removing Punctuation, Numbers, and Special Characters:
0                        the product is great loved it but its a bit pricey
1                            worst product ever wouldnt recommend to anyone
2                    satisfactory quality works as expected no major issues
3                            amazing product i would buy it again and again
4                             the delivery was slow but the product is good
5                  horrible experience the product broke after just one use
6                         great value for the price definitely worth buying
7                       the product didnt meet my expectations returning it
8     im satisfied with the purchase but there are better options available
9         superb product excellent build quality and great customer service
10        the product is just okay nothing special but it gets the job done
11                   fast delivery and product as described would buy again
12                   not wo

In [27]:
# Split each cleaned review into a list of word tokens with NLTK's
# word_tokenize; one list per row.
tokenized_texts = [word_tokenize(sentence) for sentence in df['cleaned_text']]
df['tokens'] = tokenized_texts

print("After Tokenization:", df['tokens'], "\n", sep="\n")

After Tokenization:
0                        [the, product, is, great, loved, it, but, its, a, bit, pricey]
1                                [worst, product, ever, wouldnt, recommend, to, anyone]
2                       [satisfactory, quality, works, as, expected, no, major, issues]
3                              [amazing, product, i, would, buy, it, again, and, again]
4                               [the, delivery, was, slow, but, the, product, is, good]
5                    [horrible, experience, the, product, broke, after, just, one, use]
6                            [great, value, for, the, price, definitely, worth, buying]
7                          [the, product, didnt, meet, my, expectations, returning, it]
8     [im, satisfied, with, the, purchase, but, there, are, better, options, available]
9           [superb, product, excellent, build, quality, and, great, customer, service]
10      [the, product, is, just, okay, nothing, special, but, it, gets, the, job, done]
11          

In [28]:
# Keep only the tokens that are not in the English stop-word set, preserving
# their original order within each review.
# Note: the output shows that "no" is filtered out along with the other
# stop words (row 2).
no_stopwords_texts = [
    [token for token in review_tokens if token not in stop_words]
    for review_tokens in df['tokens']
]
df['no_stopwords'] = no_stopwords_texts

print("After Removing Stopwords:", df['no_stopwords'], "\n", sep="\n")

After Removing Stopwords:
0                                       [product, great, loved, bit, pricey]
1                         [worst, product, ever, wouldnt, recommend, anyone]
2                    [satisfactory, quality, works, expected, major, issues]
3                                             [amazing, product, would, buy]
4                                            [delivery, slow, product, good]
5                           [horrible, experience, product, broke, one, use]
6                           [great, value, price, definitely, worth, buying]
7                            [product, didnt, meet, expectations, returning]
8                      [im, satisfied, purchase, better, options, available]
9     [superb, product, excellent, build, quality, great, customer, service]
10                        [product, okay, nothing, special, gets, job, done]
11                          [fast, delivery, product, described, would, buy]
12                             [worth, money, prod

In [29]:
# Reduce every remaining token to its Porter stem (e.g. "loved" -> "love",
# "quality" -> "qualiti" in the output below). Stems are not necessarily
# dictionary words.
stemmer = PorterStemmer()
stemmed_texts = [
    [stemmer.stem(token) for token in kept_tokens]
    for kept_tokens in df['no_stopwords']
]
df['stemmed_tokens'] = stemmed_texts

print("After Stemming:", df['stemmed_tokens'], "\n", sep="\n")

After Stemming:
0                                 [product, great, love, bit, pricey]
1                   [worst, product, ever, wouldnt, recommend, anyon]
2                  [satisfactori, qualiti, work, expect, major, issu]
3                                         [amaz, product, would, buy]
4                                     [deliveri, slow, product, good]
5                         [horribl, experi, product, broke, one, use]
6                           [great, valu, price, definit, worth, buy]
7                              [product, didnt, meet, expect, return]
8                       [im, satisfi, purchas, better, option, avail]
9     [superb, product, excel, build, qualiti, great, custom, servic]
10                     [product, okay, noth, special, get, job, done]
11                     [fast, deliveri, product, describ, would, buy]
12                       [worth, money, product, feel, cheap, flimsi]
13                        [product, exceed, expect, fantast, perform]
14  

In [30]:
# Lemmatize the stop-word-filtered tokens with WordNet. This is applied to the
# same input as the stemming step (df['no_stopwords']), not to the stems.
# NOTE(review): lemmatize() is called without a part-of-speech argument, and
# the output shows plural nouns reduced ("issues" -> "issue") while verb forms
# such as "loved" are left as-is -- presumably the noun default; confirm this
# is the intended behaviour.
lemmatizer = WordNetLemmatizer()
lemmatized_texts = [
    list(map(lemmatizer.lemmatize, kept_tokens))
    for kept_tokens in df['no_stopwords']
]
df['lemmatized_tokens'] = lemmatized_texts

print("After Lemmatization:", df['lemmatized_tokens'], "\n", sep="\n")

After Lemmatization:
0                                       [product, great, loved, bit, pricey]
1                         [worst, product, ever, wouldnt, recommend, anyone]
2                      [satisfactory, quality, work, expected, major, issue]
3                                             [amazing, product, would, buy]
4                                            [delivery, slow, product, good]
5                           [horrible, experience, product, broke, one, use]
6                           [great, value, price, definitely, worth, buying]
7                             [product, didnt, meet, expectation, returning]
8                       [im, satisfied, purchase, better, option, available]
9     [superb, product, excellent, build, quality, great, customer, service]
10                         [product, okay, nothing, special, get, job, done]
11                          [fast, delivery, product, described, would, buy]
12                              [worth, money, product,

In [31]:
# Per-review term frequency: a plain dict mapping each stop-word-filtered
# token to the number of times it occurs in that review.
term_frequencies = [dict(Counter(kept_tokens)) for kept_tokens in df['no_stopwords']]
df['term_frequency'] = term_frequencies

print("Term Frequency:", df['term_frequency'], "\n", sep="\n")

# Persist the frame with every intermediate column; the row index is omitted.
output_path = 'processed_product_reviews.csv'
df.to_csv(output_path, index=False)
print("Processed data saved to 'processed_product_reviews.csv'")

Term Frequency:
0                                                      {'product': 1, 'great': 1, 'loved': 1, 'bit': 1, 'pricey': 1}
1                                   {'worst': 1, 'product': 1, 'ever': 1, 'wouldnt': 1, 'recommend': 1, 'anyone': 1}
2                              {'satisfactory': 1, 'quality': 1, 'works': 1, 'expected': 1, 'major': 1, 'issues': 1}
3                                                                 {'amazing': 1, 'product': 1, 'would': 1, 'buy': 1}
4                                                                {'delivery': 1, 'slow': 1, 'product': 1, 'good': 1}
5                                     {'horrible': 1, 'experience': 1, 'product': 1, 'broke': 1, 'one': 1, 'use': 1}
6                                     {'great': 1, 'value': 1, 'price': 1, 'definitely': 1, 'worth': 1, 'buying': 1}
7                                           {'product': 1, 'didnt': 1, 'meet': 1, 'expectations': 1, 'returning': 1}
8                                {'im': 1, 'sati