In [1]:
import pandas as pd
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Download NLTK data files
nltk.download('punkt')
nltk.download('stopwords')

# Read the CSV file
reviews = pd.read_csv('Reviews.csv', nrows=5000)

# Preprocess the text to remove all punctuation marks and HTML tags
def preprocess_text(text):
    if pd.isnull(text):
        return ''
    if not isinstance(text, str):
        return ''
    
    # Expand contractions
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"'re", " are", text)
    text = re.sub(r"'s", " is", text)
    text = re.sub(r"'d", " would", text)
    text = re.sub(r"'ll", " will", text)
    text = re.sub(r"'ve", " have", text)
    text = re.sub(r"'m", " am", text)
    
    # Remove HTML tags if present
    text = re.sub(r'<[^>]+>', '', text)
    
    # Remove punctuation using regex
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    
    return text

# Apply the preprocessing function to the reviews
reviews['filtered_text'] = reviews['Text'].apply(preprocess_text)

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=10)

# Fit and transform the data
X = vectorizer.fit_transform(reviews['filtered_text'])
feature_names = vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a DataFrame for easy analysis
df_tfidf = pd.DataFrame(X.toarray(), columns=feature_names)

# Function to get top TF-IDF keywords
def get_top_keywords(text, n=5):
    # Transform the text to TF-IDF vector
    tfidf_vector = vectorizer.transform([text])
    scores = dict(zip(feature_names, tfidf_vector.toarray()[0]))
    
    # Get the top n keywords
    top_keywords = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:n]
    return ', '.join([word for word, score in top_keywords])

# Apply the keyword extraction function to the filtered text
reviews['keywords_tfidf'] = reviews['filtered_text'].apply(lambda x: get_top_keywords(x, n=5))

# Print the processed text and extracted keywords
for index, row in reviews.loc[1:5, ['Text', 'filtered_text', 'keywords_tfidf']].iterrows():
    original_text = row['Text']
    filtered_text = row['filtered_text']
    keywords_extracted = row['keywords_tfidf']
    print(f"Original: {original_text}")
    print(f"Filtered: {filtered_text}")
    print(f"Keywords (TF-IDF): {keywords_extracted}\n")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gayathri.m\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gayathri.m\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Original: Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".
Filtered: Product arrived labeled as Jumbo Salted Peanutsthe peanuts were actually small sized unsalted Not sure if this was an error or if the vendor intended to represent the product as Jumbo
Keywords (TF-IDF): product, chips, coffee, flavor, good

Original: This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.
Filtered: This is a confecti