In [42]:
import pandas as pd
import re
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Make sure to download these once in your environment
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
df = pd.read_csv('positive_reviews.csv')  # Replace with your actual file path

# Function to clean the text
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Keep essential punctuation that defines sentence boundaries
    text = re.sub(r'[^\w\s.!?]', '', text)  # Remove all non-word characters except for sentence-ending punctuation
    # Convert to lowercase
    text = text.lower()
    return text

# Clean the 'Text' column
df['Cleaned_Text'] = df['Text'].apply(clean_text)

# Tokenize the cleaned text into sentences
df['Sentences'] = df['Cleaned_Text'].apply(sent_tokenize)

# Display the first few rows of the dataframe
df.head()
# Assuming `df` is your original dataframe with 500,000 rows
subset_df = df.iloc[:100].copy()

subset_df["Cleaned_Text"]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gpava\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gpava\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gpava\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0     my kitties love these treats.  this flavor is ...
1     our poodle gets one every evening as a special...
2     we got a keurig system last year for christmas...
3     perfect tea for right before bedtime relaxing....
4     i would never retract that the product seems t...
                            ...                        
95    love these almonds!  great flavor  really hits...
96    no detailed analysis here i like the overall t...
97    i purchased these bread mixes intending to use...
98    i love the product and will continue using it....
99    love cytomax. when biking i follow the instruc...
Name: Cleaned_Text, Length: 100, dtype: object

In [43]:
df.head()

Unnamed: 0,Score,Summary,Text,processed_text,Cleaned_Text,Sentences
0,1,KITTIES LOVE EM,My kitties love these treats. This flavor is ...,kitties love treats flavor favorite kitties re...,my kitties love these treats. this flavor is ...,"[my kitties love these treats., this flavor is..."
1,1,Greenies,Our poodle gets one every evening as a special...,poodle gets one every evening special treat lo...,our poodle gets one every evening as a special...,[our poodle gets one every evening as a specia...
2,1,Finally!,We got a Keurig system last year for Christmas...,got keurig system last year christmas great ho...,we got a keurig system last year for christmas...,[we got a keurig system last year for christma...
3,1,My favorite peppermint tea,Perfect tea for right before bedtime relaxing....,perfect tea right bedtime relaxing enjoying st...,perfect tea for right before bedtime relaxing....,[perfect tea for right before bedtime relaxing...
4,1,"CET pet Chewies... and the ""Company""",I would never retract that the product seems t...,would never retract product seems excellent pe...,i would never retract that the product seems t...,[i would never retract that the product seems ...


In [44]:
# Assuming 'subset_df' is your DataFrame
subset_df['Cleaned_Text'] = subset_df['Text'].apply(clean_text)

# Optionally, you might want to re-tokenize into sentences if needed for further processing
subset_df['Sentences'] = subset_df['Cleaned_Text'].apply(nltk.sent_tokenize)

# Display the first few rows to verify the changes
subset_df[['Text', 'Cleaned_Text', 'Sentences']].head()


Unnamed: 0,Text,Cleaned_Text,Sentences
0,My kitties love these treats. This flavor is ...,my kitties love these treats. this flavor is ...,"[my kitties love these treats., this flavor is..."
1,Our poodle gets one every evening as a special...,our poodle gets one every evening as a special...,[our poodle gets one every evening as a specia...
2,We got a Keurig system last year for Christmas...,we got a keurig system last year for christmas...,[we got a keurig system last year for christma...
3,Perfect tea for right before bedtime relaxing....,perfect tea for right before bedtime relaxing....,[perfect tea for right before bedtime relaxing...
4,I would never retract that the product seems t...,i would never retract that the product seems t...,[i would never retract that the product seems ...


In [45]:
# Ensure each entry in 'Sentences' is a flat list of sentences
subset_df['Sentences'] = subset_df['Sentences'].apply(lambda s: [sent for sublist in s for sent in (sublist if isinstance(sublist, list) else [sublist])])


In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from nltk.tokenize import sent_tokenize

# Combine all texts into one large document
combined_text = " ".join(subset_df['Cleaned_Text'].tolist())


# Tokenize the combined text into sentences
sentences = sent_tokenize(combined_text)

# Function to summarize the text
def summarize_text(sentences, top_n=8):
    # Set a reasonable limit for top_n based on the number of sentences available
    top_n = min(top_n, len(sentences) // 10 + 1)  # e.g., at least 10% of the sentences, but at least one

    if len(sentences) > 1:  # Only proceed if there are multiple sentences
        tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
        sentence_scores = np.array(tfidf_matrix.sum(axis=1)).ravel()

        top_sentence_indices = np.argsort(sentence_scores)[-top_n:][::-1]
        top_sentences = [sentences[idx] for idx in sorted(top_sentence_indices)]
        return ' '.join(top_sentences)
    else:
        return sentences[0] if sentences else ''

# Apply summarization
summary = summarize_text(sentences, top_n=8)
print("Summary:", summary)


Summary: whiskas temptations seafood medley flavour treats for cats 3ounce pouches pack of 12  i always include some of these treats whenever i am sending a donation of whatever i am ordering for the center.amazon also donates a percentage of the order to the rescue center when ordered through the link on the rescue centers webpage. im a college kid and so my diet isnt very wholesome but grains such as couscous and rice are extraordinarily cheap and together with multivitamins cover most of my dietary needs.couscous is made from durum wheat and so contains more protein and less starches on average than rice making it a more complete meal. i might buy a few of these bags and just leave them sitting unopened in the closet it makes a very good survival food because couscous can be made into an edible mush by just adding water and allowing it to sit meaning that you could store this stuff indefinitely and consume it in a situation where you have no access to a heat source.and all that for 

In [47]:
print(combined_text)

my kitties love these treats.  this flavor is a favorite of some of the kitties at the rescue center to which i donate. other of their kitties like other temptation flavors .. but they all come running for them. whiskas temptations seafood medley flavour treats for cats 3ounce pouches pack of 12  i always include some of these treats whenever i am sending a donation of whatever i am ordering for the center.amazon also donates a percentage of the order to the rescue center when ordered through the link on the rescue centers webpage.  thanks amazon. our poodle gets one every evening as a special treat. he loves them. i always keep a good supply on hand. it is not only a treat but it helps clean their teeth. we got a keurig system last year for christmas.  it has been great. i however like coffee that has body strength and depth.  most kcups are way too weak for me. note i use a fair bit of milk in my coffee so that colors my interpretations.i tried emerils big easy bold since i am consta

In [13]:
pip install Rouge

Collecting Rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: Rouge
Successfully installed Rouge-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [48]:
from rouge import Rouge

rouge = Rouge()
scores = rouge.get_scores(combined_text, summary)
print(scores)


[{'rouge-1': {'r': 1.0, 'p': 0.17019987886129617, 'f': 0.2908902666653218}, 'rouge-2': {'r': 0.989406779661017, 'p': 0.08627378533160909, 'f': 0.1587085796630629}, 'rouge-l': {'r': 1.0, 'p': 0.17019987886129617, 'f': 0.2908902666653218}}]


In [30]:
pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [18]:
print(combined_text)

my kitties love these treats.  this flavor is a favorite of some of the kitties at the rescue center to which i donate. other of their kitties like other temptation flavors .. but they all come running for them. whiskas temptations seafood medley flavour treats for cats 3ounce pouches pack of 12  i always include some of these treats whenever i am sending a donation of whatever i am ordering for the center.amazon also donates a percentage of the order to the rescue center when ordered through the link on the rescue centers webpage.  thanks amazon. our poodle gets one every evening as a special treat. he loves them. i always keep a good supply on hand. it is not only a treat but it helps clean their teeth. we got a keurig system last year for christmas.  it has been great. i however like coffee that has body strength and depth.  most kcups are way too weak for me. note i use a fair bit of milk in my coffee so that colors my interpretations.i tried emerils big easy bold since i am consta

In [None]:
import torch
from transformers import DistilBertModel, DistilBertTokenizer
from nltk.tokenize import sent_tokenize
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import numpy as np

# Ensure required NLTK packages are downloaded
nltk.download('punkt')

# Load pre-trained model and tokenizer
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertModel.from_pretrained(model_name)

def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model(**inputs)
    # Take the mean across the token dimension to reduce to a single vector per sentence
    return outputs.last_hidden_state.mean(dim=1).squeeze()

# Text and processing
text = combined_text

if len(sentences) > 1:
    # Calculate embeddings for each sentence
    embeddings = torch.stack([get_embedding(sentence) for sentence in sentences])

    # Flatten the embeddings tensor from 3D to 2D for compatibility with cosine_similarity and clustering
    embeddings_2d = embeddings.detach().numpy()

    # Calculate document embedding as mean of sentence embeddings
    doc_embedding = embeddings.mean(dim=0)

    # Calculate cosine similarity with each sentence
    cosine_similarities = torch.nn.functional.cosine_similarity(embeddings, doc_embedding.unsqueeze(0))

    # Cluster sentences using KMeans
    num_clusters = min(max(3, len(sentences) // 5), len(sentences))  # Define number of clusters, adjust as necessary
    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings_2d)
    cluster_labels = kmeans.labels_

    # Select one sentence per cluster based on cosine similarity to the document embedding
    selected_indices = []
    for cluster in range(num_clusters):
        indices_in_cluster = np.where(cluster_labels == cluster)[0]
        if indices_in_cluster.size > 0:
            best_idx = sorted(indices_in_cluster, key=lambda idx: cosine_similarities[idx], reverse=True)[0]
            selected_indices.append(best_idx)

    selected_indices = sorted(set(selected_indices))  # Ensure indices are unique and sorted
    summary_sentences = [sentences[idx] for idx in selected_indices]
    summary = ' '.join(summary_sentences)
else:
    summary = sentences[0] if sentences else 'No text provided.'

print("Summary:", summary)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gpava\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
from rouge import Rouge

rouge = Rouge()
scores = rouge.get_scores(combined_text, summary)
print(scores)


[{'rouge-1': {'r': 0.998, 'p': 0.3022410660205936, 'f': 0.4639702428286845}, 'rouge-2': {'r': 0.9474689589302769, 'p': 0.18326251616478847, 'f': 0.3071207403179313}, 'rouge-l': {'r': 0.998, 'p': 0.3022410660205936, 'f': 0.4639702428286845}}]
