# Load extracted ngrams

In [1]:
import pandas as pd
df_list = []
for i in range(1, 10):
    temp_df = pd.read_csv(f'./Blog_Data/DpNgram_Extraction/basic_rephrase_50k_sample/f3b08b8a-5d7d-4d68-9a64-f37e91ab700d/output_path/{i}gram/0.tsv', sep='\t')
    df_list.append(temp_df)
df_combined = pd.concat(df_list, ignore_index=True)
df_combined.head()

Unnamed: 0,ngrams,histogram_count,ngram_count,privacy_unit_count,ngram_size,histogram_count_x,ngram_count_x,privacy_unit_count_x,ngram_size_x,histogram_count_y,ngram_count_y,privacy_unit_count_y,ngram_size_y
0,Create,288.787419,1368.0,1018.0,1.0,,,,,,,,
1,development,33.299593,278.0,184.0,1.0,,,,,,,,
2,for,2743.976738,16404.0,8300.0,1.0,,,,,,,,
3,game,115.1915,859.0,440.0,1.0,,,,,,,,
4,name,56.928565,524.0,258.0,1.0,,,,,,,,


In [2]:
# Initialize an empty list to store the largest ngrams
largest_ngrams = []

# Iterate through each ngram in the dataframe
for ngram in df_combined['ngrams']:
    # Check if the ngram is a substring of any ngram already in the largest_ngrams list
    if not any(ngram in larger_ngram for larger_ngram in largest_ngrams):
        # Remove any ngrams from the largest_ngrams list that are substrings of the current ngram
        largest_ngrams = [larger_ngram for larger_ngram in largest_ngrams if larger_ngram not in ngram]
        # Add the current ngram to the largest_ngrams list
        largest_ngrams.append(ngram)

# Create a sub-dataframe with only the largest ngrams
df_largest_ngrams = df_combined[df_combined['ngrams'].isin(largest_ngrams)]

# Display the sub-dataframe
df_largest_ngrams.head()

Unnamed: 0,ngrams,histogram_count,ngram_count,privacy_unit_count,ngram_size,histogram_count_x,ngram_count_x,privacy_unit_count_x,ngram_size_x,histogram_count_y,ngram_count_y,privacy_unit_count_y,ngram_size_y
1,development,33.299593,278.0,184.0,1.0,,,,,,,,
15,value,26.905074,348.0,168.0,1.0,,,,,,,,
33,personal,32.754445,184.0,152.0,1.0,,,,,,,,
43,Analyze,83.339315,326.0,289.0,1.0,,,,,,,,
58,first,25.154175,286.0,183.0,1.0,,,,,,,,


In [3]:
# Get ngrams larger than size 4

ngrams_larger_than_4 = df_largest_ngrams[df_largest_ngrams['ngram_size'] > 4]
ngrams_larger_than_4

Unnamed: 0,ngrams,histogram_count,ngram_count,privacy_unit_count,ngram_size,histogram_count_x,ngram_count_x,privacy_unit_count_x,ngram_size_x,histogram_count_y,ngram_count_y,privacy_unit_count_y,ngram_size_y
822,Condense text to one page,157.896715,191.0,175.0,5.0,,,,,,,,
830,Condense document to one page,136.860519,169.0,157.0,5.0,,,,,,,,
834,Generate image prompt for visualization,46.948407,51.0,51.0,5.0,,,,,,,,
841,Enhance text clarity and engagement,109.144206,128.0,117.0,5.0,,,,,,,,
842,Extract specific information from text,53.998258,71.0,59.0,5.0,,,,,,,,
843,Generate image prompts for visualization,1592.604526,2055.0,1736.0,5.0,,,,,,,,
844,Condense document into one page,57.992179,73.0,64.0,5.0,,,,,,,,
846,Revise code with provided suggestions,85.649121,94.0,90.0,5.0,,,,,,,,
853,Summarize text to one page,57.56654,76.0,69.0,5.0,,,,,,,,
862,Revise code for clarity and,9.778175,14.0,13.0,5.0,,,,,,,,


In [4]:
# Get ngrams with the coverage given that the total unique users/ privacy units is 24604
ngrams_larger_than_4 = ngrams_larger_than_4[['ngrams', 'privacy_unit_count']]
ngrams_larger_than_4['coverage'] = ngrams_larger_than_4['privacy_unit_count'] / 24604 * 100
ngrams_larger_than_4 = ngrams_larger_than_4.drop(columns=['privacy_unit_count'])
ngrams_larger_than_4 = ngrams_larger_than_4.sort_values(by='coverage', ascending=False)
ngrams_larger_than_4.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ngrams_larger_than_4['coverage'] = ngrams_larger_than_4['privacy_unit_count'] / 24604 * 100


Unnamed: 0,ngrams,coverage
843,Generate image prompts for visualization,7.055763
877,Generate image prompts for generative AI,5.547878
914,I'm sorry but I can't assist with that request,4.645586
872,Rewrite text for clarity and conciseness,1.28028
873,Expand text with details and examples,0.938872


# Cluster the ngrams 

In [5]:
# Perform agglomerative clustering
# Use sentence transformers model to create embeddings: https://sbert.net/docs/sentence_transformer/pretrained_models.html
from sklearn.cluster import AgglomerativeClustering

from sentence_transformers import SentenceTransformer

# Load the embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

  from tqdm.autonotebook import tqdm, trange


In [6]:
ngrams_list = ngrams_larger_than_4['ngrams'].tolist()

corpus = ngrams_list
corpus_embeddings = embedder.encode(corpus)

# Some models don't automatically normalize the embeddings, in which case you should normalize the embeddings:
# corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

# Perform agglomerative clustering
clustering_model = AgglomerativeClustering(
    n_clusters=None, distance_threshold=1
)  # , affinity='cosine', linkage='average', distance_threshold=0.4)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(corpus[sentence_id])

In [7]:
cluster_df= pd.DataFrame(columns=["cluster_id", "cluster_sentences"])
sentences_list = []
cluster_id_list = []

for i, sentences in clustered_sentences.items():
    sentences_list.append(sentences)
    cluster_id_list.append(i)

cluster_df["cluster_id"] = cluster_id_list
cluster_df["cluster_sentences"] = sentences_list

cluster_df.head(3)

Unnamed: 0,cluster_id,cluster_sentences
0,11,"[Generate image prompts for visualization, Gen..."
1,8,"[Generate image prompts for generative AI, Cre..."
2,3,[I'm sorry but I can't assist with that reques...


# Create themes/ topics for each cluster

In [None]:
import os
import sys
# Check if we're in the local development or AML environment
current_dir = os.getcwd()

if os.path.exists(os.path.join(os.path.dirname(current_dir),'rephrase','utils')):
    print("In local development mode, adding common modules to path")
    # In development, add the project root to sys.path
    sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(current_dir), '..')))

from rephrase.utils.api.azure_openai_api import get_azure_openai_client

api_version = "2023-07-01-preview"

# https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal#create-a-resource
# Replace the endpoint with the endpoint of your Azure OpenAI resource
endpoint = ""
client = get_azure_openai_client(api_version=api_version, endpoint=endpoint)

In local development mode, adding common modules to path


In [14]:
# Read prompt template from the markdown file
with open('./utils/prompts/topic_detection_prompt/prompt.md', 'r') as file:
    prompt = file.read()
print(prompt)

Given the following clustered list of ngrams, Try to create a common theme for the whole cluster. Keep the themes to 1-2 words at max. If possible try to extract the theme from the cluster of ngram sentences itself. 

                    `{}`
     If the cluster is just made up of ngram sentences that contain just prepositions/connector words and has no Verb, Noun, Adjective than the theme is General.  
     
     Return only the final theme as a string. Provide no explanation for the themes you provided and add no prefix or suffix text to the result.


In [15]:
# Generate themes from the clusters
total_theme_result = []

for index, row in cluster_df.iterrows():
    local_ngram_dict_list = []
    row_cluster_sentences = row["cluster_sentences"]
    
    response =  client.chat.completions.create(
            # Replace with your own deployed model name
            model="",
            messages=[
                {
                    "role": "user",
                    "content": prompt.format(row_cluster_sentences),
                }
            ],
            seed=42,
            temperature=0.0,
            max_tokens=4000,
            )
    response_text = response.choices[0].message.content
    temp_dict = {"ngrams list": row_cluster_sentences, "theme": response_text}
    print(temp_dict)
    local_ngram_dict_list.append(temp_dict)
    total_theme_result.extend(local_ngram_dict_list)


{'ngrams list': ['Generate image prompts for visualization', 'Generate image prompt for visualization'], 'theme': 'Visualization'}
{'ngrams list': ['Generate image prompts for generative AI', 'Create image prompts for generative AI', 'Generate image prompt for generative AI'], 'theme': 'Generative AI'}
{'ngrams list': ["I'm sorry but I can't assist with that request", "I'm sorry but I can't comply with that request"], 'theme': 'Apology'}
{'ngrams list': ['Rewrite text for clarity and conciseness', 'Revise text for clarity and conciseness', 'Rephrase text for clarity and conciseness', 'Paraphrase text for clarity and conciseness'], 'theme': 'Editing'}
{'ngrams list': ['Expand text with details and examples', 'Summarize text to one page'], 'theme': 'Text Management'}
{'ngrams list': ['Condense text to one page', 'Condense document to one page', 'Condense document into one page'], 'theme': 'Condense document'}
{'ngrams list': ['Generate product description with keywords and SEO', 'Generat

In [16]:
import pandas as pd
# Convert the total_theme_result list to a DataFrame
df_total_theme_result = pd.DataFrame(total_theme_result)

# Initialize an empty dictionary to store the total coverage for each ngram cluster
cluster_coverage = []

# Iterate through each row in df_total_theme_result
for index, row in df_total_theme_result.iterrows():
    ngram_cluster = row['ngrams list']
    total_coverage = 0
    
    # Iterate through each ngram in the ngram_cluster
    for ngram in ngram_cluster:
        # Check if the ngram is present in ngrams_larger_than_4 dataframe
        if ngram in ngrams_larger_than_4['ngrams'].values:
            # Get the coverage of the ngram and add it to the total coverage
            total_coverage += ngrams_larger_than_4[ngrams_larger_than_4['ngrams'] == ngram]['coverage'].values[0]
    
    # Store the total coverage for the ngram cluster
    cluster_coverage.append(total_coverage)

# Add the coverage as a new column in df_total_theme_result
df_total_theme_result['coverage'] = cluster_coverage

# Display the updated DataFrame
df_total_theme_result

Unnamed: 0,ngrams list,theme,coverage
0,"[Generate image prompts for visualization, Gen...",Visualization,7.263047
1,"[Generate image prompts for generative AI, Cre...",Generative AI,5.990896
2,[I'm sorry but I can't assist with that reques...,Apology,5.210535
3,"[Rewrite text for clarity and conciseness, Rev...",Editing,2.58901
4,"[Expand text with details and examples, Summar...",Text Management,1.219314
5,"[Condense text to one page, Condense document ...",Condense document,1.609494
6,[Generate product description with keywords an...,Product Generation,1.101447
7,"[ar 1 1 v 5 imagine prompt, 9 16 v 5 imagine p...",imagine prompt,1.158348
8,[style captured with a Canon EOS],Photography,0.487726
9,"[Enhance text clarity and engagement, Rewrite ...",Clarity and Engagement,0.699073
