In [27]:
import numpy as np
from collections import defaultdict
from sklearn.cluster import AgglomerativeClustering
from joblib import Parallel, delayed
import itertools
import json
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain import PromptTemplate, LLMChain



def construct_prompt(prompt_parameters, prompt_template):
    prompt = prompt_template.format(**prompt_parameters)
    return(prompt)

def semantic_clustering(texts, distance_threshold):
    print("Calculating embeddings...")
    embeddings = OpenAIEmbeddings(texts)
    print("done.")

    embeddings = np.array(embeddings)

    hclust_model = AgglomerativeClustering(n_clusters = None, linkage = "average", metric = "cosine", distance_threshold = distance_threshold)
    clustering = hclust_model.fit(embeddings)

    if True:
        clusters = defaultdict(list)
        for text, cluster_label in zip(texts, clustering.labels_):
            clusters[int(cluster_label)].append(text)
        print("Num clusters: " + str(len(clusters.keys())))
        print("Printing top clusters")
        for cluster_id, cluster_texts in clusters.items():
            if len(cluster_texts) > 5:
                print("Cluster id: " + str(cluster_id))
                print("Size: " + str(len(cluster_texts)))
                print(json.dumps(cluster_texts, indent = 4))
                print()

    return(clustering.labels_)


def get_topic_comments_single_review(review, topic, prompt_template):
    prompt_parameters = {
            "review_text" : review["review_text"],
            "topic" : topic
            }
    prompt = construct_prompt(prompt_parameters, prompt_template)
    print(prompt)
    completion = OpenAI(prompt)
    if completion is None:
        comments = []
        return(comments)
    comment_numbers = [x.strip() for x in completion.split(",")]
    parsed_review_text = review["parsed_review_text"]
    print(comment_numbers)
    print(parsed_review_text)
    comments = []
    for x in comment_numbers:
        if x in parsed_review_text:
            comments.append((x, parsed_review_text[x]))            
    
    return(comments)


def get_topic_comments(parsed_reviews, topic):
    """Given parsed version of the text corpus, get all lines that are relevant to topic.
    
    Given a text + topic, we ask GPT to generate line numbers for lines relevant to the topic.
    By working with line numbers instead of the lines, we avoid hallucinations in retrived lines.
    We pull relevant lines from all individual texts across the entire corpus.
     
    :param parsed_reviews: The text corpus as a list. Each entry in the list is a dict.
        Each dict must have the key 'review_text', which stores the individual text
        Each dict must have the key 'parsed_review_text', which stores the parsed individual text.
    :param topic: the topic of interest
    :type reviews: list
    :type topic: str
    :returns: list of relevant lines. Each entry is of form (line_number, line_text)
    :rtype: list
    """
    
    
    prompt_template_1 = """{review_text}

        The above is a numbered list of comments about minecraft.
        Based on the above comments answer the below question.


        What comments have suggestions to improve {topic}.
        List their comment numbers as a comma seperated list.

        Comment numbers:
        """

    print("Getting comments from single reviews...")
    n_jobs = 4
    print("Starting " + str(n_jobs) + " parallel jobs.")
    suggestions = Parallel(n_jobs = n_jobs)(delayed(get_topic_comments_single_review)(review, topic, prompt_template_1) for review in parsed_reviews)
    print(suggestions)
    #suggestions are of form (number, quoted_comment)
    suggestions = list(itertools.chain(*suggestions))
    print("Done.")
    
    return(suggestions)


def summarize_cluster(topic, comments):
    """Given a topic and lines that are relevant to the topic, summarize the lines
    
    :param topic: the topic of interest
    :param comments: list of relevant lines. Each entry is of form (line_number, line_text)
    :type topic: string
    :type comments: list
    :returns: summary of lines
    :rtype: string
    """
    
    prompt_template = """{comments_text}

The above is a list of comments about {topic} in a minecraft chat.
Based on the above comments answer the below question.


Summarize the comments.
Summarize top 3 suggestions to improve {topic}. If there are no suggestions, mention 'no suggestions'.

"""
    
    comments_text = ""
    for comment_number, comment_text in comments:
        comments_text += "\n" + comment_text
        
    prompt_parameters = {
            "comments_text" : comments_text,
            "topic" : topic
            }
    prompt = OpenAI(prompt_parameters, prompt_template)
    print(prompt)
    completion = OpenAI(prompt)
    
    return(completion)

def get_topic_summaries_with_references(topic, comments, top_cluster_size_threshold):
    """
    Given a topic and lines that are relevant to the topic, extract dominant clusters of lines and summarize each cluster.
    
    comments contains the text of lines that are relevant to topic.
    Use semantic clustering on these lines to get clusters that are:
    1. Semantically highly similar
    2. Have a significant size (number of lines in the cluster > top_cluster_size_threshold)
    Clusters that satisfy these two properties are called dominant clusters, and capture
    the major information in the lines.
    The lines that dont belong to a dominant cluster are discarded before summary.
    Next, we summarize each dominant cluster and include the lines of that cluster as references for that summary.
    Since the dominant cluster is quite homogenous, we will get a good summary.
    Since the lines were extracted from the corpus via line numbers, they are reproduced verbatim here
    and avoid hallucinations in the references.
    Hence we good focussed summary of the dominant points with verbatim / non-hallucinated references.
    
    :param topic: the topic of interest
    :param comments: list of relevant lines. Each entry is of form (line_number, line_text)
    :param top_cluster_size_threshold: minimum size for a cluster to be a dominant cluster
    :type topic: string
    :type comments: list
    :type top_cluster_size_threshold: int
    :returns: Dict of summaries. Keys are the cluster ids, values are the summaries of that cluster.
        The cluster summaries are dicts. Keys are comments, summary.
        The value of comments are the lines in that cluster.
        And summary is the summary of those lines (clusterwise summary).
    :rtype: dict
    """
    
    # 1) Calculate embeddings
    print("Calculating embeddings...")
    embeddings_model = OpenAIEmbeddings()
    embeddings = embeddings_model.embed_documents(texts=comments)
    print("done.")

    embeddings = np.array(embeddings)
    print(embeddings.shape)

    # 3) Do hierarchical clustering
    distance_threshold = 0.2
    hclust_model = AgglomerativeClustering(n_clusters = None, linkage = "average",
                                           metric = "cosine", distance_threshold = distance_threshold)
    clustering = hclust_model.fit(embeddings)

    # 4) Massage into clusters of comments
    clusters = defaultdict(list)
    for x, cluster_label in zip(comments, clustering.labels_):
        comment_number, comment_text = x
        clusters[int(cluster_label)].append((comment_number, comment_text))
    print("Num clusters: " + str(len(clusters.keys())))

    # 5) Select the top clusters and summarize them.
    top_clusters = {}
    for cluster_label, comments in clusters.items():
        num_comments_in_cluster = len(comments)
        if num_comments_in_cluster > top_cluster_size_threshold:
            top_cluster_comments = comments
            top_cluster_summary = summarize_cluster(topic, comments)
            top_cluster_info = {"comments" : top_cluster_comments,
                               "summary" : top_cluster_summary}
            top_clusters[cluster_label] = top_cluster_info
    print("Num top clusters: " + str(len(top_clusters.keys())))
            
    return(top_clusters)

def print_topic_summaries_with_references(topic, topic_summaries_with_references):
    """Pretty print the summaries with references of the topic.
    
    :param topic: the topic
    :param topic_summaries_with_references: Dict of summaries. Keys are the cluster ids, values are the summaries of that cluster.
        The cluster summaries are dicts. Keys are comments, summary.
        The value of comments are the lines in that cluster.
        And summary is the summary of those lines (clusterwise summary).
    :type topic: str
    :type topic_summaries_with_references: dict
    """

    print("Topic is: " + topic)
    for cluster_label, summary_with_references in topic_summaries_with_references.items():
        comments = summary_with_references["comments"]
        summary = summary_with_references["summary"]
        print("\nSummary with references:\n")
        print("Summary: \n" + summary + "\n")
        print("References: \n")
        for comment_number, comment_text in comments:
            print(comment_number + ":" + comment_text)

In [2]:
from src.semantic_clustering import semantic_clustering
import pandas as pd

In [24]:
df = pd.read_csv("../data/data_science_jobs_indeed_usa.csv")

In [5]:
print(df.shape)
df.head()

(1200, 10)


Unnamed: 0.1,Unnamed: 0,Title,Company,Location,Rating,Date,Salary,Description,Links,Descriptions
0,0,Data Scientist,Driven Brands,"Benicia, CA",2.4,PostedPosted 26 days ago,,You’ll be working alongside a team of eight an...,https://www.indeed.com/rc/clk?jk=74d176d595225...,We invite you to join us at Driven Brands!\nHe...
1,1,Business Analyst,Sabot Consulting,Remote,,PostedPosted 4 days ago,$80 - $120 an hour,Preferred candidates will have prior experienc...,https://www.indeed.com/rc/clk?jk=f662b2efb509b...,Sabot Consulting (Sabot) is a management consu...
2,2,IT Business Intelligence Developer (FT) Remote...,Ballad Health,"Remote in Blountville, TN",3.0,PostedPosted 30+ days ago,,Job Details Apply Save Print this job Email a…,https://www.indeed.com/rc/clk?jk=58612836c63b8...,Job Details\nApply\nSave\nPrint this job\nEmai...
3,3,Data Engineer,Longevity Holdings Inc.,"Remote in Minneapolis-Saint Paul, MN",,PostedPosted 3 days ago,"$90,000 - $110,000 a year",Incorporate core data management competencies ...,https://www.indeed.com/company/TwentyFirst/job...,Position: Data Engineer\nLocation: MN\nAs a Da...
4,4,Network Administrator/dba developer,WKI Kenworth,"Wichita, KS 67219",,EmployerActive 2 days ago,"$50,000 - $70,000 a year",The Network Administrator provides 2nd level e...,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Full Job Description\nThe Network Administrato...


In [20]:
text = df['Description']

In [21]:
print(text[0])

You’ll be working alongside a team of eight analysts & data scientists - collaborating to design datasets, develop analytical models & provide insights.


In [10]:
test = semantic_clustering(texts=text, distance_threshold=0.2)

Calculating embeddings...
done.
Num clusters: 67
Printing top clusters
Cluster id: 6
Size: 721
[
    "You\u2019ll be working alongside a team of eight analysts & data scientists - collaborating to design datasets, develop analytical models & provide insights.",
    "Preferred candidates will have prior experience in implementing Cloud-hosted business process migration in Software as a Service (SAAS) implementations\u2026",
    "Incorporate core data management competencies including data governance, data security, and data quality.\nPartner closely with our data scientists to ensure the\u2026",
    "Ensure adaption of data science product with the business users to drive outcome and patient impact.\nAbility to work with marketing agencies to define the data\u2026",
    "5+ years of relevant experience in report development, data science, business analytics, business intelligence, or comparable data engineering role, including\u2026",
    "Stay aware of emerging data science techniques,

In [15]:
df['cluster_label'] = test

In [33]:
df.reset_index(inplace=True)

In [44]:
lines =  df[['Unnamed: 0', 'Descriptions']]

In [40]:
df.head(2)

Unnamed: 0.1,index,Unnamed: 0,Title,Company,Location,Rating,Date,Salary,Description,Links,Descriptions,lines
0,0,0,Data Scientist,Driven Brands,"Benicia, CA",2.4,PostedPosted 26 days ago,,You’ll be working alongside a team of eight an...,https://www.indeed.com/rc/clk?jk=74d176d595225...,We invite you to join us at Driven Brands!\nHe...,0 0\n1 1\n2 2\n3 ...
1,1,1,Business Analyst,Sabot Consulting,Remote,,PostedPosted 4 days ago,$80 - $120 an hour,Preferred candidates will have prior experienc...,https://www.indeed.com/rc/clk?jk=f662b2efb509b...,Sabot Consulting (Sabot) is a management consu...,0 0\n1 1\n2 2\n3 ...


In [46]:
top_clusters = get_topic_summaries_with_references("Python ", lines, top_cluster_size_threshold=0.2)

Calculating embeddings...
done.
(1200, 1536)


ValueError: too many values to unpack (expected 2)