In [1]:
import os

import matplotlib.pyplot as plt
import networkx as nx
import nltk
import numpy as np
import pandas as pd
import scipy
from convokit import Corpus, download
from langchain.llms import OpenAI
from langchain.prompts.chat import (
    AIMessagePromptTemplate,
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

**Note:** Worked with Chanteria Milner on this assignment.

<img src="misc/syllabus_segment.png" style="width:400px">

# Constants, Utility Functions, and Data Importing

In [2]:
# Constants and clients
GPT_MODEL = "gpt-3.5-turbo"
MAX_CHAR_LEN = 5000000
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

openai_client = OpenAI(api_key=OPENAI_API_KEY)

                    api_key was transferred to model_kwargs.
                    Please confirm that api_key is what you intended.


In [3]:
# Utility functions


def kl_divergence(x, y):
    P = x.copy()
    Q = y.copy()
    P.columns = ["P"]
    Q.columns = ["Q"]
    df = Q.join(P).fillna(0)
    p = df.iloc[:, 1]
    q = df.iloc[:, 0]
    D_kl = scipy.stats.entropy(p, q)
    return D_kl


def chi2_divergence(x, y):
    P = x.copy()
    Q = y.copy()
    P.columns = ["P"]
    Q.columns = ["Q"]
    df = Q.join(P).fillna(0)
    p = df.iloc[:, 1]
    q = df.iloc[:, 0]
    return scipy.stats.chisquare(p, q).statistic


def corpus_divergence(corpus1, corpus2, difference="KL"):
    """Difference parameter can equal KL, Chi2, or Wass"""
    freqP = nltk.FreqDist(corpus1)
    P = pd.DataFrame(
        list(freqP.values()), columns=["frequency"], index=list(freqP.keys())
    )
    freqQ = nltk.FreqDist(corpus2)
    Q = pd.DataFrame(
        list(freqQ.values()), columns=["frequency"], index=list(freqQ.keys())
    )
    if difference == "KL":
        return kl_divergence(P, Q)
    elif difference == "Chi2":
        return chi2_divergence(P, Q)
    elif difference == "KS":
        try:
            return scipy.stats.ks_2samp(P["frequency"], Q["frequency"]).statistic
        except:
            return scipy.stats.ks_2samp(P["frequency"], Q["frequency"])
    elif difference == "Wasserstein":
        try:
            return scipy.stats.wasserstein_distance(
                P["frequency"], Q["frequency"], u_weights=None, v_weights=None
            ).statistic
        except:
            return scipy.stats.wasserstein_distance(
                P["frequency"], Q["frequency"], u_weights=None, v_weights=None
            )


def get_density(df):
    data = df
    density = scipy.stats.gaussian_kde(data)
    width = np.max(data) - np.min(data)
    xs = np.linspace(np.min(data) - width / 5, np.max(data) + width / 5, 600)
    density.covariance_factor = lambda: 0.25
    density._compute_covariance()
    return xs, density(xs)


def draw_network(df, title):
    plt.figure(figsize=(8, 8))
    G = nx.DiGraph()
    for from_ in df.index:
        for to_ in df.columns:
            G.add_edge(from_, to_, weight=df.loc[from_][to_])

    pos = nx.spring_layout(G, k=0.55, iterations=20)
    edges, weights = zip(*nx.get_edge_attributes(G, "weight").items())
    weights = np.array(weights)
    # weights = weights*weights
    weights = 6 * weights / np.max(weights)
    print(title)

    edge_colors = 20 * (weights / np.max(weights))
    edge_colors = edge_colors.astype(int)
    #     nx.draw_networkx_nodes(G,pos,node_size=1200,alpha=0.7,node_color='#99cef7')
    #     nx.draw_networkx_edges(G,pos,edge_color=edge_colors)
    #     nx.draw_networkx_labels(G,pos,font_weight='bold')
    nx.draw(
        G,
        pos,
        with_labels=True,
        font_weight="bold",
        width=weights,
        edge_color=255 - edge_colors,
        node_color="#99cef7",
        node_size=1200,
        alpha=0.75,
        arrows=True,
        arrowsize=20,
    )
    return edge_colors


def create_system_message_prompt():
    """Creates a system message prompt"""
    personality_template = """
    The following is a conversation with an AI assistant.
    """
    return SystemMessagePromptTemplate.from_template(personality_template)


def create_chat_prompt(human_history, ai_history):
    """Creates a chat prompt template with human history, and AI history."""
    messages = []
    create_system_message_prompt()

    for h, a in zip(human_history, ai_history):
        messages.append(HumanMessagePromptTemplate.from_template(h))
        messages.append(AIMessagePromptTemplate.from_template(a))

    messages.append(HumanMessagePromptTemplate.from_template("{input}"))
    return ChatPromptTemplate.from_messages(messages)


def query_chain(chain, input_text):
    """Queries the conversation chain with the given input."""
    return chain.run(input_text)

In [4]:
# Data importing
stack_exchange_corpus = Corpus(filename=download("stack-exchange-politeness-corpus"))

Dataset already exists at /Users/michaelp/.convokit/downloads/stack-exchange-politeness-corpus


## <font color="red">*Exercise 1*</font>

<font color="red">Construct cells immediately below this that use ConvoKit to analyze a Corpus other 
than 'subreddit-Cornell', including at least one function you find in the package 
not used above. You can also generate a ConvoKit Corpus from your own dataset based 
on [their Corpus from .txt files tutorial](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/examples/converting_movie_corpus.ipynb) or [their Corpus from pandas tutorial](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/examples/corpus_from_pandas.ipynb), but that may 
be time-consuming for a weekly assignment.

In [12]:
stack_exchange_corpus.print_summary_stats()

Number of Speakers: 1
Number of Utterances: 6603
Number of Conversations: 6603


In [13]:
# Number of utterances in the corpus (not used in homework)
len(stack_exchange_corpus.get_utterance_ids())

6603

In [14]:
# Show an utterance object (not used in homework)
stack_exchange_corpus.get_utterance("0")

Utterance({'obj_type': 'utterance', 'vectors': [], 'speaker_': Speaker({'obj_type': 'speaker', 'vectors': [], 'owner': <convokit.model.corpus.Corpus object at 0x28c545d50>, 'id': 'user', 'meta': ConvoKitMeta({})}), 'owner': <convokit.model.corpus.Corpus object at 0x28c545d50>, 'id': '0', 'meta': ConvoKitMeta({'Normalized Score': 0.21732585796906329, 'Binary': 0, 'Annotations': {'A33SMNMTMIOJ6T': 12, 'A2OXXHGAM7B0Y': 16, 'A28TXBSZPWMEU9': 15, 'A3EJ5TT2ZGBIDA': 16, 'A3OY0OL2M0HUTT': 13}, 'parsed': [{'rt': 2, 'toks': [{'tok': 'Can', 'tag': 'MD', 'dep': 'aux', 'up': 2, 'dn': []}, {'tok': 'you', 'tag': 'PRP', 'dep': 'nsubj', 'up': 2, 'dn': []}, {'tok': 'explain', 'tag': 'VB', 'dep': 'ROOT', 'dn': [0, 1, 3, 4, 6, 10, 11]}, {'tok': 'more', 'tag': 'RBR', 'dep': 'dobj', 'up': 2, 'dn': []}, {'tok': 'in', 'tag': 'IN', 'dep': 'prep', 'up': 2, 'dn': [5]}, {'tok': 'detail', 'tag': 'NN', 'dep': 'pobj', 'up': 4, 'dn': []}, {'tok': ',', 'tag': ',', 'dep': 'punct', 'up': 2, 'dn': []}, {'tok': 'what', 't

## <font color="red">*Exercise 2*</font>

<font color="red">Construct cells immediately below this that perform a similar social 
similarity or influence analysis on a dataset relevant to your final project (__or 
one from ConvoKit__). Create relationships between actors in a network based on your 
dataset (e.g., person to person or document to document), and perform analyses that 
interrogate the structure of their interactions, similarity, and/or influence on 
one another. (For example, if relevant to your final project, you could explore 
different soap operas, counting how many times a character may have used the word 
love in conversation with another character, and identify if characters in love 
speak like each other. Or do opposites attract?) What does that analysis and its 
output reveal about the relative influence of each actor on others? What does it 
reveal about the social game being played?

<font color="red">Stretch 1:
Render the social network with weights (e.g., based on the number of scenes in 
which actors appear together), then calculate the most central actors in the 
`show.Realtime` output can be viewed in shell.

<font color="red">Stretch 2:
Implement more complex measures of similarity based on the papers you have read.

## <font color="red">*Exercise 3*</font>

<font color="red">Review the documentation for tools and agents from LangChain. Use at 
least two tools with appropriate agents discovered during your review to construct a 
chain addressing questions pertinent to your final project. If your project dataset 
is unsuitable for this task, select an alternative small-sized dataset for 
implementation.

## <font color="red">*Exercise 4*</font>

<font color="red">Use LangChain(you're welcome to not use it) to set up conversations with LLM 
agents for questions related to your final project (if relevant), or think of a 
scenario that a simulated conversation could be useful to answer a research question 
and find a dataset to implement it. What does it reveal about the social game involved 
with your dataset?

<font color="red"> Stretch: Use the idea of memory retrieval(or other methods) to design better 
templates for the LLM conversation.