In [2]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
from sentence_transformers import SentenceTransformer
import os
import openai
import time
import re
from IPython.display import clear_output
from tqdm import tqdm
model = SentenceTransformer("all-MiniLM-L6-v2")
values = open(".env", "r").read()
openai_api_key = re.findall(r"OPENAI_API_KEY=\"(.*)\"", values)[0]
openai_org = re.findall(r"OPENAI_ORG=\"(.*)\"", values)[0]
openai.api_key = openai_api_key
openai.organization = openai_org

In [4]:
attributes_container = []
users = ["louis030195", "naval"]

for user in users:
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(f"from:{user}").get_items()):
        if i > 100:
            break
        attributes_container.append(
            [user, tweet.date, tweet.likeCount, tweet.sourceLabel, tweet.content]
        )

# Creating a dataframe from the tweets list above
tweets_df = pd.DataFrame(
    attributes_container,
    columns=["User", "Date Created", "Number of Likes", "Source of Tweet", "Tweets"],
)
tweets_df

  [user, tweet.date, tweet.likeCount, tweet.sourceLabel, tweet.content]


Unnamed: 0,User,Date Created,Number of Likes,Source of Tweet,Tweets
0,louis030195,2022-10-31 10:58:34+00:00,1,Twitter Web App,@ZinaSarif Curious to know what algorithm is r...
1,louis030195,2022-10-30 16:44:11+00:00,0,Twitter Web App,I wrote a poem about linear algebra (@StableDi...
2,louis030195,2022-10-30 16:00:46+00:00,0,Twitter Web App,@fchollet Processes and culture are code for p...
3,louis030195,2022-10-30 15:35:47+00:00,1,Twitter Web App,@StableDiffusion &amp; @OpenAI #gpt3 in @obsdm...
4,louis030195,2022-10-16 07:53:41+00:00,0,Twitter Web App,@twitnickl I really appreciate your comment Ni...
...,...,...,...,...,...
184,naval,2022-10-02 05:54:06+00:00,62,Twitter for iPhone,@ramanjokhakar Good line. Nation-scale virtue ...
185,naval,2022-10-02 05:52:54+00:00,37,Twitter for iPhone,@aeyakovenko This is a very high risk strategy...
186,naval,2022-10-02 05:28:39+00:00,10,Twitter for iPhone,@ChristiaNagel @aeyakovenko Anyway I don’t hav...
187,naval,2022-10-02 05:27:37+00:00,15,Twitter for iPhone,@ChristiaNagel @aeyakovenko Nobody knows what ...


In [6]:
def build_prompt(tweet):
    return f"""Tweet content: A rational person can find peace by cultivating indifference to things outside of their control.

Short list of main topics: rationality,wisdom,stoicism

###

Tweet content: Good product design requires an obsessed artist - it’s not a part-time job.

Short list of main topics: product-design,art,product

###

Tweet content: {tweet}

Short list of main topics:"""


delay = 3
embeddings = []
for k, v in tqdm(tweets_df.iterrows()):
    clear_output(wait=True)
    tweet = v["Tweets"]
    prompt = build_prompt(tweet)
    print(f"Prompt: {prompt}")
    try:
        response = openai.Completion.create(
            model="code-davinci-002",
            prompt=prompt,
            temperature=0,
            max_tokens=256,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=["###", "\n"],
        )
    # in case of RateLimitError, wait 1 minute and try again
    except Exception as e:
        print(f"Error: {e}")
        time.sleep(60)
        continue
    text = response["choices"][0]["text"]
    # TODO: could listify & filter duplicate
    if not text:
        print("Could not generate topics for tweet: ", tweet)
        time.sleep(delay)
        continue
    print(f"Completion: {text}")
    tweets_df.at[k, "Topics"] = text
    embeddings.append(model.encode([text]))
    time.sleep(delay)


Prompt: Tweet content: A rational person can find peace by cultivating indifference to things outside of their control.

Short list of main topics: rationality,wisdom,stoicism

###

Tweet content: Good product design requires an obsessed artist - it’s not a part-time job.

Short list of main topics: product-design,art,product

###

Tweet content: @Andrei_Pavel_ When it’s rational to do so, yes.

Short list of main topics:
Completion:  rationality,rationality


189it [28:23,  9.01s/it]


In [7]:
tweets_df

Unnamed: 0,User,Date Created,Number of Likes,Source of Tweet,Tweets,Topics
0,louis030195,2022-10-31 10:58:34+00:00,1,Twitter Web App,@ZinaSarif Curious to know what algorithm is r...,"algorithm,running"
1,louis030195,2022-10-30 16:44:11+00:00,0,Twitter Web App,I wrote a poem about linear algebra (@StableDi...,"linear-algebra,gpt3,poem"
2,louis030195,2022-10-30 16:00:46+00:00,0,Twitter Web App,@fchollet Processes and culture are code for p...,"programming,culture,processes"
3,louis030195,2022-10-30 15:35:47+00:00,1,Twitter Web App,@StableDiffusion &amp; @OpenAI #gpt3 in @obsdm...,"gpt3,openai,obsdmd"
4,louis030195,2022-10-16 07:53:41+00:00,0,Twitter Web App,@twitnickl I really appreciate your comment Ni...,
...,...,...,...,...,...,...
184,naval,2022-10-02 05:54:06+00:00,62,Twitter for iPhone,@ramanjokhakar Good line. Nation-scale virtue ...,
185,naval,2022-10-02 05:52:54+00:00,37,Twitter for iPhone,@aeyakovenko This is a very high risk strategy...,"ukraine,russia,nuclear"
186,naval,2022-10-02 05:28:39+00:00,10,Twitter for iPhone,@ChristiaNagel @aeyakovenko Anyway I don’t hav...,"peace,game-theory,preferences"
187,naval,2022-10-02 05:27:37+00:00,15,Twitter for iPhone,@ChristiaNagel @aeyakovenko Nobody knows what ...,"politics,russia,ukraine"


In [8]:
# save the dataframe to a csv file
tweets_df.to_csv("tweets.csv", index=False)

In [3]:
tweets_df = pd.read_csv("tweets.csv")

In [6]:
# remove nan topics
tweets_df = tweets_df.dropna(subset=["Topics"])

In [7]:
# turn the topics column into a list by splitting on commas
tweets_df["Topics"] = tweets_df["Topics"].apply(lambda x: x.split(","))
tweets_df

Unnamed: 0,User,Date Created,Number of Likes,Source of Tweet,Tweets,Topics
0,louis030195,2022-10-31 10:58:34+00:00,1,Twitter Web App,@ZinaSarif Curious to know what algorithm is r...,"[ algorithm, running]"
1,louis030195,2022-10-30 16:44:11+00:00,0,Twitter Web App,I wrote a poem about linear algebra (@StableDi...,"[ linear-algebra, gpt3, poem]"
2,louis030195,2022-10-30 16:00:46+00:00,0,Twitter Web App,@fchollet Processes and culture are code for p...,"[ programming, culture, processes]"
3,louis030195,2022-10-30 15:35:47+00:00,1,Twitter Web App,@StableDiffusion &amp; @OpenAI #gpt3 in @obsdm...,"[ gpt3, openai, obsdmd]"
5,louis030195,2022-10-14 17:37:34+00:00,0,Twitter Web App,Programming is not different from poetry and t...,"[ programming, poetry, maths]"
...,...,...,...,...,...,...
182,naval,2022-10-02 06:04:21+00:00,8,Twitter for iPhone,@ComplexDigi The UN is a venue for negotiation...,"[ un, armed, negotiation]"
185,naval,2022-10-02 05:52:54+00:00,37,Twitter for iPhone,@aeyakovenko This is a very high risk strategy...,"[ ukraine, russia, nuclear]"
186,naval,2022-10-02 05:28:39+00:00,10,Twitter for iPhone,@ChristiaNagel @aeyakovenko Anyway I don’t hav...,"[ peace, game-theory, preferences]"
187,naval,2022-10-02 05:27:37+00:00,15,Twitter for iPhone,@ChristiaNagel @aeyakovenko Nobody knows what ...,"[ politics, russia, ukraine]"


In [21]:
from sentence_transformers import SentenceTransformer, util

# split dataset into a map of sets of topics per user
topics_per_user = {}
for k, v in tweets_df.iterrows():
    user = v["User"]
    if user not in topics_per_user:
        topics_per_user[user] = set()
    for e in v["Topics"]: topics_per_user[user].add(e)

In [26]:
# common topics
common_topics = topics_per_user["louis030195"].intersection(topics_per_user["naval"])
print(f"Common topics: {common_topics}")

Common topics: {'wisdom', 'books', 'product', ' art', 'startups', 'social', ' truth', 'startup', 'truth'}


In [30]:
# compute the average embedding per user
user_embeddings = {}
for user, topics in topics_per_user.items():
    user_embeddings[user] = model.encode([", ".join(topics)])

# compute the cosine similarity between the embeddings
cosine_scores = util.pytorch_cos_sim(
    user_embeddings["louis030195"], user_embeddings["naval"]
)
print(f"Similarity score: {cosine_scores}")

Similarity score: tensor([[0.7416]])


In [31]:
user_embeddings["louis030195"].shape

(1, 384)

In [32]:
user_embeddings["naval"].shape

(1, 384)