### Import libraries

In [1]:
import pandas as pd
import numpy as np
import openai
import faiss

### Load dataset

In [2]:
file_path = 'data/liked-tweets.csv'
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,url,text
0,https://x.com/anothercohen/status/174527559909...,If you're questioning whether it's too early t...
1,https://x.com/diligentium/status/1744994086929...,Cool toy! 👍\n\nBut the R1 seems to rely on the...
2,https://x.com/naval/status/1002104154737684480...,Learn to sell. Learn to build. If you can do b...
3,https://x.com/jxmnop/status/173712987574646814...,Seen a lot of evidence that GPT-4 crushes Gemi...
4,https://x.com/historyinmemes/status/1736856310...,Kanye West on the phone with Taylor Swift befo...


### Generate embeddings

In [3]:
def get_embedding(text):
    response = openai.Embedding.create(input=[text], engine="text-embedding-ada-002")
    return np.array(response['data'][0]['embedding'])


In [6]:
# Generate embeddings for each tweet
embeddings = np.array([get_embedding(tweet) for tweet in df['text']])  

### Store embeddings in vector database

In [8]:
dimension = embeddings.shape[1]  # Dimension of the embeddings
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for similarity
index.add(embeddings.astype(np.float32))  # Ensure the data type is float32

In [9]:
# Save the index
faiss.write_index(index, "data/liked-tweets-embeddings.index")

### Search relevant tweets

In [10]:
def search_tweets(query, k):
    embedding = get_embedding(query)
    distances, indices = index.search(np.array([embedding]).astype(np.float32), k)
    return (df.iloc[indices[0]], distances[0]) # returns the dataframe and the distances

In [11]:
relevant_tweets = search_tweets("How to find warm leads for my business?", 5)
print(relevant_tweets)

(                                                  url  \
23  https://x.com/natiakourdadze/status/1711776939...   
55  https://x.com/gregisenberg/status/173619402799...   
57  https://x.com/gregisenberg/status/173475822893...   
42  https://x.com/gregisenberg/status/174402039827...   
45  https://x.com/gregisenberg/status/174231412994...   

                                                 text  
23  How to find warm leads i.e. get customers easi...  
55  How to win on the internet:\n\n1. Marry the ni...  
57  Lifehacks: \n\n1. Use social media apps on des...  
42  Here are 50 realizations that changed my life ...  
45  27 phrases to remember for 2024 if you’re an e...  , array([0.28786168, 0.39365435, 0.4061313 , 0.40622246, 0.40932983],
      dtype=float32))


In [35]:
relevant_tweets = search_tweets("What are people saying about Kanye West or Taylor Swift?", 5)
print(relevant_tweets)

(                                                  url  \
4   https://x.com/historyinmemes/status/1736856310...   
53  https://x.com/gregisenberg/status/173711300761...   
18  https://x.com/bgurley/status/17266307243533151...   
33  https://x.com/gregisenberg/status/174698153268...   
42  https://x.com/gregisenberg/status/174402039827...   

                                                 text  
4   Kanye West on the phone with Taylor Swift befo...  
53  Adobe abandoning its $20b acquisition of Figma...  
18  If you told me 10 years ago that a group of th...  
33  Who lives in LA or SF?\n\nI'll be visiting soo...  
42  Here are 50 realizations that changed my life ...  , array([0.30583128, 0.50176364, 0.5033319 , 0.50669837, 0.50982416],
      dtype=float32))


In [36]:
relevant_tweets = search_tweets("What are people saying about OpenAI?", 5)
print(relevant_tweets)

(                                                  url  \
20  https://x.com/VCBrags/status/17260640509271818...   
26  https://x.com/holdenmatt/status/16904365380675...   
19  https://x.com/thecaptain_nemo/status/172654458...   
17  https://x.com/andykreed/status/172647134512558...   
21  https://x.com/varun_mathur/status/172597141823...   

                                                 text  
20  The number one thing on every venture capitali...  
26  OpenAI Functions is the coolest new tech I've ...  
19  drop the "OpenAI". just "Microsoft", its cleaner.  
17  Ex-Twitch CEO running OpenAI??? What’s he gonn...  
21  Dear Mira and Ilya - congratulations on your c...  , array([0.29594553, 0.301573  , 0.3490528 , 0.35370684, 0.35620123],
      dtype=float32))


In [40]:
relevant_tweets = search_tweets("What is the tweet about digital detox", 5)
print(relevant_tweets)

(                                                  url  \
56  https://x.com/gregisenberg/status/173605826090...   
35  https://x.com/gregisenberg/status/174688448317...   
57  https://x.com/gregisenberg/status/173475822893...   
38  https://x.com/gregisenberg/status/174601498932...   
42  https://x.com/gregisenberg/status/174402039827...   

                                                 text  
56  Someone will make $1B in the "digital detox" s...  
35  I figured out a way to make me 2x more product...  
57  Lifehacks: \n\n1. Use social media apps on des...  
38  Idea of the day:\n\nAcquire a struggling cruis...  
42  Here are 50 realizations that changed my life ...  , array([0.2945794 , 0.39345282, 0.43239492, 0.4785936 , 0.4874215 ],
      dtype=float32))


### Q&A + cite

In [12]:
def qna(question, k=5):
    # Search for relevant tweets
    relevant_tweets = search_tweets(question, k)[0]

    # Combine texts of relevant tweets
    combined_tweets_text = " ".join(relevant_tweets['text'])  

    # Using chat model for OpenAI completion
    response = openai.ChatCompletion.create(
        model="gpt-4",  
        messages=[
            {"role": "system", "content": "You are a helpful assistant helping the user leverage the information from the tweets they have saved."},
            {"role": "assistant", "content": f"Answer the question based on these tweets: {combined_tweets_text}. If there is not enough information, answer: 'Sorry, it seems like the Tweets do not provide enough information about this topic.' \n\nQuestion: {question}\nAnswer:"}
        ]
    )

    answer = response.choices[0].message['content'] if response.choices else "No response"

    # Check if the answer indicates sufficient relevant information
    if 'Sorry, it seems like the Tweets do not provide enough information about this topic.'.lower() not in answer.lower():
        # Add citation list at the end
        citations = "\n".join([f"[{i+1}] {row['url']}" for i, row in relevant_tweets.iterrows()])
        answer += f"\n\nCitations:\n{citations}"

    return answer


In [13]:
question = "Come up with a plan about bootstrapping a startup in the next 3 months."
print(qna(question))

Based on the insights gathered from the tweets, here is a potential 3-month plan for bootstrapping your startup:

Month 1: Idea Generation and Market Research
- Start by brainstorming your startup ideas. Remember, the best startup ideas are usually painfully obvious. Make sure the idea has a clear path to reach $10M+ per year.
- Study the market niche, learn from the community. According to tweet insights, your startup idea will come to you. 
- Identify your potential paying customers and engage with them to discover their needs and pain-points. Remember, paying customers are the best VCs.

Month 2: Building Prototype and Community Engagement
- Once your idea is refined, start creating your MVP (Minimum Viable Product). Remember, 90% of MVPs can be created in 24 hours or less.
- Begin building your community before building the product. Engage with people in your niche on social media, build a meme page if you have to, just understand them.
- Focus on customers' fulfillment and growth.

In [86]:
question = "What are people saying about OpenAI?"
print(qna(question))

The discussion about OpenAI is quite varied. There are a lot of people who think very highly of OpenAI's new technology, describing it as the "coolest new tech" they have used in a while and claiming that it has changed how they think about building software. The term "linguistic computation" is used in a positive manner here.

On the other hand, there seems to be quite a bit of controversy and concern surrounding the management and business aspects of OpenAI. There's speculation that the company is facing significant challenges, including poor unit economics, the loss of key staff including top researchers and a foremost dealmaker, and potential clashes with partners like Microsoft. Commentators suggest that new leadership at OpenAI might lack the necessary business experience to successfully raise capital and support their team.

There's anticipation of more difficulties ahead, with predictions that OpenAI may eventually be wholly acquired by Microsoft and its leaders relegated to le

In [42]:
# question = "What are people saying about Figma and Adobe?"
# print(qna(question))

According to the tweets, there was a potential acquisition of Figma by Adobe that was valued at $20 billion. However, the deal fell through reportedly due to issues with a regulatory body in the UK. The process of Adobe deciding not to buy Figma allegedly took 15 months because of the time it took for them to open up Acrobat Reader to check out the contract. This situation has resulted in discussions about the challenges of being a VC-backed founder and the potential benefits of bootstrapping or raising capital and issuing dividends.

Citations:
[8] https://x.com/darylginn/status/1736756570414010398?s=20
[54] https://x.com/gregisenberg/status/1737113007619264672?s=20
[13] https://x.com/jsngr/status/1731393088013131944?s=20
[42] https://x.com/gregisenberg/status/1744800699530039717?s=20
[19] https://x.com/bgurley/status/1726630724353315139?s=20


In [83]:
# question = "What was the tweet about someone's failures and learnings."
# print(qna(question))

The user discussed their considerable losses during the year 2023 in one of the tweets. The user mentioned a few key failures such as:

1. Listening to bad advice that recommended keeping the team lean with the anticipation of a looming recession, which resulted in considerable lost revenue (~$3M).
   
2. The focus on hiring mid-level employees instead of investing in hires that multiply the business.

3. Not organizing enough 2-3 day IRL (In Real Life) masterminds, which are industry-specific brainstorming sessions.

4. Not buying a company in 2023 despite initial attempts, which however eventually improved their process and thesis for M&A (Mergers and Acquisitions).

Citations:
[43] https://x.com/gregisenberg/status/1744020398272913770?s=20
[44] https://x.com/gregisenberg/status/1743648532278501424?s=20
[58] https://x.com/gregisenberg/status/1734758228930515272?s=20
[37] https://x.com/gregisenberg/status/1746581102720893340?s=20
[52] https://x.com/gregisenberg/status/1738261461783187

In [84]:
# question = "What was the tweet about IRL events?"
# print(qna(question))

The tweet about IRL (in real life) events suggests that there might be a shift back towards people meeting in-person. It insinuates that there might be a growing trend towards people wanting to disconnect from digital spaces and return to real life interactions and experiences. The author predicts 2024 as the turning point for this shift and asserts that being more "dopamine sober" than "dopamine drunk" might become a modern day flex. The idea is that real life is becoming a place of escape from the internet.

Citations:
[18] https://x.com/andykreed/status/1726471345125589460?s=20
[59] https://x.com/gregisenberg/status/1734921305969877258?s=20
[43] https://x.com/gregisenberg/status/1744020398272913770?s=20
[58] https://x.com/gregisenberg/status/1734758228930515272?s=20
[57] https://x.com/gregisenberg/status/1736058260908572793?s=20


In [85]:
# question = "What are interesting and profitable trends and opportunities?"
# print(qna(question))

Some of the profitable trends and opportunities mentioned in the tweets are:

1. Digital Detox: There's a growing trend of people seeking to disconnect from their digital devices, presenting a potential billion dollar opportunity in the digital detox space. The need to "reset your mind and earn your freedom" from digital addiction can be leveraged into profitable business ideas.

2. Emerging Niche Communities: Finding an active community around a specific niche could present a business opportunity. Understanding the community and their preferences through content like memes can provide valuable insights for creating a product or service that would resonate with them.

3. Content & Community: Adding value to people’s lives through content and community is highlighted as a strong trend. It could be in the form of good educational content or a supportive online community.

4. Hiring Global Talent: Expanding the search for talent beyond traditional tech hubs like NYC or Silicon Valley can 

In [13]:
# question = "What are people saying about Kanye West and/or Taylor Swift?"
# print(qna(question))

Kanye West is mentioned in the context of a tweet which recalls a situation where he was on the phone with Taylor Swift before releasing the song "Famous".

Citations:
[5] https://x.com/historyinmemes/status/1736856310757384262?s=20
[19] https://x.com/bgurley/status/1726630724353315139?s=20
[21] https://x.com/VCBrags/status/1726064050927181835?s=20
[18] https://x.com/andykreed/status/1726471345125589460?s=20
[15] https://x.com/Jack_Raines/status/1729607931094569020?s=20


In [15]:
# question = "What are people saying about the Super Bowl?"
# print(qna(question))

Sorry, it seems like the Tweets do not provide enough information about this topic.


In [16]:
# question = "What is the tweet about managing money?"
# print(qna(question))

The tweet about managing money suggests that ambitious people under 30 who have a high income can afford to treat themselves, like moving to a pricier city. The tweet encourages them to save money and cultivate a nest egg of over $100k, which will compound in the market over time. It advises focusing on aspects of life that can greatly improve quality of life rather than just incremental changes, highlighting that if someone under 30 has $250k in liquid assets, they're far ahead of many people financially. It also encourages long-term planning and focusing on high ROI activities. The issue of living paycheck to paycheck is also addressed, mentioning that most traditional advice is geared toward this group rather than ambitious young entrepreneurs.

Citations:
[6] https://x.com/philip_ruffini/status/1736948535344435619?s=20
[26] https://x.com/thepatwalls/status/1713965959705137629?s=20
[28] https://x.com/Chrisjjosephs/status/1656764644416172032?s=20
[12] https://x.com/blader/status/1681

In [17]:
# question = "What are the best opportunities for young ambitious individuals?"
# print(qna(question))

The best opportunities for young ambitious individuals, according to the tweets, seem to involve financial independence, following one's passion, and personal development. There's a mention of smart and ambitious individuals whose income tends to rise annually, and suggests moving to a more expensive city as a way of treating oneself when financially capable. It suggests building a decent savings balance and focusing on high ROI activities. Further, it encourages young people to think about their future but also enjoy their present. 

Another case presented is that of an individual who quit a highly paid job to start their own business, suggesting the pursuit of entrepreneurial dreams can be a rewarding opportunity. They believed in sacrificing comforts in pursuit of their ambitions. 

Additionally, the tweets emphasize learning and personal growth. For example, learning how to sell, build, and use an LLM properly can significantly boost one's productivity and learning speed. Thus, inv

### Clustering

In [43]:
# Cluster the Tweet Embeddings
from sklearn.cluster import KMeans

num_clusters = 5  # Adjust this based on your needs
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)
labels = kmeans.labels_

In [44]:
# Extract Representative Tweets for Each Cluster
def get_representative_tweets(embeddings, labels, num_representatives=3):
    representative_tweets = {}
    for i in range(num_clusters):
        # Get indices of tweets in this cluster
        indices = np.where(labels == i)[0]
        # Calculate distances from the centroid
        distances = np.linalg.norm(embeddings[indices] - kmeans.cluster_centers_[i], axis=1)
        # Get indices of closest tweets
        closest_indices = np.argsort(distances)[:num_representatives]
        representative_tweets[i] = closest_indices
    return representative_tweets

representatives = get_representative_tweets(embeddings, labels)

In [79]:
# Generate Themes Using GPT-4
def generate_theme(cluster_tweets):
    # Using chat model for OpenAI completion
    response = openai.ChatCompletion.create(
        model="gpt-4",  
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "assistant", "content": f"Generate a short theme (under 5 words) to categorize the following cluster of tweets: {cluster_tweets}. Only output the theme without any unecessary punctuation."}
        ]
    )
    answer = response.choices[0].message['content'] if response.choices else "No response"
    return answer

cluster_themes = {}
for cluster_id, tweet_indices in representatives.items():
    cluster_tweets = " ".join([df.iloc[idx]['text'] for idx in tweet_indices])  
    cluster_themes[cluster_id] = generate_theme(cluster_tweets)

In [63]:
# Compute Similarity Scores
def compute_similarity_score(cluster_id, embeddings, labels, centroid):
    indices = np.where(labels == cluster_id)[0]
    cluster_embeddings = embeddings[indices]
    distances = np.linalg.norm(cluster_embeddings - centroid, axis=1)
    return np.mean(distances)

similarity_scores = {}
for i in range(num_clusters):
    similarity_scores[i] = compute_similarity_score(i, embeddings, labels, kmeans.cluster_centers_[i])

In [80]:
# Compile the Results
for i in range(num_clusters):
    print(f"Cluster {i+1}:")
    print("Theme:", cluster_themes[i])
    print("Similarity Score:", similarity_scores[i])
    print("\n")

Cluster 1:
Theme: Product Usability and Consumer Experience
Similarity Score: 0.3559176395135934


Cluster 2:
Theme: Satire, Tech Critiques, and Strategic Advice
Similarity Score: 0.47518888317681746


Cluster 3:
Theme: Financial Growth and Personal Enjoyment
Similarity Score: 0.4372318076566746


Cluster 4:
Theme: Venture Capital and AI Development Insights
Similarity Score: 0.4439067139237174


Cluster 5:
Theme: AI Development and User Experience
Similarity Score: 0.4637020860805901


