### Import libraries

In [2]:
import pandas as pd
import openai
import faiss
import numpy as np

### Load dataset

In [3]:
file_path = 'data/liked-tweets.csv'
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,url,text
0,https://x.com/anothercohen/status/174527559909...,If you're questioning whether it's too early t...
1,https://x.com/diligentium/status/1744994086929...,Cool toy! 👍\n\nBut the R1 seems to rely on the...
2,https://x.com/naval/status/1002104154737684480...,Learn to sell. Learn to build. If you can do b...
3,https://x.com/jxmnop/status/173712987574646814...,Seen a lot of evidence that GPT-4 crushes Gemi...
4,https://x.com/historyinmemes/status/1736856310...,Kanye West on the phone with Taylor Swift befo...


### Generate embeddings

In [4]:
def get_embedding(text):
    response = openai.Embedding.create(input=[text], engine="text-embedding-ada-002")
    return np.array(response['data'][0]['embedding'])


In [5]:
# Generate embeddings for each tweet
embeddings = np.array([get_embedding(tweet) for tweet in df['text']])  

### Store embeddings in vector database

In [6]:
dimension = embeddings.shape[1]  # Dimension of the embeddings
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for similarity
index.add(embeddings.astype(np.float32))  # Ensure the data type is float32

In [7]:
# Save the index
faiss.write_index(index, "test_embeddings.index")

### Search relevant tweets

In [8]:
def search_tweets(query, k):
    embedding = get_embedding(query)
    distances, indices = index.search(np.array([embedding]).astype(np.float32), k)
    return (df.iloc[indices[0]], distances[0]) # returns the dataframe and the distances

In [9]:
relevant_tweets = search_tweets("How to find warm leads for my business?", 5)
print(relevant_tweets)

(                                                  url  \
23  https://x.com/natiakourdadze/status/1711776939...   
2   https://x.com/naval/status/1002104154737684480...   
25  https://x.com/thepatwalls/status/1713965959705...   
22  https://x.com/nikitabier/status/17255418267681...   
28  https://x.com/yongfook/status/1648466000147517...   

                                                 text  
23  How to find warm leads i.e. get customers easi...  
2   Learn to sell. Learn to build. If you can do b...  
25  5 years ago today, I quit my $125K/year softwa...  
22  For every founder that successfully exits, the...  
28  Feels like 90% of the indie hacker apps on my ...  , array([0.28786168, 0.4410772 , 0.4721748 , 0.48045477, 0.5005583 ],
      dtype=float32))


In [10]:
relevant_tweets = search_tweets("What are people saying about Kanye West or Taylor Swift?", 5)
print(relevant_tweets)

(                                                  url  \
4   https://x.com/historyinmemes/status/1736856310...   
18  https://x.com/bgurley/status/17266307243533151...   
20  https://x.com/VCBrags/status/17260640509271818...   
28  https://x.com/yongfook/status/1648466000147517...   
14  https://x.com/Jack_Raines/status/1729607931094...   

                                                 text  
4   Kanye West on the phone with Taylor Swift befo...  
18  If you told me 10 years ago that a group of th...  
20  The number one thing on every venture capitali...  
28  Feels like 90% of the indie hacker apps on my ...  
14  A few months ago, a Forbes reporter reached ou...  , array([0.30583128, 0.5034225 , 0.51719314, 0.5449016 , 0.5455991 ],
      dtype=float32))


In [11]:
relevant_tweets = search_tweets("What are people saying about OpenAI?", 5)
print(relevant_tweets)

(                                                  url  \
20  https://x.com/VCBrags/status/17260640509271818...   
26  https://x.com/holdenmatt/status/16904365380675...   
19  https://x.com/thecaptain_nemo/status/172654458...   
17  https://x.com/andykreed/status/172647134512558...   
21  https://x.com/varun_mathur/status/172597141823...   

                                                 text  
20  The number one thing on every venture capitali...  
26  OpenAI Functions is the coolest new tech I've ...  
19  drop the "OpenAI". just "Microsoft", its cleaner.  
17  Ex-Twitch CEO running OpenAI??? What’s he gonn...  
21  Dear Mira and Ilya - congratulations on your c...  , array([0.29594553, 0.301573  , 0.3490528 , 0.35370684, 0.35620123],
      dtype=float32))


### Q&A + cite

In [12]:
def qna(question, k=5):
    # Search for relevant tweets
    relevant_tweets = search_tweets(question, k)[0]

    # Combine texts of relevant tweets
    combined_tweets_text = " ".join(relevant_tweets['text'])  

    # Using chat model for OpenAI completion
    response = openai.ChatCompletion.create(
        model="gpt-4",  
        messages=[
            {"role": "system", "content": "You are a helpful assistant retrieving information from Tweets that the user saved."},
            {"role": "assistant", "content": f"Answer the question based on these tweets: {combined_tweets_text}. If there is not enough information, answer: 'Sorry, it seems like the Tweets do not provide enough information about this topic.' \n\nQuestion: {question}\nAnswer:"}
        ]
    )

    answer = response.choices[0].message['content'] if response.choices else "No response"

    # Check if the answer indicates sufficient relevant information
    if 'Sorry, it seems like the Tweets do not provide enough information about this topic.'.lower() not in answer.lower():
        # Add citation list at the end
        citations = "\n".join([f"[{i+1}] {row['url']}" for i, row in relevant_tweets.iterrows()])
        answer += f"\n\nCitations:\n{citations}"

    return answer


In [13]:
question = "What are people saying about Kanye West and/or Taylor Swift?"
print(qna(question))

Kanye West had a phone call with Taylor Swift before dropping "Famous".

Citations:
[5] https://x.com/historyinmemes/status/1736856310757384262?s=20
[19] https://x.com/bgurley/status/1726630724353315139?s=20
[21] https://x.com/VCBrags/status/1726064050927181835?s=20
[18] https://x.com/andykreed/status/1726471345125589460?s=20
[15] https://x.com/Jack_Raines/status/1729607931094569020?s=20


In [14]:
question = "What are people saying about OpenAI?"
print(qna(question))

The tweets indicate a mix of opinions about OpenAI. Some people are fascinated by the new technology it provides, calling it 'the coolest new tech' they've played with. However, others are criticizing the management and the unit economics of the company, stating that those at the helm lack entrepreneurial experience and predicting a dire future for the company. There is also mention of the strained relationship between OpenAI and Microsoft, speculating that the latter must be upset over recent developments. The ex-Twitch CEO's role at OpenAI is also being questioned. Overall, while there seems to be appreciation for the organization's technical advancements, there is also skepticism and concerns regarding its financial stability, managerial competence and strategic partnerships.


Citations:
[21] https://x.com/VCBrags/status/1726064050927181835?s=20
[27] https://x.com/holdenmatt/status/1690436538067570688?s=20
[20] https://x.com/thecaptain_nemo/status/1726544586380824932?s=20
[18] http

In [15]:
question = "What are people saying about the Super Bowl?"
print(qna(question))

Sorry, it seems like the Tweets do not provide enough information about this topic.


In [16]:
question = "What is the tweet about managing money?"
print(qna(question))

The tweet about managing money advises ambitious individuals under 30 who earn more than their parents. It suggests that it's okay to move to a more expensive city and treat oneself. It also notes that as an ambitious person, your income is likely to increase each year and once you've saved more than $100k, your ability to save will increase rapidly due to compounding market growth. The tweet emphasizes the importance of focusing on things that can significantly improve your life quality, rather than small incremental changes. Lastly, it encourages thinking long-term about the future while still spending money to enjoy the present.

Citations:
[6] https://x.com/philip_ruffini/status/1736948535344435619?s=20
[26] https://x.com/thepatwalls/status/1713965959705137629?s=20
[28] https://x.com/Chrisjjosephs/status/1656764644416172032?s=20
[12] https://x.com/blader/status/1681356168680321025?s=20
[22] https://x.com/varun_mathur/status/1725971418238849154?s=20


In [17]:
question = "What are the best opportunities for young ambitious individuals?"
print(qna(question))

According to the tweets, the best opportunities for young ambitious individuals are to take calculated risks and invest in oneself, like moving to a more expensive city if it offers better opportunities or experiences. It's suggested that they learn skills that increase their value, such as learning how to sell, how to build, and how to use an LLM (presumably, a kind of productivity tool or software). Adopting a long-term view, e.g. thinking 10-20 years into the future, is also recommended. Another opportunity mentioned is starting their own business, implying that entrepreneurship can be a worthwhile venture for ambitious young individuals.

Citations:
[6] https://x.com/philip_ruffini/status/1736948535344435619?s=20
[23] https://x.com/nikitabier/status/1725541826768130373?s=20
[3] https://x.com/naval/status/1002104154737684480?s=20
[26] https://x.com/thepatwalls/status/1713965959705137629?s=20
[17] https://x.com/localghost/status/1728343665351729187?s=20


### Clustering

In [39]:
# Import libraries
import numpy as np
from sklearn.cluster import KMeans
import pandas as pd

In [40]:
# Define the k-means clustering function
def generate_theme_clusters(num_clusters):
    # Perform K-means clustering
    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)

    # Assign the cluster labels to the DataFrame
    df['cluster'] = kmeans.labels_

    # Optionally, return centroids or other information
    return kmeans.cluster_centers_

In [41]:
# Generate 5 clusters
centers = generate_theme_clusters(5)  # Adjust the number of clusters as needed

  super()._check_params_vs_input(X, default_n_init=10)


In [57]:
# Define Function to Generate Theme and Summary
def get_cluster_theme_summary(cluster_tweets, engine="davinci"):
    # Combine tweet texts
    combined_tweets = " ".join(cluster_tweets)

    response = openai.ChatCompletion.create(
        model="gpt-4",  
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "assistant", "content": f"Generate a theme and a short summary (50 words max) for the following tweets: {combined_tweets}"}
        ]
    )
    
    answer = response.choices[0].message['content'] if response.choices else "No response"

    return answer

In [58]:
# Process Each Cluster and Compute Similarity Score
def compute_similarity_score(cluster_idx, kmeans, embeddings):
    indices = np.where(kmeans.labels_ == cluster_idx)[0]
    cluster_embeddings = embeddings[indices]
    centroid = kmeans.cluster_centers_[cluster_idx]
    distances = np.linalg.norm(cluster_embeddings - centroid, axis=1)
    return np.mean(distances)

for i in range(5):  # Adjust for the number of clusters
    cluster_tweets = df[df['cluster'] == i]['text'].tolist() 
    theme_summary = get_cluster_theme_summary(cluster_tweets)
    similarity_score = compute_similarity_score(i, kmeans, embeddings)
    
    print(f"Cluster {i+1}:")
    print("Theme and Summary:", theme_summary)
    print("Similarity Score:", similarity_score)
    print("\n")

Cluster 1:
Theme and Summary: Theme: Insights about Startups, Venture Capitalists, and Digital Marketing Strategies

Summary: The series of tweets provide insights about misunderstandings regarding VC job titles, offers a practical guide on lead generation via Twitter automation, criticizes the lack of originality among independent app developers, and compares returns of VC-funded startups with freelancer earnings. The tweets emphasize the importance of understanding different aspects of the startup ecosystem.
Similarity Score: 0.4496733374155669


Cluster 2:
Theme and Summary: Theme: Professional Growth and Financial Independence Journey

Summary: The tweets document the author's personal journey of entrepreneurship. They share wisdom on achieving financial stability and the importance of long-term planning. The individual learnt to sell and build, got ahead financially by their 30s, identified meaningful lifestyle changes, and took career risks to pursue their dream of starting a bus