# Setup

1. Create conda environment
- The conda environment definition is located in the folder `environment`.
- Create the environment by running the following command in that folder: `conda env create -f environment.yml`

2. Select conda environment as kernel in notebook
3. Start the Docker containers by running `docker-compose up` in the folder `docker`

## Install needed libraries and import components

In [1]:
%pip install tqdm

import concurrent.futures
import os
import random
import re
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from urllib.parse import quote_plus

import pandas as pd
from pymongo import MongoClient, InsertOne, UpdateOne
from pymongo.errors import ConnectionFailure
from tqdm import tqdm

Note: you may need to restart the kernel to use updated packages.


## Define the MongoDB server details

In [15]:
# Define the MongoDB server details
host = 'localhost'
port = 27017
username = 'devroot'  # Replace with your MongoDB username
password = 'devroot'  # Replace with your MongoDB password

# Define the MongoDB server details
port2 = 27018

# Create the connection string
connection_string = f'mongodb://{username}:{password}@{host}:{port}'
connection_string2 = f'mongodb://{username}:{password}@{host}:{port2}'

## Connect to DB and test connection

In [16]:
# Connect to the MongoDB server
client = MongoClient(connection_string)

# Connect to the MongoDB server
client2 = MongoClient(connection_string2)

In [17]:
try:
    # Verify connection
    client.admin.command('ping')
    print("Connected successfully to MongoDB")
    
    # List all databases
    databases = client.list_database_names()
    print("Databases:", databases)
        
except ConnectionFailure as e:
    print(f"Could not connect to MongoDB: {e}")

Connected successfully to MongoDB
Databases: ['admin', 'config', 'local', 'social_network']


In [18]:
try:
    # Verify connection
    client2.admin.command('ping')
    print("Connected successfully to MongoDB")
    
    # List all databases
    databases = client2.list_database_names()
    print("Databases:", databases)
        
except ConnectionFailure as e:
    print(f"Could not connect to MongoDB: {e}")

Connected successfully to MongoDB
Databases: ['admin', 'config', 'local']


# Global Definitions

## Collection definition

In [19]:
# Select the database and collections
db = client['social_network']
db2 = client2['social_network2']

users_collection = db['users']
followers_collection = db['followers']
posts_collection = db['posts']
likes_collection = db2['likes']
feeds_collection = db['feeds']

## Insertion Helper functions

In [9]:
def get_most_followed_users(n):
    return users_collection.find().sort("followers_count", -1).limit(n)

def add_post_without_likes(user_id, content, date):
    post = {
        "user_id": user_id,
        "content": content,
        "timestamp": date,
        "likes": 0
    }
    post_id = posts_collection.insert_one(post).inserted_id
    return post_id

def get_random_users(pool_size, exclude_user_id):
    pipeline = [
        {"$match": {"_id": {"$ne": exclude_user_id}}},
        {"$sample": {"size": pool_size}}
    ]
    return list(users_collection.aggregate(pipeline))

def add_likes_chunk(chunk, random_user_pool):
    like_operations = []
    like_updates = []

    for post_id, user_id, content, date in chunk:
        # Generate a random number of likes between 0 and 100
        number_of_likes = random.randint(0, 100)
        
        # Get a random sample of users from the pool
        random_users = random.sample(random_user_pool, number_of_likes)
        
        # Prepare bulk operations for likes
        like_operations.extend([
            InsertOne({"userid": user["_id"], "postid": post_id})
            for user in random_users
        ])

        like_updates.append((post_id, number_of_likes))

    return like_operations, like_updates

def add_likes_bulk(post_data, random_user_pool, max_workers=4):
    chunk_size = len(post_data) // max_workers
    chunks = [post_data[i:i + chunk_size] for i in range(0, len(post_data), chunk_size)]
    
    like_operations = []
    like_updates = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(add_likes_chunk, chunk, random_user_pool) for chunk in chunks]
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing likes"):
            chunk_like_operations, chunk_like_updates = future.result()
            like_operations.extend(chunk_like_operations)
            like_updates.extend(chunk_like_updates)

    if like_operations:
        likes_collection.bulk_write(like_operations)

    return like_updates

## Queries Helper functions

In [10]:
def get_posts_with_words(words):
    # Construct a list of regex patterns for each word
    regex_patterns = [f"(?=.*\\b{word}\\b)" for word in words]
    regex_query = {"content": {"$regex": "".join(regex_patterns), "$options": "i"}}
    
    # Fetch all posts matching the regex query
    posts_cursor = posts_collection.find(regex_query)
    posts_list = list(posts_cursor)
    posts_count = len(posts_list)
    
    # Return a dictionary containing both posts and count
    result = {
        "posts": posts_list,
        "count": posts_count
    }
    return result


def find_top_users_following_influencers(top_influencers, n):    
    # Aggregation pipeline
    pipeline = [
        {"$match": {"followed_id": {"$in": top_influencers}}},
        {"$group": {
            "_id": "$follower_id",
            "count": {"$addToSet": "$followed_id"}
        }},
        {"$project": {
            "_id": 1,
            "count": {"$size": "$count"}
        }},
        {"$sort": {"count": -1}},
        {"$limit": n}
    ]
    
    # Execute aggregation pipeline and fetch results
    cursor = followers_collection.aggregate(pipeline)
    
    # Convert the cursor to a list to properly handle the results
    top_users = list(cursor)
    
    # Return the result
    return top_users

def get_user_profile(user_id):
    user = users_collection.find_one({"_id": user_id})
    if not user:
        return None  # Handle case where user with user_id doesn't exist
    
    followers_count = user["followers_count"]
    following_count = user["following_count"]

    user_posts = list(posts_collection.find({"user_id": user_id}))
    user_posts_count = len(user_posts)
    
    user_feed_posts = feeds_collection.find_one({"user_id": user_id}).get('posts', [])
    user_feed_posts_count = len(user_feed_posts)
    
    # Sort posts by date
    posts_sorted_by_date = sorted(user_feed_posts, key=lambda x: x.get('timestamp', ''), reverse=True)

    # Sort posts by likes count
    posts_sorted_by_likes = sorted(user_feed_posts, key=lambda x: x.get('likes', 0), reverse=True)
    
    profile = {
        "user_id": user_id,
        "followers_count": followers_count,
        "following_count": following_count,
        "user_posts": user_posts,
        "user_posts_count": user_posts_count,
        "feed": user_feed_posts,
        "feed_size": user_feed_posts_count,
        "recent_posts": posts_sorted_by_date,
        "popular_posts": posts_sorted_by_likes
    }
    return profile

def print_user_profile(user_profile, n):
    print("User profile with id:", user_profile["user_id"])
    print("Followers count:", user_profile["followers_count"])
    print("Following count:", user_profile["following_count"])
    print("______________________\n")
    
    print(f"User posts: ({user_profile['user_posts_count']})")
    for i, post in enumerate(user_profile["user_posts"][:n]):
        print(post["content"], "date:", post["timestamp"])
    print("______________________\n")
    
    print("Feed (recent):")
    for i, post in enumerate(user_profile["recent_posts"][:n]):
        print(post["content"], "date:", post["timestamp"])
    print("______________________\n")
    
    print("Feed (popular):")
    for i, post in enumerate(user_profile["popular_posts"][:n]):
        print(post["content"], "likes:", post.get("likes", "N/A"))
    print("______________________")

# Insert Data into DB

## Insert users and following relationships to db

In [11]:
file_path = './InputData/twitter_combined.txt'

# Read file content
with open(file_path, 'r') as file:
    lines = file.readlines()

# Prepare data
user_pairs = [tuple(map(int, line.strip().split())) for line in lines]

# Get a unique set of all users involved
all_users = {user for pair in user_pairs for user in pair}

# Check which users already exist in the database
existing_users = set(users_collection.distinct("_id", {"_id": {"$in": list(all_users)}}))

# Identify new users
new_users = all_users - existing_users

# Prepare bulk operations for new users
user_bulk_operations = [
    InsertOne({"_id": user_id, "following_count": 0, "followers_count": 0})
    for user_id in new_users
]

# Execute bulk insert for new users
if user_bulk_operations:
    users_collection.bulk_write(user_bulk_operations)

# Prepare bulk operations for relationships and updating counts
relationship_bulk_operations = []
user_update_operations = []

for user1, user2 in user_pairs:
    relationship_bulk_operations.append(InsertOne({"follower_id": user1, "followed_id": user2}))
    user_update_operations.append(UpdateOne({"_id": user1}, {"$inc": {"following_count": 1}}))
    user_update_operations.append(UpdateOne({"_id": user2}, {"$inc": {"followers_count": 1}}))

# Execute bulk insert for relationships
if relationship_bulk_operations:
    followers_collection.bulk_write(relationship_bulk_operations)

# Execute bulk update for user counts
if user_update_operations:
    users_collection.bulk_write(user_update_operations)

## Assign the input tweets to the influencers

In [12]:
top_influencers_list = list(get_most_followed_users(100))

# Path to the CSV file
file_path = './InputData/tweets.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Generate random influencer selections once to avoid repeated random.choice calls
influencer_ids = [random.choice(top_influencers_list)['_id'] for _ in range(len(df))]

# Function to process each tweet
def process_tweet(idx):
    tweet_data = df.iloc[idx]
    influencer_id = influencer_ids[idx]
    date = datetime.strptime(tweet_data['date_time'], '%d/%m/%Y %H:%M')
    content = tweet_data['content']
    post_id = add_post_without_likes(influencer_id, content, date)
    return post_id, influencer_id, content, date

# Step 1: Insert all posts first
with ThreadPoolExecutor() as executor:
    post_data = list(tqdm(executor.map(process_tweet, range(len(df))), total=len(df), desc="Inserting posts"))

Inserting posts: 100%|██████████| 52542/52542 [00:10<00:00, 5126.01it/s]


## Add random likes to the posts

In [20]:
# Step 1: Fetch a pool of random users once
user_pool_size = 10000  # Adjust the pool size as needed
random_user_pool = get_random_users(user_pool_size, exclude_user_id=None)

# Step 2: Insert likes for posts in bulk
like_updates = add_likes_bulk(post_data, random_user_pool, max_workers=4)

# Step 3: Update post like counts in bulk
bulk_updates = [
    UpdateOne({"_id": post_id}, {"$set": {"likes": likes}})
    for post_id, likes in like_updates
]
if bulk_updates:
    posts_collection.bulk_write(bulk_updates)

Processing likes: 100%|██████████| 5/5 [00:01<00:00,  3.83it/s]


## Add indices to collection when needed

In [21]:
# Add indices to critical parameters on collections
followers_collection.create_index([("followed_id", 1), ("follower_id", 1)])

posts_collection.create_index([("user_id", 1)])

'user_id_1'

In [22]:
temp_feeds_collection = db['temp_feeds']

batch_size = 1000  # Define a suitable batch size
post_count = posts_collection.count_documents({})
num_batches = (post_count // batch_size) + 1

# Step 1: Aggregate data into a temporary collection
for batch in tqdm(range(num_batches), desc="Processing Batches"):
    pipeline = [
        {"$skip": batch * batch_size},
        {"$limit": batch_size},
        {"$lookup": {
            "from": "followers",
            "localField": "user_id",
            "foreignField": "followed_id",
            "as": "followers"
        }},
        {"$unwind": "$followers"},
        {"$project": {
            "_id": 0,
            "follower_id": "$followers.follower_id",
            "post": {
                "post_id": "$_id",
                "user_id": "$user_id",
                "content": "$content",
                "timestamp": "$timestamp",
                "likes": "$likes"
            }
        }},
        {"$group": {
            "_id": "$follower_id",
            "posts": {"$push": "$post"}
        }},
        {"$out": "temp_feeds"}
    ]
    posts_collection.aggregate(pipeline)

# Step 2: Process the temporary collection to split large documents
temp_docs = list(temp_feeds_collection.find())
for doc in tqdm(temp_docs, desc="Processing Temp Feeds"):
    follower_id = doc['_id']
    posts = doc['posts']
    
    chunk_size = 100
    chunked_posts = [posts[i:i + chunk_size] for i in range(0, len(posts), chunk_size)]
    
    for chunk in chunked_posts:
        feeds_collection.update_one(
            {"user_id": follower_id},
            {"$push": {"posts": {"$each": chunk}}},
            upsert=True
        )

# Step 3: Ensure posts arrays are unique and in order
# Use aggregation pipeline to deduplicate and sort posts
pipeline = [
    {
        "$addFields": {
            "unique_posts": {"$setUnion": "$posts"}
        }
    },
    {
        "$project": {
            "user_id": 1,
            "posts": {"$slice": [{"$sortArray": {"input": "$unique_posts", "sortBy": {"timestamp": -1}}}, 16793600]}
        }
    },
    {
        "$merge": {
            "into": "feeds",
            "whenMatched": "replace",
            "whenNotMatched": "insert"
        }
    }
]

feeds_collection.aggregate(pipeline)

# Clean up temporary collection
temp_feeds_collection.drop()

Processing Batches: 100%|██████████| 53/53 [07:12<00:00,  8.16s/it]
Processing Temp Feeds: 100%|██████████| 28401/28401 [02:47<00:00, 169.87it/s]


# Request data from DB

## Find out top 100 most followed

In [23]:
# Get and print top influencers
top_influencers = get_most_followed_users(100)
print("Top influencers:")
top_influencers_id_list = []
for influencer in top_influencers:
    print("user id:", influencer["_id"], "Follower count", influencer["followers_count"])
    top_influencers_id_list.append(influencer["_id"])

Top influencers:
user id: 40981798 Follower count 8660
user id: 43003845 Follower count 7700
user id: 22462180 Follower count 7623
user id: 34428380 Follower count 7558
user id: 115485051 Follower count 4798
user id: 15913 Follower count 4337
user id: 3359851 Follower count 3986
user id: 11348282 Follower count 3850
user id: 7861312 Follower count 3712
user id: 27633075 Follower count 3655
user id: 31331740 Follower count 3623
user id: 18996905 Follower count 3255
user id: 7860742 Follower count 3197
user id: 813286 Follower count 3172
user id: 22784458 Follower count 2974
user id: 17868918 Follower count 2904
user id: 10671602 Follower count 2874
user id: 117674417 Follower count 2858
user id: 48485771 Follower count 2725
user id: 34068984 Follower count 2693
user id: 18927441 Follower count 2680
user id: 83943787 Follower count 2678
user id: 15853668 Follower count 2634
user id: 1183041 Follower count 2593
user id: 238260874 Follower count 2560
user id: 8088112 Follower count 2539
us

## Find out the top 100 users who follow the most influencers

In [24]:
top_influencer_followers = find_top_users_following_influencers(top_influencers_id_list, 100)

# Print the top users
for i, doc in enumerate(top_influencer_followers):
    print(f"#{i+1}: User ID {doc['_id']} follows {doc['count']} influencers")

#1: User ID 3359851 follows 51 influencers
#2: User ID 24641194 follows 43 influencers
#3: User ID 18581803 follows 42 influencers
#4: User ID 440963134 follows 40 influencers
#5: User ID 7872262 follows 40 influencers
#6: User ID 364917755 follows 40 influencers
#7: User ID 274153775 follows 39 influencers
#8: User ID 277649366 follows 38 influencers
#9: User ID 358775055 follows 38 influencers
#10: User ID 46209291 follows 38 influencers
#11: User ID 279787626 follows 38 influencers
#12: User ID 463952369 follows 38 influencers
#13: User ID 134940306 follows 38 influencers
#14: User ID 401313910 follows 38 influencers
#15: User ID 439788025 follows 38 influencers
#16: User ID 135218281 follows 38 influencers
#17: User ID 208132323 follows 38 influencers
#18: User ID 157829215 follows 37 influencers
#19: User ID 440341213 follows 37 influencers
#20: User ID 7860742 follows 37 influencers
#21: User ID 461410856 follows 37 influencers
#22: User ID 304462046 follows 37 influencers
#23: U

## Show user profile (User follower count, user followed count, 25 newest posts in feed, 25 most liked posts in feed)

In [25]:
# Get and print user profile
user_id = 22462180
tweets_to_show = 25
user_profile = get_user_profile(user_id)
print_user_profile(user_profile, tweets_to_show)

User profile with id: 22462180
Followers count: 7623
Following count: 701
______________________

User posts: (550)
HOPE YOU ARE ROARING YOUR WAY TO THE POLLS STILL! 👊🏼🦁🐯❤️🇺🇸: https://t.co/tNQit6ZIA6 date: 2016-11-08 20:04:00
@shannonwoodward he didn't go down to the people date: 2016-10-20 02:41:00
@HoliestKaty iconic date: 2016-09-18 07:39:00
❤️🐶I predict maj likes 🐶❤️ https://t.co/Z1HzbWpZ4g date: 2016-03-01 23:42:00
☀️⛄️💦 https://t.co/8fCcsmiNvQ https://t.co/dvbWHCYurc date: 2015-11-18 20:09:00
conciousness = creativity date: 2015-09-02 22:12:00
Happy Father's Day to a guy that did the best job he could. Love you. ❤️ https://t.co/UVwsIcGgN9 date: 2015-06-21 22:06:00
Coming for your brand @itsjeremyscott https://t.co/fpCuM19nus date: 2015-04-12 08:41:00
Goodnight Sweden: http://t.co/OZleBmLkLj date: 2015-03-22 00:06:00
Just had 1st FaceTime Thanksgiving! What I would give 2b scraping marshmallow off of sweet potato, wearing fuzzy socks &amp; holding my niece! 😩 date: 2014-11-28 02:5

## 25 most popular posts that contain given words

In [26]:
words_to_search = ["big", "data"]
result = get_posts_with_words(words_to_search)

# Print the count of posts found
print("Number of posts:", result["count"])

# Iterate over and print each post content
for post in result["posts"]:
    print("Post content:", post["content"])

Number of posts: 2
Post content: Several @twittereng folks teamed up with @UCBerkeley for a course about analyzing big data: @UCBTweeter. http://t.co/5SUkxQJr
Post content: Twitter + the Human Face of Big Data: http://t.co/HlpWw7rV @faceofbigdata #HFOBD / #BigDataChat 11am PT/2pm ET Friday 10/19 w/@isaach


# Cleanup

In [17]:
# users_collection.drop()
# followers_collection.drop()
# posts_collection.drop()
# likes_collection.drop()
# feeds_collection.drop()

In [18]:
# Close the connection
client.close()