In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

# Add /src modules to path
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Import modules
from src.utils import DbConnector, haversine_np
import pandas as pd
import numpy as np
from pprint import pprint
from datetime import datetime, timedelta
from collections import Counter


In [42]:
# Connect to MySQL database

connector = DbConnector()
db = connector.db

You are connected to the database: mongodb
-----------------------------------------------



----

In [43]:
activities = db["ActivityCollection"]

activities_df = pd.DataFrame(list(activities.find({})))


In [44]:
activities_df[activities_df.track_points.astype(str) == "[]"]

Unnamed: 0,_id,user,start_date_time,end_date_time,transportation_mode,track_points
11,101041,104,2007-11-30 09:59:00,2007-11-30 10:38:00,bus,[]
12,111041,104,2007-11-30 12:40:00,2007-11-30 13:18:00,bus,[]
13,121041,104,2007-12-02 12:27:00,2007-12-02 12:57:00,bus,[]
14,131041,104,2007-12-02 13:00:00,2007-12-02 13:25:00,bus,[]
15,141041,104,2007-12-04 07:45:00,2007-12-04 08:30:00,car,[]
...,...,...,...,...,...,...
20690,12021471,147,2011-05-18 13:00:25,2011-05-18 13:07:45,walk,[]
20691,12031471,147,2011-05-18 13:07:50,2011-05-18 13:21:55,bus,[]
20692,12041471,147,2011-05-19 00:44:58,2011-05-19 01:00:43,walk,[]
20693,12051471,147,2011-05-19 01:00:48,2011-05-19 01:17:48,bus,[]


In [41]:
activities_df2 = activities_df.drop(columns=["track_points"])
activities_df2[activities_df2.transportation_mode == "taxi"]["user"].unique().shape

(35,)

In [51]:
trackpoints = activities_df[activities_df.track_points.astype(str) != "[]"]["track_points"]

In [114]:
count = 0
# {'track_points.lat': 39.916, 'track_points.lon': 116.397})
for i, e in enumerate(trackpoints):
    for tp in e:
        if round(tp["latitude"], 3) == 39.916 and round(tp["longitude"], 3) == 116.397:
            count += 1

In [115]:
count

42

### Task 1
How many users, activities and trackpoints are there in the dataset?

In [45]:
collection = db["ActivityCollection"]

users_count = db.UserCollection.count_documents({})
activities_count = db.ActivityCollection.count_documents({})
trackpoints_count = db.ActivityCollection.aggregate(
    [{"$unwind": "$track_points"}, {"$count": "trackpoint_count"}]
)
trackpoints_count = next(trackpoints_count, {}).get("trackpoint_count", 0)


print(f"Total Users: {users_count}")
print(f"Total Activities: {activities_count}")
print(f"Total Trackpoints: {trackpoints_count}")

Total Users: 182
Total Activities: 21054
Total Trackpoints: 8501885


### Task 2
Find the average number of activities per user

In [17]:
average_activities_per_user = activities_count / users_count
print(f"Average Activities/User: {average_activities_per_user:.2f}")

Average Activities/User: 80.85


### Task 3
Find the top 20 users with the highest number of activities. 

In [19]:
users_activities_count = list(
    db["ActivityCollection"].aggregate(
        [
            {"$group": {"_id": "$user", "activity_count": {"$sum": 1}}},
            {"$sort": {"activity_count": -1}},
            {"$limit": 20},
        ]
    )
)

print("Top 20 Users with Most Activities:")

for user in users_activities_count:
    print(f"User: {user['_id']}, Activities: {user['activity_count']}")

Top 20 Users with Most Activities:
User: 163, Activities: 3182
User: 085, Activities: 1298
User: 153, Activities: 1123
User: 068, Activities: 969
User: 167, Activities: 944
User: 128, Activities: 937
User: 062, Activities: 782
User: 075, Activities: 509
User: 126, Activities: 468
User: 010, Activities: 434
User: 052, Activities: 425
User: 084, Activities: 422
User: 179, Activities: 319
User: 020, Activities: 223
User: 112, Activities: 220
User: 147, Activities: 207
User: 065, Activities: 206
User: 091, Activities: 170
User: 125, Activities: 153
User: 115, Activities: 139


### Task 4
Find all users who have taken a taxi. 

In [39]:
users_taken_taxi = db["ActivityCollection"].distinct(
    "user", {"transportation_mode": "taxi"}
)

print("Users who have taken a taxi:")
pprint(users_taken_taxi)

Users who have taken a taxi:
['010',
 '020',
 '021',
 '052',
 '056',
 '058',
 '062',
 '065',
 '068',
 '075',
 '078',
 '080',
 '082',
 '084',
 '085',
 '091',
 '098',
 '100',
 '102',
 '104',
 '105',
 '111',
 '114',
 '118',
 '126',
 '128',
 '139',
 '147',
 '153',
 '154',
 '161',
 '163',
 '167',
 '175',
 '179']


### Task 5
Find all types of transportation modes and count how many activities that are
tagged with these transportation mode labels. Do not count the rows where
the mode is null.

In [28]:
transportation_modes = list(
    db["ActivityCollection"].aggregate(
        [
            {"$group": {"_id": "$transportation_mode", "count": {"$sum": 1}}},
            {"$match": {"_id": {"$ne": None}}},
        ]
    )
)

print("Transportation Modes and Their Counts:")

for mode in transportation_modes:
    print(f"Mode: {mode['_id']}, Count: {mode['count']}")

Transportation Modes and Their Counts:
Mode: boat, Count: 7
Mode: taxi, Count: 1179
Mode: bus, Count: 2851
Mode: walk, Count: 6459
Mode: train, Count: 299
Mode: airplane, Count: 17
Mode: bike, Count: 2089
Mode: motorcycle, Count: 2
Mode: subway, Count: 813
Mode: run, Count: 6
Mode: car, Count: 993


### Task 6

#### a) Find the year with the most activities. 

In [29]:
year_most_activities = list(
    db.ActivityCollection.aggregate(
        [
            {"$project": {"year": {"$year": "$start_date_time"}}},
            {"$group": {"_id": "$year", "activity_count": {"$sum": 1}}},
            {"$sort": {"activity_count": -1}},
            {"$limit": 1},
        ]
    )
)[0]

print(f"Year with most activities: {year_most_activities['_id']}")

Year with most activities: 2008


#### b) Is this also the year with most recorded hours?

This aggregation pipeline first calculates the number of hours for each activity by subtracting the start_date_time from the end_date_time (resulting in a duration in milliseconds) and then dividing by the number of milliseconds in an hour (3,600,000). It then groups the data by year and sums the total hours, sorts by total hours in descending order, and limits the results to the year with the highest number of hours.

In [31]:
year_most_hours = list(
    db.ActivityCollection.aggregate(
        [
            {
                "$project": {
                    "year": {"$year": "$start_date_time"},
                    "hours": {
                        "$divide": [
                            {"$subtract": ["$end_date_time", "$start_date_time"]},
                            3600000,  # Number of milliseconds in an hour
                        ]
                    },
                }
            },
            {"$group": {"_id": "$year", "total_hours": {"$sum": "$hours"}}},
            {"$sort": {"total_hours": -1}},
            {"$limit": 1},
        ]
    )
)[0]

print(
    f"Year with most recorded hours: {year_most_hours['_id']}, Hours: {year_most_hours['total_hours']:.2f}"
)

Year with most recorded hours: 2008, Hours: 7421.26


### Task 7
Find the total distance (in km) walked in 2008, by user with id=112.

In [35]:
user_id = "112"
activities_for_user = db.ActivityCollection.find({
    "user": user_id,
    "transportation_mode": "walk",
    "start_date_time": {"$gte": datetime(2008, 1, 1), "$lt": datetime(2009, 1, 1)}
})

total_distance = 0.0

for activity in activities_for_user:
    track_points = activity["track_points"]
    
    # Split track points into latitudes, longitudes for easier computation
    lats = np.array([point["latitude"] for point in track_points])
    longs = np.array([point["longitude"] for point in track_points])
    
    # Calculate the haversine distance for each consecutive track point and sum them up
    total_distance += np.sum(haversine_np(longs[:-1], lats[:-1], longs[1:], lats[1:]))

print(f"Total distance walked by user {user_id} in 2008: {total_distance:.2f} km")


Total distance walked by user 112 in 2008: 223.15 km


### Task 8
Find the top 20 users who have gained the most altitude meters.
- Output should be a field with (id, total meters gained per user).
- Remember that some altitude-values are invalid

In [36]:
def calculate_altitude_gain(track_points):
    """
    Calculate the altitude gain from a list of track points.
    """
    total_gain = 0
    for i in range(1, len(track_points)):
        altitude_difference = track_points[i]['altitude'] - track_points[i-1]['altitude']
        if altitude_difference > 0:
            total_gain += altitude_difference
    return total_gain

def top_users_by_altitude_gain():
    # Connect to the MongoDB instance
    activity_collection = db['ActivityCollection']

    user_altitude_gains = {}

    # Iterate over all activities in the ActivityCollection
    for activity in activity_collection.find():
        user = activity['user']
        altitude_gain = calculate_altitude_gain(activity['track_points'])
        
        if user in user_altitude_gains:
            user_altitude_gains[user] += altitude_gain
        else:
            user_altitude_gains[user] = altitude_gain

    # Sort users by their total altitude gain
    sorted_users = sorted(user_altitude_gains.items(), key=lambda x: x[1], reverse=True)

    # Get top 20 users
    top_20_users = sorted_users[:20]

    for user, gain in top_20_users:
        print(f"User ID: {user}, Altitude Gain: {gain} meters")

top_users_by_altitude_gain()


User ID: 128, Altitude Gain: 873882.6328083987 meters
User ID: 085, Altitude Gain: 852330 meters
User ID: 062, Altitude Gain: 500368.4000000001 meters
User ID: 084, Altitude Gain: 435454 meters
User ID: 167, Altitude Gain: 426178.593175853 meters
User ID: 153, Altitude Gain: 370663.81679790025 meters
User ID: 052, Altitude Gain: 268203 meters
User ID: 126, Altitude Gain: 180439.95013123358 meters
User ID: 163, Altitude Gain: 164278.3812237533 meters
User ID: 010, Altitude Gain: 163642 meters
User ID: 179, Altitude Gain: 150679 meters
User ID: 115, Altitude Gain: 107619.45616797873 meters
User ID: 125, Altitude Gain: 99607.20997375328 meters
User ID: 096, Altitude Gain: 77246 meters
User ID: 082, Altitude Gain: 75699 meters
User ID: 106, Altitude Gain: 65685.6955380578 meters
User ID: 064, Altitude Gain: 49868 meters
User ID: 081, Altitude Gain: 40373.209973753284 meters
User ID: 111, Altitude Gain: 32975.721784776884 meters
User ID: 105, Altitude Gain: 29294.61942257217 meters


### Task 9
Find all users who have invalid activities, and the number of invalid activities
per user.
- An invalid activity is defined as an activity with consecutive trackpoints
where the timestamps deviate with at least 5 minutes. 

In [40]:
def has_invalid_activity(track_points):
    """
    Check if the activity has any consecutive trackpoints with a timestamp deviation of at least 5 minutes.
    """
    for i in range(1, len(track_points)):
        time_difference = track_points[i]['date_time'] - track_points[i-1]['date_time']
        if time_difference >= timedelta(minutes=5):
            return True
    return False

def users_with_invalid_activities():
    # Connect to the MongoDB instance
    activity_collection = db['ActivityCollection']

    user_invalid_activity_counts = {}

    # Iterate over all activities in the ActivityCollection
    for activity in activity_collection.find():
        user = activity['user']
        if has_invalid_activity(activity['track_points']):
            if user in user_invalid_activity_counts:
                user_invalid_activity_counts[user] += 1
            else:
                user_invalid_activity_counts[user] = 1

    # Filter for users with at least one invalid activity
    users_with_invalids = {user: count for user, count in user_invalid_activity_counts.items() if count > 0}

    for user, invalid_count in users_with_invalids.items():
        print(f"User ID: {user}, Invalid Activities: {invalid_count}")

users_with_invalid_activities()

User ID: 161, Invalid Activities: 2
User ID: 102, Invalid Activities: 9
User ID: 105, Invalid Activities: 14
User ID: 167, Invalid Activities: 135
User ID: 084, Invalid Activities: 155
User ID: 085, Invalid Activities: 219
User ID: 082, Invalid Activities: 15
User ID: 076, Invalid Activities: 7
User ID: 078, Invalid Activities: 7
User ID: 065, Invalid Activities: 23
User ID: 091, Invalid Activities: 12
User ID: 096, Invalid Activities: 10
User ID: 062, Invalid Activities: 184
User ID: 053, Invalid Activities: 8
User ID: 098, Invalid Activities: 7
User ID: 052, Invalid Activities: 117
User ID: 097, Invalid Activities: 14
User ID: 064, Invalid Activities: 9
User ID: 174, Invalid Activities: 2
User ID: 129, Invalid Activities: 5
User ID: 111, Invalid Activities: 10
User ID: 144, Invalid Activities: 5
User ID: 175, Invalid Activities: 5
User ID: 126, Invalid Activities: 97
User ID: 128, Invalid Activities: 238
User ID: 117, Invalid Activities: 1
User ID: 153, Invalid Activities: 148
User I

### Task 10
Find the users who have tracked an activity in the Forbidden City of Beijing.
- In this question you can consider the Forbidden City to have
coordinates that correspond to: lat 39.916, lon 116.397.

In [155]:
coordinates = {"latitude": 39.916, "longitude": 116.397}

pipeline = [
    {"$unwind": "$track_points"},
    {
        "$match": {
            "track_points.latitude": {
                "$gte": coordinates["latitude"],
                "$lte": coordinates["latitude"] + 0.001,
            },
            "track_points.longitude": {
                "$gte": coordinates["longitude"],
                "$lte": coordinates["longitude"] + 0.001,
            },
        }
    },
    {"$group": {"_id": "$user"}},
]

output = db.ActivityCollection.aggregate(pipeline)
output = [user["_id"] for user in output]
output

['018', '019']

### Task 11
Find all users who have registered transportation_mode and their most used
transportation_mode. 
- The answer should be on format (user_id,
most_used_transportation_mode) sorted on user_id.
- Some users may have the same number of activities tagged with e.g.
walk and car. In this case it is up to you to decide which transportation
mode to include in your answer (choose one).
- Do not count the rows where the mode is null.

In [52]:
users_most_used_mode = {}
all_users = db.UserCollection.find({})

for user in all_users:
    user_id = user['_id']
    user_activities = list(db.ActivityCollection.find({'user': user_id, 'transportation_mode': {'$ne': None}}))
    modes = [activity['transportation_mode'] for activity in user_activities]

    modes_count = Counter(modes)
    if modes_count:
        users_most_used_mode[user_id] = modes_count.most_common(1)[0][0]

print("Users and their most used transportation mode:")

for user, mode in users_most_used_mode.items():
    print(f"User: {user}, Most Used Mode: {mode}")

Users and their most used transportation mode:
User: 104, Most Used Mode: bus
User: 161, Most Used Mode: walk
User: 102, Most Used Mode: walk
User: 105, Most Used Mode: walk
User: 167, Most Used Mode: walk
User: 084, Most Used Mode: walk
User: 085, Most Used Mode: walk
User: 082, Most Used Mode: walk
User: 076, Most Used Mode: car
User: 078, Most Used Mode: walk
User: 065, Most Used Mode: bike
User: 091, Most Used Mode: walk
User: 096, Most Used Mode: bike
User: 062, Most Used Mode: bus
User: 053, Most Used Mode: walk
User: 098, Most Used Mode: walk
User: 052, Most Used Mode: bus
User: 097, Most Used Mode: bike
User: 064, Most Used Mode: walk
User: 174, Most Used Mode: car
User: 129, Most Used Mode: walk
User: 116, Most Used Mode: bike
User: 111, Most Used Mode: taxi
User: 118, Most Used Mode: car
User: 144, Most Used Mode: car
User: 175, Most Used Mode: walk
User: 126, Most Used Mode: walk
User: 110, Most Used Mode: walk
User: 128, Most Used Mode: car
User: 117, Most Used Mode: walk
U

----

In [5]:
connector.close_connection()


-----------------------------------------------
Connection to mongodb-db is closed
