In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

# Add /src modules to path
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Import modules
from src.utils import DbConnector, haversine_np
import pandas as pd
import numpy as np
from pprint import pprint
from datetime import datetime, timedelta
from collections import Counter




In [3]:
# Connect to MySQL database

connector = DbConnector()
db = connector.db

You are connected to the database: mongodb
-----------------------------------------------



----

In [4]:
activities = db["ActivityCollection"]

activities_df = pd.DataFrame(list(activities.find({})))


In [5]:
activities_df[activities_df.track_points.astype(str) == "[]"]

Unnamed: 0,_id,user,start_date_time,end_date_time,transportation_mode,track_points


In [6]:
activities_df2 = activities_df.drop(columns=["track_points"])
activities_df2[activities_df2.transportation_mode == "taxi"]["user"].unique().shape

(29,)

In [7]:
trackpoints = activities_df[activities_df.track_points.astype(str) != "[]"]["track_points"]

In [8]:
count = 0
# {'track_points.lat': 39.916, 'track_points.lon': 116.397})
for i, e in enumerate(trackpoints):
    for tp in e:
        if round(tp["latitude"], 3) == 39.916 and round(tp["longitude"], 3) == 116.397:
            count += 1

In [9]:
count

42

### Task 1
How many users, activities and trackpoints are there in the dataset?

In [10]:
collection = db["ActivityCollection"]

users_count = db.UserCollection.count_documents({})
activities_count = db.ActivityCollection.count_documents({})
trackpoints_count = db.ActivityCollection.aggregate(
    [{"$unwind": "$track_points"}, {"$count": "trackpoint_count"}]
)
trackpoints_count = next(trackpoints_count, {}).get("trackpoint_count", 0)


print(f"Total Users: {users_count}")
print(f"Total Activities: {activities_count}")
print(f"Total Trackpoints: {trackpoints_count}")

Total Users: 182
Total Activities: 15641
Total Trackpoints: 8501885


### Task 2
Find the average number of activities per user

In [11]:
average_activities_per_user = activities_count / users_count
print(f"Average Activities/User: {average_activities_per_user:.2f}")

Average Activities/User: 85.94


### Task 3
Find the top 20 users with the highest number of activities. 

In [39]:
users_activities_count = list(
    db["ActivityCollection"].aggregate(
        [
            {"$group": {"_id": "$user", "activity_count": {"$sum": 1}}},
            {"$sort": {"activity_count": -1}},
            {"$limit": 20},
        ]
    )
)

print("Top 20 Users with Most Activities:")

for user in users_activities_count:
    print(f"User: {user['_id']}, Activities: {user['activity_count']}")

Top 20 Users with Most Activities:
User: 085, Activities: 1089
User: 153, Activities: 977
User: 068, Activities: 920
User: 128, Activities: 876
User: 167, Activities: 810
User: 025, Activities: 715
User: 062, Activities: 538
User: 126, Activities: 420
User: 084, Activities: 411
User: 010, Activities: 402
User: 041, Activities: 399
User: 163, Activities: 354
User: 004, Activities: 346
User: 140, Activities: 345
User: 179, Activities: 305
User: 052, Activities: 282
User: 017, Activities: 265
User: 003, Activities: 261
User: 014, Activities: 236
User: 030, Activities: 210


In [43]:
df = pd.DataFrame(users_activities_count)
df = df.rename(columns={'_id':'user_id'})
df

Unnamed: 0,user_id,activity_count
0,85,1089
1,153,977
2,68,920
3,128,876
4,167,810
5,25,715
6,62,538
7,126,420
8,84,411
9,10,402


### Task 4
Find all users who have taken a taxi. 

In [15]:
users_taken_taxi = db["ActivityCollection"].distinct(
    "user", {"transportation_mode": "taxi"}
)

print("Users who have taken a taxi:")
pprint(users_taken_taxi)

Users who have taken a taxi:
['010',
 '021',
 '052',
 '056',
 '058',
 '062',
 '065',
 '068',
 '075',
 '078',
 '080',
 '082',
 '084',
 '085',
 '098',
 '102',
 '105',
 '111',
 '114',
 '126',
 '128',
 '139',
 '153',
 '154',
 '161',
 '163',
 '167',
 '175',
 '179']


### Task 5
Find all types of transportation modes and count how many activities that are
tagged with these transportation mode labels. Do not count the rows where
the mode is null.

In [17]:
transportation_modes = list(
    db["ActivityCollection"].aggregate(
        [
            {"$group": {"_id": "$transportation_mode", "count": {"$sum": 1}}},
            {"$match": {"_id": {"$ne": None}}},
        ]
    )
)

print("Transportation Modes and Their Counts:")

for mode in transportation_modes:
    print(f"Mode: {mode['_id']}, Count: {mode['count']}")

Transportation Modes and Their Counts:
Mode: walk, Count: 3927
Mode: bike, Count: 1519
Mode: run, Count: 4
Mode: car, Count: 751
Mode: subway, Count: 613
Mode: boat, Count: 7
Mode: bus, Count: 1820
Mode: motorcycle, Count: 2
Mode: train, Count: 134
Mode: airplane, Count: 13
Mode: taxi, Count: 512


In [37]:
df = pd.DataFrame(transportation_modes)
df = df.sort_values('count', ascending=False)
df = df.rename(columns={'_id':'transportation_mode', 'count':'tagged_count'})
df = df.reset_index(drop=True)
df

Unnamed: 0,transportation_mode,tagged_count
0,walk,3927
1,bus,1820
2,bike,1519
3,car,751
4,subway,613
5,taxi,512
6,train,134
7,airplane,13
8,boat,7
9,run,4


### Task 6

#### a) Find the year with the most activities. 

In [44]:
year_most_activities = list(
    db.ActivityCollection.aggregate(
        [
            {"$project": {"year": {"$year": "$start_date_time"}}},
            {"$group": {"_id": "$year", "activity_count": {"$sum": 1}}},
            {"$sort": {"activity_count": -1}},
            {"$limit": 1},
        ]
    )
)[0]

print(f"Year with most activities: {year_most_activities['_id']}")

Year with most activities: 2008


#### b) Is this also the year with most recorded hours?

This aggregation pipeline first calculates the number of hours for each activity by subtracting the start_date_time from the end_date_time (resulting in a duration in milliseconds) and then dividing by the number of milliseconds in an hour (3,600,000). It then groups the data by year and sums the total hours, sorts by total hours in descending order, and limits the results to the year with the highest number of hours.

In [45]:
year_most_hours = list(
    db.ActivityCollection.aggregate(
        [
            {
                "$project": {
                    "year": {"$year": "$start_date_time"},
                    "hours": {
                        "$divide": [
                            {"$subtract": ["$end_date_time", "$start_date_time"]},
                            3600000,  # Number of milliseconds in an hour
                        ]
                    },
                }
            },
            {"$group": {"_id": "$year", "total_hours": {"$sum": "$hours"}}},
            {"$sort": {"total_hours": -1}},
            {"$limit": 1},
        ]
    )
)[0]

print(
    f"Year with most recorded hours: {year_most_hours['_id']}, Hours: {year_most_hours['total_hours']:.2f}"
)

Year with most recorded hours: 2009, Hours: 9765.33


### Task 7
Find the total distance (in km) walked in 2008, by user with id=112.

In [35]:
user_id = "112"
activities_for_user = db.ActivityCollection.find({
    "user": user_id,
    "transportation_mode": "walk",
    "start_date_time": {"$gte": datetime(2008, 1, 1), "$lt": datetime(2009, 1, 1)}
})

total_distance = 0.0

for activity in activities_for_user:
    track_points = activity["track_points"]
    
    # Split track points into latitudes, longitudes for easier computation
    lats = np.array([point["latitude"] for point in track_points])
    longs = np.array([point["longitude"] for point in track_points])
    
    # Calculate the haversine distance for each consecutive track point and sum them up
    total_distance += np.sum(haversine_np(longs[:-1], lats[:-1], longs[1:], lats[1:]))

print(f"Total distance walked by user {user_id} in 2008: {total_distance:.2f} km")


Total distance walked by user 112 in 2008: 223.15 km


### Task 8
Find the top 20 users who have gained the most altitude meters.
- Output should be a field with (id, total meters gained per user).
- Remember that some altitude-values are invalid

In [54]:
def calculate_altitude_gain(track_points):
    """
    Calculate the altitude gain from a list of track points.
    """
    total_gain = 0
    for i in range(1, len(track_points)):
        altitude_difference = track_points[i]['altitude'] - track_points[i-1]['altitude']
        if altitude_difference > 0:
            total_gain += altitude_difference
    return total_gain

def top_users_by_altitude_gain():
    # Connect to the MongoDB instance
    activity_collection = db['ActivityCollection']

    user_altitude_gains = {}

    # Iterate over all activities in the ActivityCollection
    for activity in activity_collection.find():
        user = activity['user']
        altitude_gain = calculate_altitude_gain(activity['track_points'])
        
        if user in user_altitude_gains:
            user_altitude_gains[user] += altitude_gain
        else:
            user_altitude_gains[user] = altitude_gain

    # Sort users by their total altitude gain
    sorted_users = sorted(user_altitude_gains.items(), key=lambda x: x[1], reverse=True)

    # Get top 20 users
    top_20_users = sorted_users[:20]

    #for user, gain in top_20_users:
        #print(f"User ID: {user}, Altitude Gain: {gain} meters")

    return top_20_users

top_users_by_altitude_gain()


[('004', 1089358),
 ('128', 873882.6328083987),
 ('085', 852330),
 ('041', 789924.0999999996),
 ('003', 766613),
 ('030', 576377),
 ('062', 500368.4000000001),
 ('039', 481311),
 ('084', 435454),
 ('167', 426178.593175853),
 ('000', 398638),
 ('002', 377947),
 ('153', 370663.81679790025),
 ('025', 358131.79999999993),
 ('037', 325572.79999999946),
 ('140', 311175.52283464593),
 ('052', 268203),
 ('017', 205319.39999999988),
 ('034', 201543.4999999998),
 ('042', 201220.9000000001)]

In [69]:
df = pd.DataFrame(top_users_by_altitude_gain())
df = df.rename(columns={0:'user_id',1:'meters_altitude_gain'})
pd.options.display.float_format = '{:.0f}'.format
df.sort_values('meters_altitude_gain', ascending=False)

Unnamed: 0,user_id,meters_altitude_gain
0,4,1089358
1,128,873883
2,85,852330
3,41,789924
4,3,766613
5,30,576377
6,62,500368
7,39,481311
8,84,435454
9,167,426179


### Task 9
Find all users who have invalid activities, and the number of invalid activities
per user.
- An invalid activity is defined as an activity with consecutive trackpoints
where the timestamps deviate with at least 5 minutes. 

In [84]:
def has_invalid_activity(track_points):
    """
    Check if the activity has any consecutive trackpoints with a timestamp deviation of at least 5 minutes.
    """
    for i in range(1, len(track_points)):
        time_difference = track_points[i]['date_time'] - track_points[i-1]['date_time']
        if time_difference >= timedelta(minutes=5):
            return True
    return False

def users_with_invalid_activities():
    # Connect to the MongoDB instance
    activity_collection = db['ActivityCollection']

    user_invalid_activity_counts = {}

    # Iterate over all activities in the ActivityCollection
    for activity in activity_collection.find():
        user = activity['user']
        if has_invalid_activity(activity['track_points']):
            if user in user_invalid_activity_counts:
                user_invalid_activity_counts[user] += 1
            else:
                user_invalid_activity_counts[user] = 1

    # Filter for users with at least one invalid activity
    users_with_invalids = {user: count for user, count in user_invalid_activity_counts.items() if count > 0}

    #for user, invalid_count in users_with_invalids.items():
        #print(f"User ID: {user}, Invalid Activities: {invalid_count}")
    
    return users_with_invalids

users_with_invalid_activities()

{'135': 5,
 '132': 3,
 '103': 24,
 '168': 19,
 '157': 9,
 '150': 16,
 '159': 5,
 '166': 2,
 '161': 2,
 '102': 9,
 '105': 14,
 '133': 4,
 '134': 31,
 '158': 9,
 '167': 135,
 '151': 1,
 '169': 9,
 '024': 27,
 '023': 11,
 '015': 46,
 '012': 43,
 '079': 2,
 '046': 13,
 '041': 201,
 '048': 1,
 '077': 3,
 '083': 15,
 '084': 155,
 '070': 5,
 '013': 29,
 '014': 118,
 '022': 55,
 '025': 263,
 '071': 29,
 '085': 219,
 '082': 15,
 '076': 7,
 '040': 17,
 '078': 7,
 '047': 6,
 '065': 23,
 '091': 12,
 '096': 10,
 '062': 184,
 '054': 2,
 '053': 8,
 '098': 7,
 '038': 58,
 '007': 30,
 '000': 101,
 '009': 31,
 '036': 34,
 '031': 3,
 '052': 117,
 '099': 11,
 '055': 15,
 '063': 8,
 '097': 14,
 '090': 3,
 '064': 9,
 '030': 112,
 '008': 16,
 '037': 100,
 '001': 45,
 '039': 147,
 '006': 17,
 '174': 2,
 '180': 2,
 '173': 5,
 '145': 5,
 '142': 52,
 '129': 5,
 '111': 10,
 '127': 4,
 '144': 5,
 '172': 9,
 '181': 14,
 '175': 5,
 '121': 4,
 '119': 22,
 '126': 97,
 '128': 238,
 '117': 1,
 '153': 148,
 '154': 10,
 '

In [92]:
df = pd.DataFrame(list(users_with_invalid_activities().items()))
df = df.rename(columns={0:'user_id',1:'invalid_activities_count'})
df.sort_values('user_id').reset_index(drop=True)

Unnamed: 0,user_id,invalid_activities_count
0,000,101
1,001,45
2,002,98
3,003,179
4,004,219
...,...,...
157,175,5
158,176,8
159,179,53
160,180,2


### Task 10
Find the users who have tracked an activity in the Forbidden City of Beijing.
- In this question you can consider the Forbidden City to have
coordinates that correspond to: lat 39.916, lon 116.397.

In [155]:
coordinates = {"latitude": 39.916, "longitude": 116.397}

pipeline = [
    {"$unwind": "$track_points"},
    {
        "$match": {
            "track_points.latitude": {
                "$gte": coordinates["latitude"],
                "$lte": coordinates["latitude"] + 0.001,
            },
            "track_points.longitude": {
                "$gte": coordinates["longitude"],
                "$lte": coordinates["longitude"] + 0.001,
            },
        }
    },
    {"$group": {"_id": "$user"}},
]

output = db.ActivityCollection.aggregate(pipeline)
output = [user["_id"] for user in output]
output

['018', '019']

### Task 11
Find all users who have registered transportation_mode and their most used
transportation_mode. 
- The answer should be on format (user_id,
most_used_transportation_mode) sorted on user_id.
- Some users may have the same number of activities tagged with e.g.
walk and car. In this case it is up to you to decide which transportation
mode to include in your answer (choose one).
- Do not count the rows where the mode is null.

In [79]:
users_most_used_mode = {}
all_users = db.UserCollection.find({})

for user in all_users:
    user_id = user['_id']
    user_activities = list(db.ActivityCollection.find({'user': user_id, 'transportation_mode': {'$ne': None}}))
    modes = [activity['transportation_mode'] for activity in user_activities]

    modes_count = Counter(modes)
    if modes_count:
        users_most_used_mode[user_id] = modes_count.most_common(1)[0][0]

print("Users and their most used transportation mode:")

for user, mode in users_most_used_mode.items():
    print(f"User: {user}, Most Used Mode: {mode}")

Users and their most used transportation mode:
User: 104, Most Used Mode: bus
User: 161, Most Used Mode: walk
User: 102, Most Used Mode: walk
User: 105, Most Used Mode: walk
User: 167, Most Used Mode: walk
User: 084, Most Used Mode: walk
User: 085, Most Used Mode: walk
User: 082, Most Used Mode: walk
User: 076, Most Used Mode: car
User: 078, Most Used Mode: walk
User: 065, Most Used Mode: walk
User: 091, Most Used Mode: walk
User: 096, Most Used Mode: bike
User: 062, Most Used Mode: bus
User: 053, Most Used Mode: walk
User: 098, Most Used Mode: walk
User: 052, Most Used Mode: bus
User: 097, Most Used Mode: bike
User: 064, Most Used Mode: walk
User: 174, Most Used Mode: car
User: 129, Most Used Mode: bike
User: 111, Most Used Mode: taxi
User: 144, Most Used Mode: car
User: 175, Most Used Mode: walk
User: 126, Most Used Mode: walk
User: 110, Most Used Mode: bus
User: 128, Most Used Mode: car
User: 117, Most Used Mode: walk
User: 153, Most Used Mode: walk
User: 154, Most Used Mode: walk
U

In [93]:
df = pd.DataFrame(users_most_used_mode.items())
df = df.rename(columns={0:'user_id',1:'most_used_mode'})
df = df.sort_values('user_id')
df = df.reset_index(drop=True)
df

Unnamed: 0,user_id,most_used_mode
0,010,walk
1,020,bike
2,021,car
3,052,bus
4,053,walk
...,...,...
59,167,walk
60,170,walk
61,174,car
62,175,walk


----

In [5]:
connector.close_connection()


-----------------------------------------------
Connection to mongodb-db is closed
