In [7]:
#1. How many users, activities and trackpoints are there in the dataset (after it is inserted into the database).

from DbConnector import DbConnector
db = DbConnector() 
cursor = db.cursor

cursor.execute("SELECT COUNT(*) FROM User")
num_users = cursor.fetchone()[0]

cursor.execute("SELECT COUNT(*) FROM Activity")
num_activities = cursor.fetchone()[0]

cursor.execute("SELECT COUNT(*) FROM TrackPoint")
num_trackpoints = cursor.fetchone()[0]

print(f"Users: {num_users}, Activities: {num_activities}, Trackpoints: {num_trackpoints}")

Users: 182, Activities: 16048, Trackpoints: 9681756


In [10]:

#2. Find the average number of activities per user.

from DbConnector import DbConnector
db = DbConnector() 
cursor = db.cursor

cursor.execute("SELECT AVG(activity_count) FROM (SELECT COUNT(*) as activity_count FROM Activity GROUP BY user_id) as temp")
average_activities = cursor.fetchone()[0]

print(f"Average number of activities per user: {average_activities}")



Average number of activities per user: 92.7630


In [11]:
#3. Find the top 20 users with the highest number of activities. 

cursor.execute("""
    SELECT user_id, COUNT(*) as activity_count
    FROM Activity
    GROUP BY user_id
    ORDER BY activity_count DESC
    LIMIT 20
""")
top_users = cursor.fetchall()

for user in top_users:
    print(f"User ID: {user[0]}, Activity Count: {user[1]}")

User ID: 128, Activity Count: 2102
User ID: 153, Activity Count: 1793
User ID: 25, Activity Count: 715
User ID: 163, Activity Count: 704
User ID: 62, Activity Count: 691
User ID: 144, Activity Count: 563
User ID: 41, Activity Count: 399
User ID: 85, Activity Count: 364
User ID: 4, Activity Count: 346
User ID: 140, Activity Count: 345
User ID: 167, Activity Count: 320
User ID: 68, Activity Count: 280
User ID: 17, Activity Count: 265
User ID: 3, Activity Count: 261
User ID: 14, Activity Count: 236
User ID: 126, Activity Count: 215
User ID: 30, Activity Count: 210
User ID: 112, Activity Count: 208
User ID: 11, Activity Count: 201
User ID: 39, Activity Count: 198


In [12]:
#4. Find all users who have taken a taxi. 

cursor.execute("""
    SELECT DISTINCT user_id
    FROM Activity
    WHERE transportation_mode = 'taxi'
""")
taxi_users = cursor.fetchall()

print("Users who have taken a taxi:")
for user in taxi_users:
    print(user[0])

Users who have taken a taxi:
10
111
114
126
128
139
153
163
167
175
21
52
56
58
62
65
78
80
84
85
98


In [13]:
#5. Find all types of transportation modes and count how many activities that are tagged with these transportation mode labels. Do not count the rows where the mode is null.
cursor.execute("""
    SELECT transportation_mode, COUNT(*)
    FROM Activity
    WHERE transportation_mode IS NOT NULL
    GROUP BY transportation_mode
""")
modes_count = cursor.fetchall()

for mode, count in modes_count:
    print(f"{mode}: {count} activities")

taxi: 125 activities
walk: 1008 activities
bus: 471 activities
bike: 619 activities
car: 493 activities
run: 1 activities
train: 2 activities
subway: 190 activities
airplane: 4 activities
boat: 1 activities


In [15]:
#6. a) Find the year with the most activities. 

cursor.execute("""
    SELECT YEAR(start_date_time) as year, COUNT(*) as activity_count
    FROM Activity
    GROUP BY year
    ORDER BY activity_count DESC
    LIMIT 1
""")
most_activities_year = cursor.fetchone()

print(f"Year with the most activities: {most_activities_year[0]} with {most_activities_year[1]} activities")

Year with the most activities: 2008 with 5895 activities


In [19]:
#6. b) Is this also the year with most recorded hours?

cursor.execute("""
    SELECT YEAR(start_date_time) as year, SUM(TIMESTAMPDIFF(HOUR, start_date_time, end_date_time)) as total_hours
    FROM Activity
    GROUP BY year
    ORDER BY total_hours DESC
    LIMIT 1
""")
most_hours_year = cursor.fetchone()

print(f"Year with the most recorded hours: {most_hours_year[0]} with {most_hours_year[1]} hours")

Year with the most recorded hours: 2009 with 9165 hours


In [62]:
#7. Find the total distance (in km) walked in 2008, by user with id=112.


from DbConnector import DbConnector
db = DbConnector() 
cursor = db.cursor

import math
from haversine import haversine, Unit
#import haversine

# Haversine formula to calculate distance between two latitude/longitude points
#def haversine(lat1, lon1, lat2, lon2):
    #R = 6371  # radius of Earth in kilometers
    #dlat = math.radians(lat2 - lat1)
    #dlon = math.radians(lon2 - lon1)
    #a = math.sin(dlat / 2) * 2 + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dlon / 2) * 2
    #c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    #return R * c

# Query to get all trackpoints for user 112 in 2008 for walking activities
cursor.execute("""
    SELECT lat, lon
    FROM TrackPoint
    JOIN Activity ON TrackPoint.activity_id = Activity.id
    WHERE Activity.user_id = 112 AND YEAR(Activity.start_date_time) = 2008 AND Activity.transportation_mode = 'walk'
    ORDER BY TrackPoint.date_time
""")
trackpoints = cursor.fetchall()

# Calculate total distance walked
total_distance = 0
for i in range(1, len(trackpoints)):
    lat1, lon1 = trackpoints[i - 1]
    lat2, lon2 = trackpoints[i]
    total_distance += haversine((lat1, lon1), (lat2, lon2), unit=Unit.KILOMETERS)

print(f"Total distance walked by user 112 in 2008: {total_distance} km")

Total distance walked by user 112 in 2008: 305.8173753597801 km


In [63]:
#8. Find the top 20 users who have gained the most altitude meters.
from DbConnector import DbConnector
db = DbConnector() 
cursor = db.cursor


cursor.execute("""
    SELECT user_id, SUM(altitude_gain) as total_gain
    FROM (
        SELECT user_id, GREATEST(0, altitude - LAG(altitude) OVER (PARTITION BY activity_id ORDER BY date_time)) as altitude_gain
        FROM TrackPoint
        JOIN Activity ON TrackPoint.activity_id = Activity.id
        WHERE altitude >0
    ) as temp
    GROUP BY user_id
    ORDER BY total_gain DESC
    LIMIT 20
""")
altitude_users = cursor.fetchall()

print("Top 20 users by altitude gain:")
for user in altitude_users:
    print(f"User ID: {user[0]}, Total Altitude Gain: {user[1]}")

KeyboardInterrupt: 

In [54]:
#9. Find all users who have invalid activities, and the number of invalid activities per user

cursor.execute("""
    SELECT A.user_id, COUNT(DISTINCT A.id) AS invalid_activities
    FROM Activity A
    JOIN (
        SELECT activity_id, 
               TIMESTAMPDIFF(SECOND, LAG(date_time) OVER (PARTITION BY activity_id ORDER BY date_time), date_time) AS time_diff
        FROM TrackPoint
    ) TP ON A.id = TP.activity_id
    WHERE TP.time_diff > 300
    GROUP BY A.user_id
""")
invalid_activities_per_user = cursor.fetchall()

print("Users with invalid activities and the number of invalid activities:")
for user in invalid_activities_per_user:
    print(f"User ID: {user[0]}, Invalid Activities: {user[1]}")


KeyboardInterrupt: 

In [59]:
cursor.execute("""
    SELECT DISTINCT user_id
    FROM Activity
    JOIN Activity ON TrackPoint.activity_id = Activity.id
    WHERE ABS(TrackPoint.lat - 39.916) < 0.001 
      AND ABS(TrackPoint.lon - 116.397) < 0.001
""")
forbidden_city_users = cursor.fetchall()

print("Users who have tracked an activity in the Forbidden City:")
for user in forbidden_city_users:
    print(f"User ID: {user[0]}")


KeyboardInterrupt: 