In [1]:
from pprint import pprint
from DbConnector import DbConnector

In [51]:
class GeolifeQueries:

    def __init__(self):
        # Set up the database connection
        self.connection = DbConnector()
        self.client = self.connection.client
        self.db = self.connection.db

    def count_users_activities_trackpoints(self):
        """Task 1: Count users, activities, and trackpoints."""
        print("Task 1")
        num_users = self.db.User.count_documents({})
        num_activities = self.db.Activity.count_documents({})
        num_trackpoints = self.db.TrackPoint.count_documents({})
        print(f"Users: {num_users}, Activities: {num_activities}, Trackpoints: {num_trackpoints}")

    def average_activities_per_user(self):
        """Task 2: Find the average number of activities per user."""
        print("Task 2")
        num_activities = self.db.Activity.count_documents({})
        num_users = self.db.User.count_documents({})
        avg_activities = num_activities / num_users if num_users != 0 else 0
        print(f"Average activities per user: {avg_activities}")

    def top_20_users_by_activities(self):
        """Task 3: Find the top 20 users with the highest number of activities."""
        print("Task 3")
        pipeline = [
            {"$group": {"_id": "$user_id", "activity_count": {"$sum": 1}}},
            {"$sort": {"activity_count": -1}},
            {"$limit": 20}
        ]
        top_users = self.db.Activity.aggregate(pipeline)
        pprint(list(top_users))

    def users_who_took_taxi(self):
        """Task 4: Find and print all users who have taken a taxi."""
        print("Task 4")
        
        # Query to find documents where transportation_mode is "taxi"
        documents = self.db.Activity.find({"transportation_mode": "bus"}, {"user_id": 1, "_id": 0})
        
        # Set to hold unique user IDs
        taxi_users = set()
        
        # Loop through the documents and add user_ids to the set
        for doc in documents:
            taxi_users.add(doc["user_id"])
        
        # Print the unique user_ids
        pprint(taxi_users)

    def count_transportation_modes(self):
        """Task 5: Count activities with each transportation mode."""
        print("Task 5")
        pipeline = [
            {"$match": {"transportation_mode": {"$ne": None}}},
            {"$group": {"_id": "$transportation_mode", "count": {"$sum": 1}}}
        ]
        transport_modes = self.db.Activity.aggregate(pipeline)
        pprint(list(transport_modes))

    def year_with_most_activities(self):
        """Task 6a: Find the year with the most activities."""
        print("Task 6a")
        pipeline = [
            {"$group": {"_id": {"$year": "$start_time"}, "activity_count": {"$sum": 1}}},
            {"$sort": {"activity_count": -1}},
            {"$limit": 1}
        ]
        year = list(self.db.Activity.aggregate(pipeline))[0]
        print(f"Year with most activities: {year['_id']} ({year['activity_count']} activities)")

    def year_with_most_recorded_hours(self):
        """Task 6b: Find the year with the most recorded hours."""
        print("Task 6b")
        pipeline = [
            {"$project": {
                "year": {"$year": "$start_time"},
                "duration": {"$subtract": ["$end_time", "$start_time"]}
            }},
            {"$group": {"_id": "$year", "total_duration": {"$sum": "$duration"}}},
            {"$sort": {"total_duration": -1}},
            {"$limit": 1}
        ]
        year = list(self.db.Activity.aggregate(pipeline))[0]
        print(f"Year with most recorded hours: {year['_id']}")

    def total_distance_walked_by_user_112_in_2008(self):
        """Task 7: Find the total distance walked by user 112 in 2008."""
        print("Task 7")
        user_id = 112
        pipeline = [
            {"$match": {
                "user_id": user_id,
                "transportation_mode": "walk",
                "$expr": {"$eq": [{"$year": "$start_time"}, 2008]}
            }},
            {"$group": {"_id": None, "total_distance": {"$sum": "$distance"}}}
        ]
        total_distance = list(self.db.Activity.aggregate(pipeline))[0]["total_distance"]
        print(f"Total distance walked by user {user_id} in 2008: {total_distance} km")

    def top_20_users_by_altitude_gain(self):
        """Task 8: Find the top 20 users who have gained the most altitude."""
        print("Task 8")
        pipeline = [
            {"$unwind": "$trackpoints"},
            {"$group": {
                "_id": "$user_id",
                "total_altitude_gain": {
                    "$sum": {
                        "$cond": [
                            {"$gt": ["$trackpoints.altitude", 0]},
                            {"$subtract": ["$trackpoints.altitude", "$$ROOT.trackpoints.prev_altitude"]},
                            0
                        ]
                    }
                }
            }},
            {"$sort": {"total_altitude_gain": -1}},
            {"$limit": 20}
        ]
        users = self.db.Activity.aggregate(pipeline)
        pprint(list(users))

    def users_with_invalid_activities(self):
        """Task 9: Find all users with invalid activities."""
        print("Task 9")
        pipeline = [
            {"$unwind": "$trackpoints"},
            {"$group": {
                "_id": "$user_id",
                "invalid_activities_count": {"$sum": {
                    "$cond": [{"$gt": ["$trackpoints.time_diff", 300000]}, 1, 0]  # 5 minutes in milliseconds
                }}
            }},
            {"$match": {"invalid_activities_count": {"$gt": 0}}}
        ]
        invalid_users = self.db.Activity.aggregate(pipeline)
        pprint(list(invalid_users))

    def users_in_forbidden_city(self):
        """Task 10: Find users who have tracked activity in the Forbidden City of Beijing."""
        print("Task 10")
        forbidden_city_coords = {"lat": 39.916, "lon": 116.397}
        pipeline = [
            {"$unwind": "$trackpoints"},
            {"$match": {
                "trackpoints.latitude": forbidden_city_coords["lat"],
                "trackpoints.longitude": forbidden_city_coords["lon"]
            }},
            {"$group": {"_id": "$user_id"}}
        ]
        users = self.db.Activity.aggregate(pipeline)
        pprint(list(users))

    def most_used_transportation_mode_per_user(self):
        """Task 11: Find all users with registered transportation mode and their most used mode."""
        print("Task 11")
        pipeline = [
            {"$match": {"transportation_mode": {"$ne": None}}},
            {"$group": {"_id": {"user_id": "$user_id", "mode": "$transportation_mode"}, "count": {"$sum": 1}}},
            {"$sort": {"_id.user_id": 1, "count": -1}},
            {"$group": {"_id": "$_id.user_id", "most_used_mode": {"$first": "$_id.mode"}}}
        ]
        users_modes = self.db.Activity.aggregate(pipeline)
        pprint(list(users_modes))

    def close(self):
        """Close the database connection."""
        self.connection.close_connection()

In [53]:
program = GeolifeQueries()

You are connected to the database: my_db
-----------------------------------------------



In [44]:
program.count_users_activities_trackpoints()
program.average_activities_per_user()

Task 1
Users: 182, Activities: 16048, Trackpoints: 9681756
Task 2
Average activities per user: 88.17582417582418


In [45]:
program.top_20_users_by_activities()



Task 3
[{'_id': '128', 'activity_count': 2102},
 {'_id': '153', 'activity_count': 1793},
 {'_id': '025', 'activity_count': 715},
 {'_id': '163', 'activity_count': 704},
 {'_id': '062', 'activity_count': 691},
 {'_id': '144', 'activity_count': 563},
 {'_id': '041', 'activity_count': 399},
 {'_id': '085', 'activity_count': 364},
 {'_id': '004', 'activity_count': 346},
 {'_id': '140', 'activity_count': 345},
 {'_id': '167', 'activity_count': 320},
 {'_id': '068', 'activity_count': 280},
 {'_id': '017', 'activity_count': 265},
 {'_id': '003', 'activity_count': 261},
 {'_id': '014', 'activity_count': 236},
 {'_id': '126', 'activity_count': 215},
 {'_id': '030', 'activity_count': 210},
 {'_id': '112', 'activity_count': 208},
 {'_id': '011', 'activity_count': 201},
 {'_id': '039', 'activity_count': 198}]


In [54]:
# Doesnt work as all transportation modes are ''
program.users_who_took_taxi()


Task 4
set()


In [55]:
# Doesnt work as all transportation modes are ''
program.count_transportation_modes()


Task 5
[{'_id': '', 'count': 16048}]


In [56]:
program.year_with_most_activities()


Task 6a
Year with most activities: None (16048 activities)
