In [31]:
import sys
# Extend the module search path so project-local packages can be imported
# (presumably this is the project root that contains the `connector` package).
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run on another machine until it is replaced by a relative/configurable path.
sys.path.append("C:/Users/aminp/OneDrive - NTNU/Dokumenter/NTNU/5år/Store, distribuerte datamengder/tdt4225_assignment3/")

In [46]:

from connector.mongo_connector import MongoConnector
from haversine import haversine, Unit
import pandas as pd
from datetime import datetime


In [47]:
def task_1(connection: MongoConnector):
    """Print how many documents the User, Activity and TrackPoint collections hold.

    Args:
        connection: Connector exposing the database as ``connection.db``.
    """
    def count_all(collection_name):
        # An empty filter matches every document in the collection.
        return connection.db[collection_name].count_documents({})

    # All three counts are fetched before anything is printed.
    n_users, n_activities, n_trackpoints = (
        count_all(name) for name in ("User", "Activity", "TrackPoint")
    )

    print(f'Number of users: {n_users}')
    print(f'Number of activities: {n_activities}')
    print(f'Number of trackpoints: {n_trackpoints}')
    
    

In [8]:
def task_2(connection: MongoConnector):
    """Print the average number of activities per user, rounded to two decimals.

    The average is the Activity document count divided by the User document
    count, so users without any activity still count in the denominator.
    If the User collection is empty the average is reported as 0.0 instead of
    raising ZeroDivisionError.

    Args:
        connection: Connector exposing the database as ``connection.db``.
    """
    users = connection.db["User"].count_documents({})
    activities = connection.db["Activity"].count_documents({})
    # Guard the division: an empty (e.g. not yet populated) database has no users.
    avg_numb = activities / users if users else 0.0
    print(f"Average activity count per user: {round(avg_numb, 2)}")

In [9]:
def task_3(connection: MongoConnector):
    """Print the 20 users with the most activities, highest count first.

    Args:
        connection: Connector exposing the database as ``connection.db``.
    """
    # One intermediate document per element of each user's `activities` array.
    explode_activities = {"$unwind": "$activities"}
    # Collapse back to one document per user, counting the unwound elements.
    count_per_user = {
        "$group": {
            "_id": "$_id",
            "activity_count": {"$sum": 1},
        }
    }
    most_active_first = {"$sort": {"activity_count": -1}}
    keep_top_twenty = {"$limit": 20}

    ranking = connection.db["User"].aggregate(
        [explode_activities, count_per_user, most_active_first, keep_top_twenty]
    )

    for entry in ranking:
        user_id, activity_count = entry["_id"], entry["activity_count"]
        print(f'User: {user_id}, Activity_count: {activity_count} ')
    

In [10]:
def task_4(connection: MongoConnector):
    """Print each user that has at least one activity labelled "taxi", once per user.

    Users are printed in the order their first taxi activity is returned by
    the query.

    Args:
        connection: Connector exposing the database as ``connection.db``.
    """
    already_printed = set()
    taxi_activities = connection.db["Activity"].find({"transportation_mode": "taxi"})

    for activity in taxi_activities:
        user_id, trans_mode = activity["user_id"], activity["transportation_mode"]

        # A user may have many taxi activities; only the first one is reported.
        if user_id in already_printed:
            continue

        print(f'User: {user_id}, Transportation_mode: {trans_mode}')
        already_printed.add(user_id)


In [11]:
def task_5(connection: MongoConnector):
    """Print every transportation mode with its activity count, most frequent first.

    Activities whose ``transportation_mode`` is null (or missing) are left out.

    Args:
        connection: Connector exposing the database as ``connection.db``.
    """
    # {"$ne": None} filters out documents where the field is null or absent.
    only_labelled = {"$match": {"transportation_mode": {"$ne": None}}}
    count_per_mode = {
        "$group": {
            "_id": "$transportation_mode",
            "count": {"$sum": 1},
        }
    }
    most_common_first = {"$sort": {"count": -1}}

    mode_counts = connection.db["Activity"].aggregate(
        [only_labelled, count_per_mode, most_common_first]
    )

    for entry in mode_counts:
        mode, occurrences = entry["_id"], entry["count"]
        print(f'Transport mode: {mode}, Count: {occurrences}')

In [12]:
def task_6a(connection: MongoConnector):
    """Print the number of activities per start year, busiest year first.

    An activity is attributed to the calendar year of its ``start_date_time``.

    Args:
        connection: Connector exposing the database as ``connection.db``.
    """
    extract_start_year = {
        "$project": {"start_year": {"$year": "$start_date_time"}}
    }
    count_per_year = {
        "$group": {"_id": "$start_year", "count": {"$sum": 1}}
    }
    busiest_first = {"$sort": {"count": -1}}

    yearly_counts = connection.db["Activity"].aggregate(
        [extract_start_year, count_per_year, busiest_first]
    )

    for entry in yearly_counts:
        start_year, total = entry["_id"], entry["count"]
        print(f'Year: {start_year}, num_activites: {total}')

In [13]:
def task_6b(connection: MongoConnector):
    """Print the total recorded hours per start year, largest total first.

    Each activity's duration (end minus start) is attributed in full to the
    calendar year of its ``start_date_time``.

    Args:
        connection: Connector exposing the database as ``connection.db``.
    """
    # Duration is measured in whole seconds and then converted to hours.
    elapsed_seconds = {
        "$dateDiff": {
            "startDate": "$start_date_time",
            "endDate": "$end_date_time",
            "unit": "second",
        }
    }
    year_and_hours = {
        "$project": {
            "start_year": {"$year": "$start_date_time"},
            "duration": {"$divide": [elapsed_seconds, 3600]},
        }
    }
    hours_per_year = {
        "$group": {
            "_id": "$start_year",
            "hours_count": {"$sum": "$duration"},
        }
    }
    most_hours_first = {"$sort": {"hours_count": -1}}

    yearly_hours = connection.db["Activity"].aggregate(
        [year_and_hours, hours_per_year, most_hours_first]
    )

    for entry in yearly_hours:
        start_year, hours = entry["_id"], entry["hours_count"]
        print(f'Year: {start_year}, Recorded hours: {hours}')

In [14]:


def haversine_distance(point1, point2):
    """Return the haversine distance between two points, in kilometers.

    Thin wrapper around ``haversine`` with the unit pinned to
    ``Unit.KILOMETERS``; both points are passed through unchanged.
    NOTE(review): the ``haversine`` package is understood to expect
    ``(latitude, longitude)`` pairs - confirm against its documentation.
    """
    result_unit = Unit.KILOMETERS
    return haversine(point1, point2, unit=result_unit)

def extract_tp_info(
    connection: MongoConnector,
    user_id: str = "112",
    transportation_mode: str = "walk",
    year: int = 2008,
):
    """Return a cursor over one user's trackpoints for a given mode and year.

    The trackpoints are returned ordered by ``activity_id`` and then
    ``date_time`` so that callers can safely treat neighbouring documents of
    the same activity as consecutive points of one track. Without an explicit
    ``$sort`` the server gives no ordering guarantee.

    Args:
        connection: Connector exposing the database as ``connection.db``.
        user_id: Value matched against the trackpoint's ``user_id`` field.
        transportation_mode: Value matched against ``transportation_mode``.
        year: Calendar year the trackpoint's ``date_time`` must fall in.

    Returns:
        The cursor produced by ``TrackPoint.aggregate``.
    """
    # Half-open interval [Jan 1 of `year`, Jan 1 of `year + 1`): unlike an
    # inclusive 23:59:59 upper bound it cannot miss timestamps that carry
    # fractional seconds in the last second of the year.
    range_start = datetime(year, 1, 1)
    range_end = datetime(year + 1, 1, 1)

    pipeline = [
        {
            "$match": {
                "user_id": user_id,
                "transportation_mode": transportation_mode,
                "date_time": {
                    "$gte": range_start,
                    "$lt": range_end,
                },
            }
        },
        {
            # Group each activity's points together, in chronological order.
            "$sort": {
                "activity_id": 1,
                "date_time": 1,
            }
        },
    ]

    return connection.db["TrackPoint"].aggregate(pipeline)
    

def task_7(connection: MongoConnector):
    """Print the total distance covered by the trackpoints from ``extract_tp_info``.

    Neighbouring trackpoints (in the order the cursor yields them) contribute
    the haversine distance between them, but only when both belong to the same
    activity, so no distance is added across activity boundaries.

    Args:
        connection: Connector exposing the database as ``connection.db``.
    """
    points = list(extract_tp_info(connection))

    total_distance = 0

    # Pair every point with its successor: (p0, p1), (p1, p2), ...
    for current, following in zip(points, points[1:]):
        coords_current = current["location"]["coordinates"]
        coords_following = following["location"]["coordinates"]

        if current["activity_id"] != following["activity_id"]:
            continue

        # The stored pair is swapped to (index 1, index 0) before measuring;
        # presumably stored as [lon, lat] and measured as (lat, lon).
        start = (coords_current[1], coords_current[0])
        end = (coords_following[1], coords_following[0])
        total_distance += haversine_distance(start, end)

    print(f'Distance walked by user 112 in 2008 in total:  {total_distance} km')


In [12]:
def task_8(connection: MongoConnector):
    """Print the 20 users with the largest total altitude gain, in meters.

    Trackpoints are read ordered by user, activity and time. Whenever a
    trackpoint is higher than the previous one of the same activity, the
    difference is added to that user's total. Descents are ignored.

    The aggregation cursor is consumed as a stream, keeping only the previous
    document in memory, instead of first copying the whole sorted TrackPoint
    result into a list.

    Args:
        connection: Connector exposing the database as ``connection.db``.
    """
    pipeline = [
        {
            "$match": {
                # NOTE(review): -777 is presumably the dataset's marker for an
                # invalid/unknown altitude - confirm against the data source.
                "altitude": {"$ne": -777}
            }
        },
        {
            "$sort": {
                "user_id": 1,
                "activity_id": 1,
                "date_time": 1
            }
        },
        {
            "$project": {
                "user_id": 1,
                "activity_id": 1,
                "altitude": 1,
                "date_time": 1
            }
        }
    ]

    # Conversion factor applied to every altitude difference (0.3048 m per
    # foot; assumes stored altitudes are in feet - TODO confirm).
    meters_per_unit = 0.3048

    altitudes = {}
    previous = None
    for current in connection.db["TrackPoint"].aggregate(pipeline):
        same_track = (
            previous is not None
            and previous["activity_id"] == current["activity_id"]
            and previous["user_id"] == current["user_id"]
        )
        # Only climbs within one activity count towards the total.
        if same_track and previous["altitude"] < current["altitude"]:
            gained_meters = (current["altitude"] - previous["altitude"]) * meters_per_unit
            user = previous["user_id"]
            altitudes[user] = altitudes.get(user, 0) + gained_meters
        previous = current

    user_with_most_altitude_meters = sorted(
        altitudes.items(), key=lambda item: item[1], reverse=True
    )

    for a in user_with_most_altitude_meters[:20]:
        print(f'User_id:  {a[0]}, total_altitude_meters_gained: {a[1]}')





In [53]:
def task_9(connection: MongoConnector):
    """Print, per user, how many activities contain a gap of at least 5 minutes.

    An activity is counted as invalid when at least one of its trackpoints lies
    300 seconds or more after the timestamp stored in its ``prev_date_time``
    field (presumably the timestamp of the preceding trackpoint of the same
    activity - verify against the insertion code). Users are listed with the
    highest number of invalid activities first.

    The gap is measured in seconds rather than minutes because ``$dateDiff``
    counts unit boundaries crossed, not elapsed time: with unit "minute",
    12:00:59 -> 12:05:00 yields 5 although only 4 min 1 s have passed.

    Args:
        connection: Connector exposing the database as ``connection.db``.
    """
    min_gap_seconds = 5 * 60

    pipeline = [
        {
            # Trackpoints without a predecessor have nothing to compare against.
            "$match": {
                "prev_date_time": {"$ne": None}
            }
        },
        {
            "$project": {
                "user_id": 1,
                "activity_id": 1,
                "time_difference": {
                    "$dateDiff": {
                        "startDate": "$prev_date_time",
                        "endDate": "$date_time",
                        "unit": "second"
                    }
                }
            }
        },
        {
            "$match": {
                "time_difference": {"$gte": min_gap_seconds}
            }
        },
        {
            # Collapse to one document per (user, activity) so an activity with
            # several gaps is only counted once.
            "$group": {
                "_id": {"user_id": "$user_id", "activity_id": "$activity_id"},
                "invalid": {"$max": 1}
            }
        },
        {
            "$group": {
                "_id": "$_id.user_id",
                "invalid_activity_count": {"$sum": 1}
            }
        },
        {
            "$sort": {"invalid_activity_count": -1}
        }
    ]

    invalid_activities = connection.db["TrackPoint"].aggregate(pipeline)
    for activity in invalid_activities:
        print(f'User_id: {activity["_id"]}, Invalid_activities_number: {activity["invalid_activity_count"]}')


In [16]:
def task_10(connection: MongoConnector):
    """Print the users with at least one trackpoint inside the Forbidden City box.

    The box is the inclusive coordinate range 116.397-116.398 on
    ``location.coordinates.0`` and 39.916-39.917 on ``location.coordinates.1``.

    Args:
        connection: Connector exposing the database as ``connection.db``.
    """
    inside_bounding_box = {
        "$match": {
            "location.coordinates.0": {"$gte": 116.397, "$lte": 116.398},
            "location.coordinates.1": {"$gte": 39.916, "$lte": 39.917},
        }
    }
    # Grouping on user_id alone deduplicates: one output document per user.
    one_row_per_user = {"$group": {"_id": "$user_id"}}

    print("Users that have been in forbidden city are: ")
    visitors = connection.db["TrackPoint"].aggregate(
        [inside_bounding_box, one_row_per_user]
    )
    for visitor in visitors:
        visitor_id = visitor["_id"]
        print(f'User: {visitor_id}')

In [17]:
"""
We first group by both user_id and transportation_mode to count occurrences for each mode per user. We then sort the results by user_id and count in descending order. 
After that, we use another $group stage to select the first (most used) mode for each user. Finally, we sort the results by user_id.
This approach ensures that we get the most used mode for each user, and if there are ties, it selects the first one based on the sort order.
"""

def task_11(connection: MongoConnector):
    """Print each user's most used transportation mode, ordered by user id.

    Activities without a transportation mode are ignored, so users that never
    labelled an activity do not appear. When a user has several modes with the
    same highest count, the alphabetically first mode is reported. The extra
    ``_id.trans_mode`` sort key makes that choice deterministic; without it
    the order of tied rows, and therefore the ``$first`` pick, is undefined.

    Args:
        connection: Connector exposing the database as ``connection.db``.
    """
    pipeline = [
        {
            "$match": {
                "transportation_mode": {"$ne": None}
            }
        },
        {
            # Usage count for every (user, mode) combination.
            "$group": {
                "_id": {"user_id": "$user_id", "trans_mode": "$transportation_mode"},
                "count": {"$sum": 1}
            }
        },
        {
            # Per user: highest count first, ties broken by mode name.
            "$sort": {"_id.user_id": 1, "count": -1, "_id.trans_mode": 1}
        },
        {
            # $first picks the top row of each user's sorted run.
            "$group": {
                "_id": "$_id.user_id",
                "most_used_trans_mode": {"$first": "$_id.trans_mode"},
                "count": {"$first": "$count"}
            }
        },
        {
            "$sort": {"_id": 1}
        }
    ]

    user_most_used_trans_mode = connection.db["Activity"].aggregate(pipeline)
    for doc in user_most_used_trans_mode:
        user_id = doc["_id"]
        trans_mode = doc["most_used_trans_mode"]
        count = doc["count"]
        print(f'User: {user_id}, Most used tran_mode is: {trans_mode} with count {count}')
        

In [57]:
def main():
    """Open a database connection, run the enabled task(s), then close it.

    Tasks are toggled by (un)commenting the calls below. The connection is
    closed in a ``finally`` block so it is released even when a task raises.
    """
    conn: MongoConnector = MongoConnector()

    try:
        #task_1(conn)
        #task_2(conn)
        #task_3(conn)
        #task_4(conn)
        #task_5(conn)
        #task_6a(conn)
        #task_6b(conn)
        #task_7(conn)
        task_8(conn)
        #task_9(conn)
        #task_10(conn)
        #task_11(conn)
    finally:
        # Always release the connection, also on errors or interruption.
        conn.close_connection()

# Run whichever task(s) are currently enabled inside main().
main()


You are connected to the database: mongo_db
-----------------------------------------------

User_id:  128, total_altitude_meters_gained: 615465.6600795912
User_id:  153, total_altitude_meters_gained: 495868.20985812164
User_id:  004, total_altitude_meters_gained: 294546.832800116
User_id:  041, total_altitude_meters_gained: 227222.79167998934
User_id:  003, total_altitude_meters_gained: 200865.3336000703
User_id:  085, total_altitude_meters_gained: 196490.81352013102
User_id:  163, total_altitude_meters_gained: 190135.57305897324
User_id:  144, total_altitude_meters_gained: 175432.6203200511
User_id:  030, total_altitude_meters_gained: 154214.16960000098
User_id:  062, total_altitude_meters_gained: 145087.23839997425
User_id:  039, total_altitude_meters_gained: 136360.50959998736
User_id:  084, total_altitude_meters_gained: 122486.31839999456
User_id:  002, total_altitude_meters_gained: 106269.73919997527
User_id:  000, total_altitude_meters_gained: 105622.64879999145
User_id:  167, t