In [12]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
%autoreload 2

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.utils import MySQLConnector
import pandas as pd
from haversine import haversine

In [14]:
connector = MySQLConnector()
cursor = connector.cursor
db_connection = connector.db_connection

Connected to: 8.1.0
You are connected to the database: ('mysql',)
-----------------------------------------------



### Task 1
How many users, activities and trackpoints are there in the dataset?

In [16]:
query = """
    SELECT 
    (SELECT COUNT(*) FROM UserTable) AS user_count,
    (SELECT COUNT(*) FROM ActivityTable) AS activity_count,
    (SELECT COUNT(*) FROM TrackPointTable) AS trackpoint_count;
"""

cursor.execute(query)
result = cursor.fetchall()
result_df = pd.DataFrame(result, columns=['user_count', 'activity_count', 'trackpoint_count'])
result_df

Unnamed: 0,user_count,activity_count,trackpoint_count
0,182,14715,3851557


### Task 2
Find the average, maximum and minimum number of trackpoints per user.

In [20]:
query = """
    SELECT
    AVG(count) AS average,
    MAX(count) AS max,
    MIN(count) AS min
    
    FROM 
    (SELECT user_id, COUNT(*) AS count
    FROM TrackPointTable 
    JOIN ActivityTable ON TrackPointTable.activity_id = ActivityTable.id
    GROUP BY user_id) AS TPperUser;
"""

cursor.execute(query)
result = cursor.fetchall()
result_df = pd.DataFrame(result, columns=['average', 'max', 'min'])
result_df

Unnamed: 0,average,max,min
0,60180.5781,603380,7


### Task 3
Find the top 15 users with the highest number of activities.

In [22]:
query = """
    SELECT user_id, COUNT(*) AS activity_count
    FROM ActivityTable
    GROUP BY user_id
    ORDER BY activity_count DESC
    LIMIT 15; 
"""

cursor.execute(query)
result = cursor.fetchall()

result_df = pd.DataFrame(result, columns=['user_id', 'number_of_activities'])
result_df

Unnamed: 0,user_id,number_of_activities
0,163,3182
1,85,1298
2,153,1123
3,68,969
4,167,944
5,128,937
6,62,782
7,75,509
8,126,468
9,10,434


### Task 4
Find all users who have taken a bus.

In [23]:
query = """
    SELECT DISTINCT user_id
    FROM ActivityTable
    WHERE transportation_mode = 'bus';
"""

cursor.execute(query)
result = cursor.fetchall()

print(result)

[('010',), ('020',), ('052',), ('053',), ('058',), ('062',), ('064',), ('065',), ('067',), ('068',), ('069',), ('073',), ('075',), ('078',), ('080',), ('081',), ('082',), ('084',), ('085',), ('091',), ('092',), ('096',), ('098',), ('100',), ('101',), ('102',), ('104',), ('105',), ('108',), ('110',), ('111',), ('112',), ('114',), ('125',), ('126',), ('128',), ('129',), ('138',), ('139',), ('141',), ('147',), ('153',), ('154',), ('161',), ('163',), ('167',), ('174',), ('175',), ('179',)]


### Task 5
List the top 10 users by their amount of different transportation modes.

In [24]:
query = """
    SELECT user_id, COUNT(DISTINCT transportation_mode) AS distinct_transportation_mode_count
    FROM ActivityTable
    GROUP BY user_id
    ORDER BY distinct_transportation_mode_count DESC
    LIMIT 10;
"""

cursor.execute(query)
result = cursor.fetchall()

result_df = pd.DataFrame(result, columns=['user_id', 'amount_of_different_transportation_modes'])
result_df

Unnamed: 0,user_id,amount_of_different_transportation_modes
0,128,10
1,75,9
2,62,9
3,163,9
4,167,8
5,153,8
6,84,8
7,126,8
8,20,7
9,85,7


### Task 6
Find activities that are registered multiple times. You should find the query even
if it gives zero result.

In [25]:
query = """
    SELECT user_id, transportation_mode, start_date_time, end_date_time, COUNT(*)
    FROM ActivityTable
    GROUP BY user_id, transportation_mode, start_date_time, end_date_time
    HAVING COUNT(*) > 1;
"""

cursor.execute(query)
result = cursor.fetchall()

result

[]

### Task 7

#### a) 
Find the number of users that have started an activity in one day and ended
the activity the next day.

In [32]:
query = """
    SELECT user_id, transportation_mode, TIMESTAMPDIFF(MINUTE, start_date_time, end_date_time) AS duration_minutes
    FROM ActivityTable
    WHERE DATE(start_date_time) != DATE(end_date_time);
"""

cursor.execute(query)
result = cursor.fetchall()

result_df = pd.DataFrame(result, columns=['user_id', 'transportation mode', 'duration (min)'])
print("Number of users", result_df["user_id"].unique().shape[0])


Number of users 38


#### b)
List the transportation mode, user id and duration for these activities.

In [33]:
result_df

Unnamed: 0,user_id,transportation mode,duration (min)
0,098,taxi,730
1,106,car,1439
2,153,taxi,556
3,021,car,360
4,076,car,538
...,...,...,...
398,163,car,1079
399,163,car,1079
400,163,car,1019
401,163,car,1019


### Task 8
Find the number of users which have been close to each other in time and space.
Close is defined as the same space (50 meters) and for the same half minute (30
seconds)

In [52]:
query = """
    ALTER TABLE TrackPointTable
    ADD COLUMN grid_lat INT,
    ADD COLUMN grid_lon INT;

    UPDATE TrackPointTable
    SET grid_lat = FLOOR(ST_Y(geom_point) / 0.09),
        grid_lon = FLOOR(ST_X(geom_point) / 0.09);
"""

cursor.execute(query)
result = cursor.fetchall()

In [None]:
query = """
    CREATE TEMPORARY TABLE ActivityGrids AS
    SELECT DISTINCT activity_id, grid_lat, grid_lon
    FROM TrackPointTable;
"""

cursor.execute(query)
result = cursor.fetchall()

In [None]:
query = """
    SELECT DISTINCT a1.activity_id AS activity1, a2.activity_id AS activity2
    FROM TrackPointTable tp1
    JOIN ActivityGrids a1 ON tp1.activity_id = a1.activity_id
    JOIN TrackPointTable tp2 ON a1.grid_lat = tp2.grid_lat AND a1.grid_lon = tp2.grid_lon
    JOIN ActivityGrids a2 ON tp2.activity_id = a2.activity_id
    WHERE 
    tp1.activity_id != tp2.activity_id
    AND ST_Distance_Sphere(tp1.geom_point, tp2.geom_point) < 50
    AND ABS(TIMESTAMPDIFF(SECOND, tp1.date_time, tp2.date_time)) <= 30;
"""

cursor.execute(query)
result = cursor.fetchall()

In [12]:
query = """
    SELECT id, activity_id, date_time, lat, lon, altitude, ST_AsText(geom_point) FROM TrackPointTable
    LIMIT 4000000;
"""

cursor.execute(query)
result = cursor.fetchall()

In [13]:
result_df = pd.DataFrame(result, columns=["id", "activity_id", "date_time", "lat", "lon", "altitude", "point"])
result_df.sort_values(by="lon")

Unnamed: 0,id,activity_id,date_time,lat,lon,point
1992190,1992191,17251281,2009-03-06 02:34:00,57.6287,-179.970,POINT(-179.9695933 57.6286499)
1992189,1992190,17251281,2009-03-06 02:33:50,57.6380,-179.936,POINT(-179.9360666 57.6380333)
1992188,1992189,17251281,2009-03-06 02:33:40,57.6474,-179.902,POINT(-179.9024933 57.6473833)
1992187,1992188,17251281,2009-03-06 02:33:30,57.6567,-179.869,POINT(-179.8688633 57.6566933)
1992186,1992187,17251281,2009-03-06 02:33:20,57.6660,-179.835,POINT(-179.8352416 57.6660132)
...,...,...,...,...,...,...
1992195,1992196,17251281,2009-03-06 02:34:50,57.5817,179.864,POINT(179.863745 57.5817232)
1992194,1992195,17251281,2009-03-06 02:34:40,57.5910,179.897,POINT(179.8970266 57.5910416)
1992193,1992194,17251281,2009-03-06 02:34:30,57.6004,179.930,POINT(179.93025 57.6004416)
1992192,1992193,17251281,2009-03-06 02:34:20,57.6098,179.964,POINT(179.9635583 57.6098399)


In [35]:
import rtree
import itertools

In [37]:
import rtree
import itertools

# Initialize an R-tree spatial index
spatial_index = rtree.index.Index()

# Assuming you have a list of user data points in the format (timestamp, x, y, altitude)
user_data = result_df[["date_time", "lat", "lon", "altitude"]].values.tolist()

# Insert data points into the spatial index
for idx, (timestamp, x, y, altitude) in enumerate(user_data):
    spatial_index.insert(idx, (x, y, altitude, x, y, altitude))

# Function to find close users within a batch
def find_close_users_in_batch(batch):
    close_user_count = 0
    for i, j in itertools.combinations(batch, 2):
        idx1, (ts1, x1, y1) = i
        idx2, (ts2, x2, y2) = j
        if abs(ts1 - ts2) <= 30 and haversine((x1, y1), (x2, y2)) <= 50:
            close_user_count += 1
    return close_user_count

batch_size = 1000  # Adjust batch size as needed
total_close_user_count = 0

# Process data in batches
for batch_start in range(0, len(user_data), batch_size):
    batch = user_data[batch_start:batch_start+batch_size]
    close_user_count = find_close_users_in_batch(batch)
    total_close_user_count += close_user_count

print("Number of users close in time and space:", total_close_user_count)


KeyError: "['altitude'] not in index"

In [26]:
lat_step = (result_df["lat"].max() - result_df["lat"].min()) // 20
lon_step = (result_df["lon"].max() - result_df["lon"].min()) // 50
lat_min = result_df["lat"].min()
lat_max = result_df["lat"].max()
lon_min = result_df["lon"].min()
lon_max = result_df["lon"].max()

In [30]:
result_df["date_time"] = pd.to_datetime(result_df["date_time"])

In [33]:
clusterer = DBSCAN()

44694
2605708
898127
6360


In [29]:
shapes.sort(key=lambda x: x.shape[0], reverse=True)
shapes[0].shape

(3619814, 6)

In [51]:
x = 6
y = 13

query = f"""
    SELECT id, date_time, lat, lon
    FROM TrackPointTable a WHERE 
        a.lat < {lat_min + lat_step * (x + 1)} AND 
        a.lon < {lon_min + lon_step * (y + 1)} AND 
        a.lat > {lat_min + lat_step * x} AND 
        a.lon > {lon_min + lon_step * y};
"""

cursor.execute(query)
new_result = cursor.fetchall()

new_result_df = pd.DataFrame(new_result, columns=["id", "date_time", "lat", "lon"])
new_result_df

Unnamed: 0,id,date_time,lat,lon
0,1836244,2008-09-30 05:46:39,48.2524,120.249
1,1836245,2008-09-30 05:46:49,48.2679,120.242
2,1836246,2008-09-30 05:46:59,48.2834,120.235
3,1836247,2008-09-30 05:47:09,48.2988,120.228
4,1836248,2008-09-30 05:47:19,48.3142,120.222
...,...,...,...,...
7151,1843395,2008-10-05 03:43:52,49.2132,119.741
7152,1843396,2008-10-05 03:43:58,49.2130,119.741
7153,1843397,2008-10-05 03:44:00,49.2131,119.741
7154,1843398,2008-10-05 03:44:02,49.2130,119.741


In [31]:
query = """
    SELECT user_id, geom_point
    FROM ActivityTable
    JOIN TrackPointTable ON ActivityTable.id = TrackPointTable.activity_id
    WHERE user_id = "128"
"""

cursor.execute(query)
result = cursor.fetchall()
result_df = pd.DataFrame(result, columns=['user_id', 'point'])
result_df

Unnamed: 0,user_id,point
0,128,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 201, 154, 193, 242..."
1,128,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 80, 216, 21, 32, 2..."
2,128,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 12, 38, 46, 218, 2..."
3,128,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 41, 92, 143, 194, ..."
4,128,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 2, 224, 8, 101, 24..."
...,...,...
371304,128,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 77, 118, 125, 48, ..."
371305,128,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 20, 192, 131, 85, ..."
371306,128,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 249, 124, 57, 88, ..."
371307,128,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 166, 8, 21, 193, 3..."


In [44]:
query = """
    SELECT a.activity_id, b.activity_id
    FROM TrackPointTable a, TrackPointTable b
    WHERE ST_Distance_Sphere(a.geom_point, b.geom_point) < 50
    AND ABS(TIMESTAMPDIFF(SECOND, a.date_time, b.date_time)) <= 30;
"""

cursor.execute(query)
result = cursor.fetchall()

In [25]:
import threading

# Step 1: Query distinct activity_id values

# Function to process a group of activity_ids concurrently
def process_activity_group(activity_group):
    connector = MySQLConnector()
    cursor = connector.cursor
    db_connection = connector.db_connection
    
    for activity_id in activity_group:
        # Step 2: Create a temporary table for each group
        create_temp_table_query = f"CREATE TEMPORARY TABLE TempTable AS SELECT * FROM TrackPointTable WHERE activity_id = '{activity_id}';"
        cursor.execute(create_temp_table_query)
        
        # Step 3: Add a spatial index to the temporary table
        add_spatial_index_query = f"CREATE INDEX ON TempTable USING GIST (point);"
        cursor.execute(add_spatial_index_query)
        
        # Merge the temporary table back into the main table
        merge_data_query = f"INSERT INTO TrackPointTable (activity_id, lon, lat, point) SELECT activity_id, lon, lat, point FROM TempTable;"
        cursor.execute(merge_data_query)
        
        # Optionally, you can also delete data from the temporary table
        delete_temp_table_query = f"DROP TABLE TempTable;"
        cursor.execute(delete_temp_table_query)

    db_connection.commit()
    connector.close_connection()

# Split the activity_ids into groups
num_threads = 4  # Number of concurrent threads
activity_groups = [activity_ids[i:i + len(activity_ids) // num_threads] for i in range(0, len(activity_ids), len(activity_ids) // num_threads)]

# Create and start threads to process each group
threads = []
for activity_group in activity_groups:
    thread = threading.Thread(target=process_activity_group, args=(activity_group,))
    threads.append(thread)
    thread.start()

# Wait for all threads to finish
for thread in threads:
    thread.join()

Exception in thread Thread-20 (process_activity_group):
Traceback (most recent call last):
  File "/Users/mathiasraa/anaconda3/envs/mysql/lib/python3.11/site-packages/mysql/connector/connection_cext.py", line 608, in cmd_query
    self._cmysql.query(
_mysql_connector.MySQLInterfaceError: Truncated incorrect DOUBLE value: '(120101,)'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/mathiasraa/anaconda3/envs/mysql/lib/python3.11/threading.py", line 1038, in _bootstrap_inner
    self.run()
  File "/Users/mathiasraa/anaconda3/envs/mysql/lib/python3.11/threading.py", line 975, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/xd/z_ptq9v136q7kj9lf2f4sblh0000gn/T/ipykernel_51033/3751700698.py", line 14, in process_activity_group
  File "/Users/mathiasraa/anaconda3/envs/mysql/lib/python3.11/site-packages/mysql/connector/cursor_cext.py", line 330, in execute
    result = self._cnx.cmd_query(
        

Connected to:Connected to: 8.1.0
Connected to: 8.1.0
You are connected to the database: ('mysql',)
-----------------------------------------------

Connected to: 8.1.0
 8.1.0
You are connected to the database: ('mysql',)
-----------------------------------------------

You are connected to the database: ('mysql',)
-----------------------------------------------

You are connected to the database: ('mysql',)
-----------------------------------------------



In [6]:
query = """
    ALTER TABLE TrackPointTable ADD COLUMN point POINT;
    UPDATE TrackPointTable SET point = POINT(lon, lat);
    ALTER TABLE TrackPointTable ADD SPATIAL INDEX(point);
"""

cursor.execute(query)


In [13]:
query = """
    SELECT date_time, lat, lon, activity_id
    FROM TrackPointTable
"""

trackpoints = pd.read_sql(query, db_connection)
trackpoints

  trackpoints = pd.read_sql(query, db_connection)


Unnamed: 0,date_time,lat,lon,activity_id
0,2008-01-01 09:42:31,39.9661,116.341,1141041
1,2008-01-01 09:42:34,39.9661,116.341,1141041
2,2008-01-01 09:42:37,39.9661,116.342,1141041
3,2008-01-01 09:42:40,39.9661,116.342,1141041
4,2008-01-01 09:42:43,39.9661,116.342,1141041
...,...,...,...,...
3851552,2011-03-06 11:18:40,39.8921,116.329,1341471
3851553,2011-03-06 11:18:45,39.8921,116.329,1341471
3851554,2011-03-06 11:18:50,39.8921,116.329,1341471
3851555,2011-03-06 11:18:55,39.8921,116.329,1341471


In [28]:
test = set(overlapping_activities["activity1"].unique().tolist() + overlapping_activities["activity2"].unique().tolist())

In [32]:

trackpoints["activity_id"].unique().shape[0]
len([t for t in trackpoints["activity_id"].unique().tolist() if t in test])

6533

In [41]:
close_pairs = set()

# Now, for each pair, assess the distances
for _, row in overlapping_activities.iterrows():
    activity1 = row['activity1']
    activity2 = row['activity2']

    # query2 = f"""
    #     SELECT a.lat as lat1, a.lon as lon1, b.lat as lat2, b.lon as lon2
    #     FROM TrackPointTable a
    #     JOIN TrackPointTable b ON a.activity_id = {activity1} AND b.activity_id = {activity2}
    #     WHERE ABS(TIMESTAMPDIFF(SECOND, a.date_time, b.date_time)) <= 30;
    # """
    
    # trackpoints = pd.read_sql(query2, db_connection)
    # for _, tp_row in trackpoints.iterrows():
    #     distance = haversine((tp_row['lat1'], tp_row['lon1']), (tp_row['lat2'], tp_row['lon2']), unit='m')
    #     if distance <= 50:
    #         close_pairs.add((row['user1'], row['user2']))

print(f"Number of close user pairs: {len(close_pairs)}")

Number of close user pairs: 0


In [36]:
test = overlapping_activities.copy()

temp = pd.DataFrame(columns=["user1", "user2", "activity1", "activity2"])

display(test[test.user1 == "114"])
display(test[test.user2 == "114"])

Unnamed: 0,user1,user2,activity1,activity2
177,114,138,111141,1331381
275,114,128,121141,1961281
276,114,128,121141,1971281
409,114,128,131141,1971281
410,114,128,131141,1981281
649,114,175,151141,101751
650,114,175,151141,111751
651,114,163,151141,15211631
652,114,163,151141,15221631
746,114,175,161141,111751


Unnamed: 0,user1,user2,activity1,activity2
2,10,114,100101,101141
25,75,114,100751,101141
43,91,114,100911,101141
1301,56,114,1160561,151141
1302,56,114,1160561,161141
1393,56,114,1170561,161141


In [6]:
connector.close_connection()

NameError: name 'connector' is not defined