In [433]:
import pandas as pd
import numpy as np
# read csv file into a dataframe
df = pd.read_csv('manual.csv',dtype={'id': str})
# display the first few rows of the dataframe
print(df.head())


  id   latitude  longitude            lastlocatedon  entitytype
0  1  19.001965  73.055221  2023-03-23T22:26:57.00Z       event
1  2  19.076743  72.951533  2023-02-07T17:59:57.00Z   associate
2  3  19.328718  72.783555  2023-03-03T05:32:25.00Z         org
3  4  19.523080  73.200119  2023-01-26T19:32:29.00Z  individual
4  5  22.589555  88.411400  2023-01-27T19:32:29.00Z  individual


In [434]:
# df['TimeStamp'] = pd.to_datetime(df['lastlocatedon'], format="%d/%m/%y %H:%M:%S.%f")
df['TimeStamp'] = pd.to_datetime(df['lastlocatedon'], utc=True)
# sort dataframe by timestamp column
df = df.sort_values(by='TimeStamp')
print(df.tail())
print(len(df))

      id   latitude  longitude            lastlocatedon  entitytype  \
51    52  12.906321  77.835011  2023-04-25T07:54:14.00Z  individual   
63    64  20.100880  86.058362  2023-04-25T18:30:31.00Z   associate   
43    44  16.883660  78.298946  2023-04-26T18:58:15.00Z       event   
45    46  12.578808  77.373729  2023-04-28T18:56:38.00Z       event   
107  108  22.589938  88.411550  2023-05-13T00:13:29.00Z  individual   

                    TimeStamp  
51  2023-04-25 07:54:14+00:00  
63  2023-04-25 18:30:31+00:00  
43  2023-04-26 18:58:15+00:00  
45  2023-04-28 18:56:38+00:00  
107 2023-05-13 00:13:29+00:00  
108


In [435]:
import folium
from folium.plugins import PolyLineTextPath
map_plot = folium.Map(location=[df['latitude'].mean(), df['longitude'].mean()], zoom_start=5)
individual_index = 0
arrow_loc=[]
individual_array = []
entity_color = {'individual': "blue",'associate': "red",'org': "green","event":  "orange"}
for index, row in df.iterrows():
    # folium.Marker(location=[row['latitude'], row['longitude']], popup=row['TimeStamp']).add_to(map_plot)
    if 'individual' in row['entitytype']:
        individual_index += 1
        arrow_loc.append([row['latitude'], row['longitude']])
        individual_array.append([index, row['id']])
        folium.CircleMarker(location=[row['latitude'], row['longitude']], radius=5, color=entity_color[row["entitytype"]], fill_color='red',popup=folium.Popup(individual_index, max_width=100)).add_to(map_plot)
    else:
        folium.CircleMarker(location=[row['latitude'], row['longitude']], radius=4, color=entity_color[row["entitytype"]],popup=row['TimeStamp']).add_to(map_plot)
# define the arrow locations and add it to the map
arrow_line = folium.PolyLine(locations=arrow_loc, color='blue', weight=1, opacity=0.7).add_to(map_plot)
# arrow_head = PolyLineTextPath(arrow_line, text='►', offset=-3, repeat=True, attributes={'fill': 'blue', 'font-weight': 'bold'}).add_to(map_plot)
arrow_head = PolyLineTextPath(arrow_line, text='-> ', offset=-3, repeat=True, attributes={'fill': 'blue', 'font-weight': 'bold'}).add_to(map_plot)

map_plot

In [436]:
ind_array_len = len(individual_array)
for i, element in enumerate(individual_array):
    if i == 0:
        df.loc[element[0], 'pos'] = i+1
        df.loc[element[0], 'nextLoc'] = individual_array[i+1][1]
    elif i == ind_array_len - 1:
        df.loc[element[0], 'pos'] = i+1
    else :
        df.loc[element[0], 'pos'] = i+1
        df.loc[element[0], 'nextLoc'] = individual_array[i+1][1]
df['pos'] = df['pos'].fillna(0).astype(int)
df['nextLoc'] = df['nextLoc'].fillna('')
print(df[df['entitytype'] == 'individual'])

      id   latitude  longitude            lastlocatedon  entitytype  \
82    83  26.183069  85.467850  2023-01-01T19:21:51.00Z  individual   
68    69  26.001861  83.353114  2023-01-04T22:13:57.00Z  individual   
96    97  27.268622  75.980907  2023-01-13T00:13:29.00Z  individual   
80    81  26.455641  85.449732  2023-01-15T12:51:23.00Z  individual   
37    38  17.776954  78.550865  2023-01-18T14:20:29.00Z  individual   
49    50  12.972557  77.655264  2023-01-22T15:58:23.00Z  individual   
29    30  29.015758  77.484772  2023-01-23T14:04:03.00Z  individual   
60    61  20.423618  86.141632  2023-01-24T03:44:42.00Z  individual   
78    79  26.893145  79.940691  2023-01-24T08:39:25.00Z  individual   
3      4  19.523080  73.200119  2023-01-26T19:32:29.00Z  individual   
4      5  22.589555  88.411400  2023-01-27T19:32:29.00Z  individual   
34    35  17.051449  78.186149  2023-01-29T19:55:00.00Z  individual   
31    32  28.276492  77.094026  2023-01-30T06:14:44.00Z  individual   
94    

In [437]:
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.neighbors import DistanceMetric
# Define the distance metric to use for clustering
dist = DistanceMetric.get_metric("haversine")

# Convert the latitude and longitude columns to radians
df["lat_rad"] = df["latitude"].apply(lambda x: x * (3.1415/180))
df["lon_rad"] = df["longitude"].apply(lambda x: x * (3.1415/180))

# Compute the pairwise distances between all data points
X = df[["lat_rad", "lon_rad"]].to_numpy()
distances = dist.pairwise(X)

# Use DBSCAN to cluster the data points based on their geolocation
dbscan = DBSCAN(eps=0.02, min_samples=2, metric="precomputed")
labels = dbscan.fit_predict(distances)

# Add the cluster labels to the DataFrame
df["cluster"] = labels

# Print the resulting clusters
print(df[["latitude", "longitude", "cluster"]])


      latitude  longitude  cluster
82   26.183069  85.467850        0
72   18.176494  73.691996        1
68   26.001861  83.353114        2
66   25.969023  83.460910        2
84   26.414168  85.481511        0
..         ...        ...      ...
51   12.906321  77.835011       11
63   20.100880  86.058362        8
43   16.883660  78.298946        5
45   12.578808  77.373729       11
107  22.589938  88.411550        6

[108 rows x 3 columns]




In [438]:
labels

array([ 0,  1,  2,  2,  0,  3,  4,  5,  1,  6,  6,  1,  7,  0,  1,  5,  3,
        5,  8,  7,  9, 10, 10, 11, 11, 10,  8, 12,  1,  6,  5, 10,  7, 10,
        3,  9, 10,  1,  4,  4,  4,  1,  8,  8, 12,  6,  1,  6,  2, 10,  3,
        6, 10,  1,  2,  6,  3,  0, 11, 10, 12, 11,  6,  5,  3,  6,  6,  1,
        3,  5, 11,  1,  3,  4,  1,  6,  1,  9,  6,  7, 12, 12,  0,  1,  7,
       11,  5,  1,  2,  3,  9, 11,  6, 10, 11,  6,  9,  5,  5,  3,  1, 10,
        5, 11,  8,  5, 11,  6], dtype=int64)

In [439]:
# cluster_color = ['red', 'blue', 'green', 'purple', 'orange', 'darkred', 'lightred', 'beige', 'darkblue', 'darkgreen', 'cadetblue', 'darkpurple','pink', 'lightblue', 'lightgreen', 'gray', 'black', 'lightgray']
cluster_color =  ["red", "green", "blue", "yellow", "orange", "purple", "pink", "black", "coral", "gray","magenta", "cyan", "maroon", "navy", "olive", "teal", "indigo", "skyblue", "turquoise", "gold"]

map_plot = folium.Map(location=[df['latitude'].mean(), df['longitude'].mean()], zoom_start=5)
for index, row in df.iterrows():
    # folium.Marker(location=[row['latitude'], row['longitude']], popup=row['TimeStamp']).add_to(map_plot)
    folium.CircleMarker(location=[row['latitude'], row['longitude']], radius=2, color=cluster_color[row["cluster"]%20], popup=row['TimeStamp']).add_to(map_plot)
    # folium.CircleMarker(location=[row['latitude'], row['longitude']], radius=2, color="skyblue", popup=row['TimeStamp']).add_to(map_plot)

    
map_plot

In [440]:
# round off latitude and longitude values to 2 decimal places (1.11 KM)
df['lat_round'] = df['latitude'].round(2)
df['long_round'] = df['longitude'].round(2)
df

Unnamed: 0,id,latitude,longitude,lastlocatedon,entitytype,TimeStamp,pos,nextLoc,lat_rad,lon_rad,cluster,lat_round,long_round
82,83,26.183069,85.467850,2023-01-01T19:21:51.00Z,individual,2023-01-01 19:21:51+00:00,1,69,0.456967,1.491651,0,26.18,85.47
72,73,18.176494,73.691996,2023-01-04T09:15:24.00Z,associate,2023-01-04 09:15:24+00:00,0,,0.317230,1.286130,1,18.18,73.69
68,69,26.001861,83.353114,2023-01-04T22:13:57.00Z,individual,2023-01-04 22:13:57+00:00,2,97,0.453805,1.454743,2,26.00,83.35
66,67,25.969023,83.460910,2023-01-05T11:08:01.00Z,associate,2023-01-05 11:08:01+00:00,0,,0.453232,1.456625,2,25.97,83.46
84,85,26.414168,85.481511,2023-01-07T15:13:34.00Z,org,2023-01-07 15:13:34+00:00,0,,0.461001,1.491890,0,26.41,85.48
...,...,...,...,...,...,...,...,...,...,...,...,...,...
51,52,12.906321,77.835011,2023-04-25T07:54:14.00Z,individual,2023-04-25 07:54:14+00:00,29,108,0.225251,1.358437,11,12.91,77.84
63,64,20.100880,86.058362,2023-04-25T18:30:31.00Z,associate,2023-04-25 18:30:31+00:00,0,,0.350816,1.501957,8,20.10,86.06
43,44,16.883660,78.298946,2023-04-26T18:58:15.00Z,event,2023-04-26 18:58:15+00:00,0,,0.294667,1.366534,5,16.88,78.30
45,46,12.578808,77.373729,2023-04-28T18:56:38.00Z,event,2023-04-28 18:56:38+00:00,0,,0.219535,1.350387,11,12.58,77.37


In [441]:
#creating a unique place ID
# create a dictionary that maps unique (lat, long) combinations to integer IDs
id_dict = {}
id_count = 0
for row in df.itertuples(index=False):
    lat_long = (row.lat_round, row.long_round)
    if lat_long not in id_dict:
        id_count += 1
        id_dict[lat_long] = id_count
# create a new column 'place_id' by applying the id_dict mapping to each (lat, long) combination
df['place_id'] = df.apply(lambda row: id_dict[(row.lat_round, row.long_round)], axis=1)
df

Unnamed: 0,id,latitude,longitude,lastlocatedon,entitytype,TimeStamp,pos,nextLoc,lat_rad,lon_rad,cluster,lat_round,long_round,place_id
82,83,26.183069,85.467850,2023-01-01T19:21:51.00Z,individual,2023-01-01 19:21:51+00:00,1,69,0.456967,1.491651,0,26.18,85.47,1
72,73,18.176494,73.691996,2023-01-04T09:15:24.00Z,associate,2023-01-04 09:15:24+00:00,0,,0.317230,1.286130,1,18.18,73.69,2
68,69,26.001861,83.353114,2023-01-04T22:13:57.00Z,individual,2023-01-04 22:13:57+00:00,2,97,0.453805,1.454743,2,26.00,83.35,3
66,67,25.969023,83.460910,2023-01-05T11:08:01.00Z,associate,2023-01-05 11:08:01+00:00,0,,0.453232,1.456625,2,25.97,83.46,4
84,85,26.414168,85.481511,2023-01-07T15:13:34.00Z,org,2023-01-07 15:13:34+00:00,0,,0.461001,1.491890,0,26.41,85.48,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51,52,12.906321,77.835011,2023-04-25T07:54:14.00Z,individual,2023-04-25 07:54:14+00:00,29,108,0.225251,1.358437,11,12.91,77.84,98
63,64,20.100880,86.058362,2023-04-25T18:30:31.00Z,associate,2023-04-25 18:30:31+00:00,0,,0.350816,1.501957,8,20.10,86.06,99
43,44,16.883660,78.298946,2023-04-26T18:58:15.00Z,event,2023-04-26 18:58:15+00:00,0,,0.294667,1.366534,5,16.88,78.30,100
45,46,12.578808,77.373729,2023-04-28T18:56:38.00Z,event,2023-04-28 18:56:38+00:00,0,,0.219535,1.350387,11,12.58,77.37,101


In [442]:
individual_Location = df[df['entitytype'] == 'individual']
individual_Location

Unnamed: 0,id,latitude,longitude,lastlocatedon,entitytype,TimeStamp,pos,nextLoc,lat_rad,lon_rad,cluster,lat_round,long_round,place_id
82,83,26.183069,85.46785,2023-01-01T19:21:51.00Z,individual,2023-01-01 19:21:51+00:00,1,69.0,0.456967,1.491651,0,26.18,85.47,1
68,69,26.001861,83.353114,2023-01-04T22:13:57.00Z,individual,2023-01-04 22:13:57+00:00,2,97.0,0.453805,1.454743,2,26.0,83.35,3
96,97,27.268622,75.980907,2023-01-13T00:13:29.00Z,individual,2023-01-13 00:13:29+00:00,3,81.0,0.475913,1.326078,7,27.27,75.98,13
80,81,26.455641,85.449732,2023-01-15T12:51:23.00Z,individual,2023-01-15 12:51:23+00:00,4,38.0,0.461724,1.491335,0,26.46,85.45,14
37,38,17.776954,78.550865,2023-01-18T14:20:29.00Z,individual,2023-01-18 14:20:29+00:00,5,50.0,0.310257,1.370931,5,17.78,78.55,16
49,50,12.972557,77.655264,2023-01-22T15:58:23.00Z,individual,2023-01-22 15:58:23+00:00,6,30.0,0.226407,1.3553,11,12.97,77.66,24
29,30,29.015758,77.484772,2023-01-23T14:04:03.00Z,individual,2023-01-23 14:04:03+00:00,7,61.0,0.506406,1.352324,10,29.02,77.48,26
60,61,20.423618,86.141632,2023-01-24T03:44:42.00Z,individual,2023-01-24 03:44:42+00:00,8,79.0,0.356449,1.503411,8,20.42,86.14,27
78,79,26.893145,79.940691,2023-01-24T08:39:25.00Z,individual,2023-01-24 08:39:25+00:00,9,4.0,0.46936,1.395187,12,26.89,79.94,28
3,4,19.52308,73.200119,2023-01-26T19:32:29.00Z,individual,2023-01-26 19:32:29+00:00,10,5.0,0.340732,1.277545,1,19.52,73.2,29


In [443]:
# current_Location = df[df['entitytype'] == 'individual'].index[-1]
current_Location = individual_Location.index[-1]
current_Location

107

In [444]:
#Create transition matrix for the current location
current_placeID =individual_Location.loc[current_Location,'place_id']
individual_Location_len = len(individual_Location)
individual_Location_len
transition_matrix = []

loop_index = 0
next_loc = False
for index, row in individual_Location.iterrows():
    if loop_index < (individual_Location_len - 2) and (row['place_id'] == current_placeID):
        next_loc = True
        # transition_matrix.append()
    if next_loc == True and (row['place_id'] != current_placeID):
        next_loc = False
        transition_matrix.append(row)
    loop_index += 1
prediction_df = pd.DataFrame(transition_matrix)

In [445]:
prediction_df

Unnamed: 0,id,latitude,longitude,lastlocatedon,entitytype,TimeStamp,pos,nextLoc,lat_rad,lon_rad,cluster,lat_round,long_round,place_id
34,35,17.051449,78.186149,2023-01-29T19:55:00.00Z,individual,2023-01-29 19:55:00+00:00,12,32,0.297595,1.364565,5,17.05,78.19,31
18,19,22.542114,88.700567,2023-03-17T18:07:05.00Z,individual,2023-03-17 18:07:05+00:00,18,7,0.393423,1.548071,6,22.54,88.7,66
44,45,13.254099,77.394496,2023-04-17T10:08:34.00Z,individual,2023-04-17 10:08:34+00:00,23,102,0.231321,1.350749,11,13.25,77.39,92
102,103,17.051449,78.186149,2023-04-18T19:55:00.00Z,individual,2023-04-18 19:55:00+00:00,25,39,0.297595,1.364565,5,17.05,78.19,31


In [446]:
# group the data by lat_round and long_round, and apply the nunique function to count column
# count the latest occurrence of each row by 1
prediction_df['count'] = prediction_df.groupby('place_id').cumcount() + 1
# create a boolean mask of rows that have a previous occurrence in the dataframe
mask = prediction_df.duplicated(subset='place_id', keep='last')
# drop the previous occurrence of each row in the dataframe
prediction_df = prediction_df[~mask]
prediction_df

Unnamed: 0,id,latitude,longitude,lastlocatedon,entitytype,TimeStamp,pos,nextLoc,lat_rad,lon_rad,cluster,lat_round,long_round,place_id,count
18,19,22.542114,88.700567,2023-03-17T18:07:05.00Z,individual,2023-03-17 18:07:05+00:00,18,7,0.393423,1.548071,6,22.54,88.7,66,1
44,45,13.254099,77.394496,2023-04-17T10:08:34.00Z,individual,2023-04-17 10:08:34+00:00,23,102,0.231321,1.350749,11,13.25,77.39,92,1
102,103,17.051449,78.186149,2023-04-18T19:55:00.00Z,individual,2023-04-18 19:55:00+00:00,25,39,0.297595,1.364565,5,17.05,78.19,31,2


In [447]:
# prediction
# create a weighted distribution of events
weighted_events = {'individual': 0.20, 'event': 0.20,'org': 0.20, 'associate': 0.20}
# get the sum of the 'count' column
total_sum = prediction_df['count'].sum()
# divide each value in the 'numbers' column by the sum of the values in the column
# prediction_df['probability'] = prediction_df['count'].div(total_sum)
prediction_df['probability'] = prediction_df['count'].apply(lambda x: (x * weighted_events['individual'])/ total_sum)
prediction_df 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,id,latitude,longitude,lastlocatedon,entitytype,TimeStamp,pos,nextLoc,lat_rad,lon_rad,cluster,lat_round,long_round,place_id,count,probability
18,19,22.542114,88.700567,2023-03-17T18:07:05.00Z,individual,2023-03-17 18:07:05+00:00,18,7,0.393423,1.548071,6,22.54,88.7,66,1,0.05
44,45,13.254099,77.394496,2023-04-17T10:08:34.00Z,individual,2023-04-17 10:08:34+00:00,23,102,0.231321,1.350749,11,13.25,77.39,92,1,0.05
102,103,17.051449,78.186149,2023-04-18T19:55:00.00Z,individual,2023-04-18 19:55:00+00:00,25,39,0.297595,1.364565,5,17.05,78.19,31,2,0.1


In [448]:

# select all rows where places are in the same cluster of the current location
current_cluster =df.loc[current_Location,'cluster']
cluster_df = df[df['cluster'] == current_cluster]
cluster_df

Unnamed: 0,id,latitude,longitude,lastlocatedon,entitytype,TimeStamp,pos,nextLoc,lat_rad,lon_rad,cluster,lat_round,long_round,place_id
17,18,22.78624,88.16081,2023-01-10T11:16:45.00Z,org,2023-01-10 11:16:45+00:00,0,,0.397683,1.538651,6,22.79,88.16,10
14,15,22.614827,88.148117,2023-01-10T18:50:46.00Z,event,2023-01-10 18:50:46+00:00,0,,0.394692,1.538429,6,22.61,88.15,11
4,5,22.589555,88.4114,2023-01-27T19:32:29.00Z,individual,2023-01-27 19:32:29+00:00,11,35.0,0.39425,1.543025,6,22.59,88.41,30
15,16,22.764204,88.629726,2023-02-15T13:37:50.00Z,associate,2023-02-15 13:37:50+00:00,0,,0.397299,1.546835,6,22.76,88.63,46
22,23,22.219838,87.993175,2023-02-19T15:53:35.00Z,org,2023-02-19 15:53:35+00:00,0,,0.387798,1.535725,6,22.22,87.99,48
16,17,22.147154,88.289466,2023-02-26T10:23:29.00Z,associate,2023-02-26 10:23:29+00:00,0,,0.386529,1.540896,6,22.15,88.29,52
12,13,22.492147,88.471564,2023-03-04T05:33:51.00Z,associate,2023-03-04 05:33:51+00:00,0,,0.39255,1.544075,6,22.49,88.47,56
19,20,22.477847,88.586615,2023-03-11T14:14:01.00Z,org,2023-03-11 14:14:01+00:00,0,,0.392301,1.546083,6,22.48,88.59,63
36,37,22.5895,88.411444,2023-03-15T15:45:17.00Z,individual,2023-03-15 15:45:17+00:00,17,19.0,0.39425,1.543025,6,22.59,88.41,30
18,19,22.542114,88.700567,2023-03-17T18:07:05.00Z,individual,2023-03-17 18:07:05+00:00,18,7.0,0.393423,1.548071,6,22.54,88.7,66


In [449]:
current_location_place_id =df.loc[current_Location,'place_id']
# drop rows with value of curent location of 'place_id' in 'place_id' column
cluster_df = cluster_df[cluster_df['place_id'] != current_location_place_id]
cluster_df

Unnamed: 0,id,latitude,longitude,lastlocatedon,entitytype,TimeStamp,pos,nextLoc,lat_rad,lon_rad,cluster,lat_round,long_round,place_id
17,18,22.78624,88.16081,2023-01-10T11:16:45.00Z,org,2023-01-10 11:16:45+00:00,0,,0.397683,1.538651,6,22.79,88.16,10
14,15,22.614827,88.148117,2023-01-10T18:50:46.00Z,event,2023-01-10 18:50:46+00:00,0,,0.394692,1.538429,6,22.61,88.15,11
15,16,22.764204,88.629726,2023-02-15T13:37:50.00Z,associate,2023-02-15 13:37:50+00:00,0,,0.397299,1.546835,6,22.76,88.63,46
22,23,22.219838,87.993175,2023-02-19T15:53:35.00Z,org,2023-02-19 15:53:35+00:00,0,,0.387798,1.535725,6,22.22,87.99,48
16,17,22.147154,88.289466,2023-02-26T10:23:29.00Z,associate,2023-02-26 10:23:29+00:00,0,,0.386529,1.540896,6,22.15,88.29,52
12,13,22.492147,88.471564,2023-03-04T05:33:51.00Z,associate,2023-03-04 05:33:51+00:00,0,,0.39255,1.544075,6,22.49,88.47,56
19,20,22.477847,88.586615,2023-03-11T14:14:01.00Z,org,2023-03-11 14:14:01+00:00,0,,0.392301,1.546083,6,22.48,88.59,63
18,19,22.542114,88.700567,2023-03-17T18:07:05.00Z,individual,2023-03-17 18:07:05+00:00,18,7.0,0.393423,1.548071,6,22.54,88.7,66
20,21,22.938976,88.479571,2023-03-26T21:07:29.00Z,individual,2023-03-26 21:07:29+00:00,21,22.0,0.400349,1.544214,6,22.94,88.48,74
11,12,22.441665,88.698856,2023-04-15T07:08:14.00Z,event,2023-04-15 07:08:14+00:00,0,,0.391669,1.548041,6,22.44,88.7,90


In [450]:
cluster_df_individual = cluster_df[cluster_df['entitytype'] == 'individual']
cluster_df_individual['count'] = cluster_df_individual.groupby('place_id').cumcount() + 1
# create a boolean mask of rows that have a previous occurrence in the dataframe
mask = cluster_df_individual.duplicated(subset='place_id', keep='last')
# drop the previous occurrence of each row in the dataframe
cluster_df_individual = cluster_df_individual[~mask]
total_sum = cluster_df_individual['count'].sum()
# divide each value in the 'numbers' column by the sum of the values in the column
# prediction_df['probability'] = prediction_df['count'].div(total_sum)
cluster_df_individual['probability'] = cluster_df_individual['count'].apply(lambda x: (x * weighted_events['individual'])/ total_sum)
cluster_df_individual

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,id,latitude,longitude,lastlocatedon,entitytype,TimeStamp,pos,nextLoc,lat_rad,lon_rad,cluster,lat_round,long_round,place_id,count,probability
18,19,22.542114,88.700567,2023-03-17T18:07:05.00Z,individual,2023-03-17 18:07:05+00:00,18,7,0.393423,1.548071,6,22.54,88.7,66,1,0.1
20,21,22.938976,88.479571,2023-03-26T21:07:29.00Z,individual,2023-03-26 21:07:29+00:00,21,22,0.400349,1.544214,6,22.94,88.48,74,1,0.1


In [451]:
cluster_df_event = cluster_df[cluster_df['entitytype'] == 'event']
cluster_df_event['count'] = cluster_df_event.groupby('place_id').cumcount() + 1
# create a boolean mask of rows that have a previous occurrence in the dataframe
mask = cluster_df_event.duplicated(subset='place_id', keep='last')
# drop the previous occurrence of each row in the dataframe
cluster_df_event = cluster_df_event[~mask]
total_sum = cluster_df_event['count'].sum()
# divide each value in the 'numbers' column by the sum of the values in the column
# prediction_df['probability'] = prediction_df['count'].div(total_sum)
cluster_df_event['probability'] = cluster_df_event['count'].apply(lambda x: (x * weighted_events['event'])/ total_sum)
cluster_df_event

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,id,latitude,longitude,lastlocatedon,entitytype,TimeStamp,pos,nextLoc,lat_rad,lon_rad,cluster,lat_round,long_round,place_id,count,probability
14,15,22.614827,88.148117,2023-01-10T18:50:46.00Z,event,2023-01-10 18:50:46+00:00,0,,0.394692,1.538429,6,22.61,88.15,11,1,0.1
11,12,22.441665,88.698856,2023-04-15T07:08:14.00Z,event,2023-04-15 07:08:14+00:00,0,,0.391669,1.548041,6,22.44,88.7,90,1,0.1


In [452]:
cluster_df_associate = cluster_df[cluster_df['entitytype'] == 'associate']
cluster_df_associate['count'] = cluster_df_associate.groupby('place_id').cumcount() + 1
# create a boolean mask of rows that have a previous occurrence in the dataframe
mask = cluster_df_associate.duplicated(subset='place_id', keep='last')
# drop the previous occurrence of each row in the dataframe
cluster_df_associate = cluster_df_associate[~mask]
total_sum = cluster_df_associate['count'].sum()
# divide each value in the 'numbers' column by the sum of the values in the column
# prediction_df['probability'] = prediction_df['count'].div(total_sum)
cluster_df_associate['probability'] = cluster_df_associate['count'].apply(lambda x: (x * weighted_events['associate'])/ total_sum)
cluster_df_associate

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,id,latitude,longitude,lastlocatedon,entitytype,TimeStamp,pos,nextLoc,lat_rad,lon_rad,cluster,lat_round,long_round,place_id,count,probability
15,16,22.764204,88.629726,2023-02-15T13:37:50.00Z,associate,2023-02-15 13:37:50+00:00,0,,0.397299,1.546835,6,22.76,88.63,46,1,0.066667
16,17,22.147154,88.289466,2023-02-26T10:23:29.00Z,associate,2023-02-26 10:23:29+00:00,0,,0.386529,1.540896,6,22.15,88.29,52,1,0.066667
12,13,22.492147,88.471564,2023-03-04T05:33:51.00Z,associate,2023-03-04 05:33:51+00:00,0,,0.39255,1.544075,6,22.49,88.47,56,1,0.066667


In [453]:
cluster_df_org = cluster_df[cluster_df['entitytype'] == 'org']
cluster_df_org['count'] = cluster_df_org.groupby('place_id').cumcount() + 1
# create a boolean mask of rows that have a previous occurrence in the dataframe
mask = cluster_df_org.duplicated(subset='place_id', keep='last')
# drop the previous occurrence of each row in the dataframe
cluster_df_org = cluster_df_org[~mask]
total_sum = cluster_df_org['count'].sum()
# divide each value in the 'numbers' column by the sum of the values in the column
# prediction_df['probability'] = prediction_df['count'].div(total_sum)
cluster_df_org['probability'] = cluster_df_org['count'].apply(lambda x: (x * weighted_events['associate'])/ total_sum)
cluster_df_org

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,id,latitude,longitude,lastlocatedon,entitytype,TimeStamp,pos,nextLoc,lat_rad,lon_rad,cluster,lat_round,long_round,place_id,count,probability
17,18,22.78624,88.16081,2023-01-10T11:16:45.00Z,org,2023-01-10 11:16:45+00:00,0,,0.397683,1.538651,6,22.79,88.16,10,1,0.066667
22,23,22.219838,87.993175,2023-02-19T15:53:35.00Z,org,2023-02-19 15:53:35+00:00,0,,0.387798,1.535725,6,22.22,87.99,48,1,0.066667
19,20,22.477847,88.586615,2023-03-11T14:14:01.00Z,org,2023-03-11 14:14:01+00:00,0,,0.392301,1.546083,6,22.48,88.59,63,1,0.066667


In [454]:
df['probability'] = 0
for index, row in prediction_df.iterrows():
    df.loc[index,'probability'] += row['probability']
for index, row in cluster_df_individual.iterrows():
    df.loc[index,'probability'] += row['probability']
for index, row in cluster_df_event.iterrows():
    df.loc[index,'probability'] += row['probability']
for index, row in cluster_df_associate.iterrows():
    df.loc[index,'probability'] += row['probability']
for index, row in cluster_df_org.iterrows():
    df.loc[index,'probability'] += row['probability']
# df[(df['entitytype']=='individual') & (df['probability'] > 0)]
df[(df['probability'] > 0)]

Unnamed: 0,id,latitude,longitude,lastlocatedon,entitytype,TimeStamp,pos,nextLoc,lat_rad,lon_rad,cluster,lat_round,long_round,place_id,probability
17,18,22.78624,88.16081,2023-01-10T11:16:45.00Z,org,2023-01-10 11:16:45+00:00,0,,0.397683,1.538651,6,22.79,88.16,10,0.066667
14,15,22.614827,88.148117,2023-01-10T18:50:46.00Z,event,2023-01-10 18:50:46+00:00,0,,0.394692,1.538429,6,22.61,88.15,11,0.1
15,16,22.764204,88.629726,2023-02-15T13:37:50.00Z,associate,2023-02-15 13:37:50+00:00,0,,0.397299,1.546835,6,22.76,88.63,46,0.066667
22,23,22.219838,87.993175,2023-02-19T15:53:35.00Z,org,2023-02-19 15:53:35+00:00,0,,0.387798,1.535725,6,22.22,87.99,48,0.066667
16,17,22.147154,88.289466,2023-02-26T10:23:29.00Z,associate,2023-02-26 10:23:29+00:00,0,,0.386529,1.540896,6,22.15,88.29,52,0.066667
12,13,22.492147,88.471564,2023-03-04T05:33:51.00Z,associate,2023-03-04 05:33:51+00:00,0,,0.39255,1.544075,6,22.49,88.47,56,0.066667
19,20,22.477847,88.586615,2023-03-11T14:14:01.00Z,org,2023-03-11 14:14:01+00:00,0,,0.392301,1.546083,6,22.48,88.59,63,0.066667
18,19,22.542114,88.700567,2023-03-17T18:07:05.00Z,individual,2023-03-17 18:07:05+00:00,18,7.0,0.393423,1.548071,6,22.54,88.7,66,0.15
20,21,22.938976,88.479571,2023-03-26T21:07:29.00Z,individual,2023-03-26 21:07:29+00:00,21,22.0,0.400349,1.544214,6,22.94,88.48,74,0.1
11,12,22.441665,88.698856,2023-04-15T07:08:14.00Z,event,2023-04-15 07:08:14+00:00,0,,0.391669,1.548041,6,22.44,88.7,90,0.1


In [478]:
import folium
from folium.plugins import PolyLineTextPath
# from folium.plugins import PolyLineDecorator

map_plot = folium.Map(location=[df['latitude'].mean(), df['longitude'].mean()], zoom_start=5)
individual_index = 0
arrow_loc=[]
destinations_prob = []
entity_color = {'individual': "blue",'associate': "red",'org': "green","event":  "orange"}
for index, row in df.iterrows():
    # folium.Marker(location=[row['latitude'], row['longitude']], popup=row['TimeStamp']).add_to(map_plot)
    if 'individual' in row['entitytype']:
        individual_index += 1
        arrow_loc.append([row['latitude'], row['longitude']])
        folium.CircleMarker(location=[row['latitude'], row['longitude']], radius=5, color=entity_color[row["entitytype"]], fill_color='red',popup=folium.Popup(individual_index, max_width=100)).add_to(map_plot)
    else:
        folium.CircleMarker(location=[row['latitude'], row['longitude']], radius=4, color=entity_color[row["entitytype"]],popup=row['TimeStamp']).add_to(map_plot)
    if row['probability'] > 0:
        prob = row['probability']*100
        # Define list of destination points
        destinations_prob.append([row['latitude'], row['longitude'], row['probability']])
        icon_html = f'<div style="font-weight:bold;font-size:16px;color:red">{prob:.1f}%</div>'
        marker_icon = folium.features.DivIcon(html=icon_html)
        # Add a marker to the map with the custom icon
        marker_loc = [row['latitude'], row['longitude']]  # Marker location
        marker = folium.Marker(location=marker_loc, icon=marker_icon).add_to(map_plot)

# define the arrow locations and add it to the map
arrow_line = folium.PolyLine(locations=arrow_loc, color='blue', weight=1, opacity=0.7).add_to(map_plot)
# arrow_head = PolyLineTextPath(arrow_line, text='►', offset=-3, repeat=True, attributes={'fill': 'blue', 'font-weight': 'bold'}).add_to(map_plot)
arrow_head = PolyLineTextPath(arrow_line, text=' ► ', offset=3.5, repeat=True, attributes={'fill': 'blue', 'font-weight': 'bold'}).add_to(map_plot)
# arrow_head = PolyLineDecorator(arrow_line,offset='25%',end_offset='25%', repeat=10, color='blue').add_to(map_plot)
start_coords = [df.loc[current_Location,'latitude'],df.loc[current_Location,'longitude']]
for end_coord in destinations_prob:
    # Create marker for end point with text
    end_marker = folium.Marker(location=[end_coord[0], end_coord[1]], popup=end_coord[2])
    # Add marker for end point to map
    end_marker.add_to(map_plot)
    # Create polyline from single point to end point
    polyline = folium.PolyLine(locations=[start_coords, [end_coord[0], end_coord[1]]],color='red').add_to(map_plot)
map_plot

In [489]:
start_coords = [df.loc[current_Location,'latitude'],df.loc[current_Location,'longitude']]
map_plot = folium.Map(location=start_coords, zoom_start=5)
individual_index = 0
arrow_loc=[]
destinations_prob = []
entity_color = {'individual': "blue",'associate': "red",'org': "green","event":  "orange"}
for index, row in df.iterrows():
    
    # folium.CircleMarker(location=[row['latitude'], row['longitude']], radius=4, color=entity_color[row["entitytype"]]).add_to(map_plot)
    if row['probability'] > 0:
        prob = row['probability']*100
        # Define list of destination points
        # destinations_prob.append([row['latitude'], row['longitude'], row['probability']])
        folium.CircleMarker(location=[row['latitude'], row['longitude']], radius=4, color=entity_color[row["entitytype"]]).add_to(map_plot)
        icon_html = f'<div style="font-weight:bold;font-size:16px;color:red">{prob:.1f}%</div>'
        marker_icon = folium.features.DivIcon(html=icon_html)
        # Add a marker to the map with the custom icon
        marker_loc = [row['latitude'], row['longitude']]  # Marker location
        marker = folium.Marker(location=marker_loc, icon=marker_icon,popup=f'{prob:.1f}%').add_to(map_plot)
        polyline = folium.PolyLine(locations=[start_coords, marker_loc],color='red').add_to(map_plot)
        # arrow_head = PolyLineTextPath([start_coords, marker_loc], text=' ► ', offset=3.5, repeat=True, attributes={'fill': 'red', 'font-weight': 'bold'}).add_to(map_plot)

# define the arrow locations and add it to the map
# arrow_line = folium.PolyLine(locations=arrow_loc, color='blue', weight=1, opacity=0.7).add_to(map_plot)
# arrow_head = PolyLineTextPath(arrow_line, text='►', offset=-3, repeat=True, attributes={'fill': 'blue', 'font-weight': 'bold'}).add_to(map_plot)
# arrow_head = PolyLineTextPath(arrow_line, text=' ► ', offset=3.5, repeat=True, attributes={'fill': 'blue', 'font-weight': 'bold'}).add_to(map_plot)
# arrow_head = PolyLineDecorator(arrow_line,offset='25%',end_offset='25%', repeat=10, color='blue').add_to(map_plot)

# for end_coord in destinations_prob:
#     # Create marker for end point with text
#     end_marker = folium.Marker(location=[end_coord[0], end_coord[1]], popup=end_coord[2])
#     # Add marker for end point to map
#     end_marker.add_to(map_plot)
#     # Create polyline from single point to end point
#     polyline = folium.PolyLine(locations=[start_coords, [end_coord[0], end_coord[1]]],color='red').add_to(map_plot)
map_plot