In [166]:
import os
import pickle

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans

In [2]:
# Location of the transactions CSV. The default is the path this notebook was
# originally written against; set the DANA_TRANSACTIONS_CSV environment
# variable to load the file from elsewhere without editing the notebook.
DATA_PATH = os.environ.get(
    "DANA_TRANSACTIONS_CSV",
    r"C:\Users\223099055.HCAD\Downloads\DANA\dana_transactions.csv",
)
df = pd.read_csv(DATA_PATH)

In [129]:
def generate_latitude(row):
    """Return the latitude stored in ``row['geo_location']``.

    The value is split on commas and the first piece is converted to float.
    """
    latitude_text = row['geo_location'].split(',')[0]
    return float(latitude_text)

def generate_longitude(row):
    """Return the longitude stored in ``row['geo_location']``.

    The value is split on commas and the second piece is converted to float.
    """
    pieces = row['geo_location'].split(',')
    longitude = float(pieces[1])
    return longitude

# Add numeric coordinate columns by running each parser over every row of df.
for column_name, row_parser in (
    ('latitude', generate_latitude),
    ('longitude', generate_longitude),
):
    df[column_name] = df.apply(row_parser, axis=1)

In [204]:
def get_home_work_loc(user_id, df_u, eps_km=10, min_samples=2):
    """Estimate one user's home and work coordinates from their transactions.

    The points in ``df_u`` are clustered with DBSCAN using the haversine
    metric (coordinates are converted to radians first). The cluster with the
    most points is reported as "home" and the second largest as "work"; when
    only one cluster exists it is used for both.

    Points that DBSCAN labels as noise (-1) are not treated as a cluster, so
    scattered outliers can never be averaged into a home or work location.

    Side effect: a ``'cluster'`` column holding the DBSCAN label of every row
    is written into ``df_u`` in place.

    Parameters
    ----------
    user_id
        Identifier of the user. Not used by the computation; kept for the
        callers that pass it.
    df_u : pandas.DataFrame
        Rows of a single user with numeric ``'latitude'`` and ``'longitude'``
        columns (degrees are assumed, since the values are passed through
        ``np.radians``).
    eps_km : float, default 10
        Neighbourhood radius in kilometres; divided by the Earth radius
        (6371 km) to obtain the angular ``eps`` the haversine metric expects.
    min_samples : int, default 2
        Forwarded to ``DBSCAN(min_samples=...)``.

    Returns
    -------
    tuple or None
        ``(home_latitude, home_longitude, work_latitude, work_longitude)``,
        each the mean coordinate of the corresponding cluster, or ``None``
        when ``df_u`` has fewer than two rows or every point is noise.
    """
    if len(df_u) < 2:
        return None

    earth_radius_km = 6371
    dbscan = DBSCAN(
        eps=eps_km / earth_radius_km,
        min_samples=min_samples,
        algorithm='ball_tree',
        metric='haversine',
    )
    coords = df_u[['latitude', 'longitude']].to_numpy()
    dbscan.fit(np.radians(coords))
    # Kept as an in-place column: the plotting cell colours points by it.
    df_u['cluster'] = dbscan.labels_

    # Label -1 marks noise points, which do not form a location of their own.
    clustered = df_u[df_u['cluster'] != -1]
    if clustered.empty:
        return None

    by_cluster = clustered.groupby('cluster')
    # A stable sort keeps tie-breaking between equally sized clusters deterministic.
    ranked_clusters = (
        by_cluster['cluster']
        .count()
        .sort_values(ascending=False, kind='mergesort')
        .index
    )
    home_cluster = ranked_clusters[0]
    work_cluster = ranked_clusters[1] if len(ranked_clusters) > 1 else home_cluster

    centroids = by_cluster[['latitude', 'longitude']].mean()
    home = centroids.loc[home_cluster]
    work = centroids.loc[work_cluster]
    return home['latitude'], home['longitude'], work['latitude'], work['longitude']

In [205]:
# Collect one record of inferred home/work coordinates per user.
user_locations = []

# groupby splits the frame in a single pass instead of re-filtering the whole
# frame for every user; sort=False keeps users in order of first appearance.
for user_id, user_rows in df.groupby('user_id', sort=False):
    # Work on a copy: get_home_work_loc writes a 'cluster' column into the
    # frame it receives. The name df_u is kept because the plotting cell
    # further down reads the frame left over from the last iteration.
    df_u = user_rows.copy()
    loc = get_home_work_loc(user_id, df_u)
    if loc is None:
        # No location could be inferred for this user.
        continue
    user_locations.append({
        'user_id': user_id,
        'home_latitude': loc[0],
        'home_longitude': loc[1],
        'work_latitude': loc[2],
        'work_longitude': loc[3],
    })

In [206]:
# Persist the per-user location records with the newest pickle protocol.
with open('home_and_work_location.pickle', mode='wb') as pickle_file:
    pickle.dump(
        user_locations,
        pickle_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

# Plot

In [198]:
# Draw the transactions of df_u on a map, coloured by DBSCAN cluster label.
# df_u is the frame left over from the last iteration of the per-user loop;
# its 'cluster' column is written by get_home_work_loc.
# The coordinate columns created in this notebook are 'latitude'/'longitude'.
lat = df_u.iloc[0]['latitude']
lng = df_u.iloc[0]['longitude']
# NOTE: `map` shadows the builtin of the same name; the name is kept because
# the following cell displays the map through it.
map = folium.Map(location=[lat, lng], zoom_start=10)

colors = {
    -1: "red",    # DBSCAN noise
    0: "green",
    1: "blue",
}
# Used for any cluster label without an entry in `colors` (labels >= 2),
# which would otherwise raise KeyError.
FALLBACK_COLOR = "gray"

for point_lat, point_lng, cluster_label in zip(
    df_u['latitude'], df_u['longitude'], df_u['cluster']
):
    marker_color = colors.get(cluster_label, FALLBACK_COLOR)
    folium.CircleMarker(
        location=[point_lat, point_lng],
        fill=True,
        fill_color=marker_color,
        color=marker_color,
    ).add_to(map)

In [199]:
# Render the folium map built in the previous cell as this cell's output.
map