In [None]:
from itertools import combinations

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from src.visualization import plot_scatter_persons, create_heatmap, plot_daywise_activity_with_mean, plot_activity_by_part_of_day
from src.data_analysis import *

## Задание 1
- Посещали пользователи одинаковые места? Знают ли они друг друга?
- Ввести метрику социальной активности - она должна учитывать количество мест, которое посетил пользователь, как далеко они друг от друга и как долго человек оставался там - метрика должна позволять сравнивать пользователей и находить наиболее активных

### задание 1.1. Посещали пользователи одинаковые места? Знают ли они друг друга?

In [None]:
# All distinct person ids present in the all_persons table.
# NOTE(review): `conn` is not created in any visible cell — presumably it arrives
# through the wildcard import from src.data_analysis; confirm.
unique_person_ids = pd.read_sql_query("SELECT DISTINCT person_id FROM all_persons", conn)

# One entry per user, keyed "person_<id>_df_updated". Each value is whatever
# get_data_from_db_or_csv returns for that id and its "person.<id>.csv" file name.
dataframes_dict = {
    f"person_{person_id}_df_updated": get_data_from_db_or_csv(
        person_id, conn, f"person.{person_id}.csv"
    )
    for person_id in unique_person_ids['person_id']
}

# Show the names of the entries that were created
list(dataframes_dict)

In [None]:
# Run every per-user frame through process_dataframes (precision=3), then
# compare the processed frames' locations across users.
processed_dataframes = process_dataframes(
    dataframes_dict,
    precision=3,
)
common_locs = compare_locations(processed_dataframes)

In [None]:
# Location-and-time overlap for every unordered pair of users.
# Each result is stored under the two source keys joined by "_", e.g.
# "person_1_df_updated_person_2_df_updated".
overlap_results = {
    f"{key1}_{key2}": location_and_time_overlap_vectorized(
        dataframes_dict[key1], dataframes_dict[key2]
    )
    for key1, key2 in combinations(dataframes_dict.keys(), 2)
}

# List which pairs were compared
overlap_results.keys()

### Heat map of all our persons

In [None]:
# The loading cell keeps the per-user frames only inside dataframes_dict, so bind
# the standalone names here; otherwise they are undefined on a fresh kernel.
person_1_df_updated = dataframes_dict['person_1_df_updated']
person_2_df_updated = dataframes_dict['person_2_df_updated']
person_3_df_updated = dataframes_dict['person_3_df_updated']

# Plot scatter points for all three persons
plot_scatter_persons(person_1_df_updated, person_2_df_updated, person_3_df_updated)

# Create a heatmap for the same three persons
heatmap_map = create_heatmap(person_1_df_updated, person_2_df_updated, person_3_df_updated)

# Display the heatmap as the cell output.
# Optional export to an HTML file: heatmap_map.save('heatmap.html')
heatmap_map

In [None]:
# Persons 2 and 3 only. The frames are looked up in dataframes_dict so the cell
# does not depend on variables left over from an earlier kernel session.
person_2_df_updated = dataframes_dict['person_2_df_updated']
person_3_df_updated = dataframes_dict['person_3_df_updated']

# Plot scatter points for the two persons
plot_scatter_persons(person_2_df_updated, person_3_df_updated)

# Create a heatmap for the two persons
heatmap_map = create_heatmap(person_2_df_updated, person_3_df_updated)

# Display the heatmap as the cell output.
# Optional export to an HTML file: heatmap_map.save('heatmap.html')
heatmap_map


In [None]:
# Persons 1 and 3 only. The frames are looked up in dataframes_dict so the cell
# does not depend on variables left over from an earlier kernel session.
person_1_df_updated = dataframes_dict['person_1_df_updated']
person_3_df_updated = dataframes_dict['person_3_df_updated']

# Plot scatter points for the two persons
plot_scatter_persons(person_1_df_updated, person_3_df_updated)

# Create a heatmap for the two persons
heatmap_map = create_heatmap(person_1_df_updated, person_3_df_updated)

# Display the heatmap as the cell output.
# Optional export to an HTML file: heatmap_map.save('heatmap.html')
heatmap_map

In [None]:
# Persons 1 and 2 only. The frames are looked up in dataframes_dict so the cell
# does not depend on variables left over from an earlier kernel session.
person_1_df_updated = dataframes_dict['person_1_df_updated']
person_2_df_updated = dataframes_dict['person_2_df_updated']

# Plot scatter points for the two persons
plot_scatter_persons(person_1_df_updated, person_2_df_updated)

# Create a heatmap for the two persons
heatmap_map = create_heatmap(person_1_df_updated, person_2_df_updated)

# Display the heatmap as the cell output.
# Optional export to an HTML file: heatmap_map.save('heatmap.html')
heatmap_map

### задание 1.2. Ввести метрику социальной активности - она должна учитывать количество мест, которое посетил пользователь, как далеко они друг от друга и как долго человек оставался там - метрика должна позволять сравнивать пользователей и находить наиболее активных

To define a social activity metric, I'll consider the following factors:

### Number of Places Visited (NPV):
Represents the total number of unique places the person has visited.
### Average Distance Between Places (ADBP):
Represents the average distance between consecutive places visited by the user. This gives an indication of how far they travel between destinations.
### Average Duration at Places (ADAP):
Represents the average time spent at each location.
### The Social Activity Metric (SAM) can be formulated as:

$$ SAM = NPV + ADBP + ADAP $$



In [None]:
# Compute the first SAM variant for all users and rank them, highest SAM first,
# with a fresh 0..n-1 index.
social_activity_metric_v1 = (
    calculate_social_activity_metric_v1(dataframes_dict)
    .sort_values(by='SAM', ascending=False)
    .reset_index(drop=True)
)
social_activity_metric_v1

Person 2 appears to be the most socially active, with the highest SAM value of 450.39.
Person 1 follows closely behind with a SAM value of 445.36.
Person 3 has the lowest SAM value of 314.79.

# SAM
$ SAM = NPV \times f(ADBP, ADAP) $

## Where:
$ f(ADBP, ADAP) $ is a function that gives the average weight of a place using the Average Distance Between Places (ADBP) and the Average Duration at Places (ADAP).

## Definition of f:
One possible definition of f is the harmonic mean of ADBP and ADAP:
$$ f(ADBP, ADAP) = \frac{2 \times ADBP \times ADAP}{ADBP + ADAP} $$

The harmonic mean gives a balanced weight to both ADBP and ADAP. If one of them is very small while the other is large, the harmonic mean will be closer to the smaller value. This ensures that both the distance between places and the duration at places have an influential role in determining the weight of a place.


In [None]:
# Compute the second SAM variant for all users and rank them, highest SAM first,
# with a fresh 0..n-1 index.
social_activity_metric_v2 = (
    calculate_social_activity_metric_v2(dataframes_dict)
    .sort_values(by='SAM', ascending=False)
    .reset_index(drop=True)
)
social_activity_metric_v2

In [None]:
# Temporal features (per the original note: day of the week and hour of the day)
# for each of the three users, taken from the raw per-user frames.
person_1_temporal, person_2_temporal, person_3_temporal = map(
    extract_temporal_features,
    (
        dataframes_dict['person_1_df_updated'],
        dataframes_dict['person_2_df_updated'],
        dataframes_dict['person_3_df_updated'],
    ),
)

## Plotting
### By day of the week

In [None]:
# Call the day-wise activity plot once per person, using the temporal features
# extracted in the previous cell; the second argument is the label passed for that person.
# The calls are kept as separate statements so the last one stays the cell's output expression.
plot_daywise_activity_with_mean(person_1_temporal, 'Person 1')
plot_daywise_activity_with_mean(person_2_temporal, 'Person 2')
plot_daywise_activity_with_mean(person_3_temporal, 'Person 3')

### By time of the day

In [None]:
# Part-of-day features for each of the three users, taken from the raw per-user frames.
person_1_part_of_day, person_2_part_of_day, person_3_part_of_day = map(
    extract_part_of_day_features,
    (
        dataframes_dict['person_1_df_updated'],
        dataframes_dict['person_2_df_updated'],
        dataframes_dict['person_3_df_updated'],
    ),
)

# One part-of-day activity plot per person; the second argument is the label passed for that person.
plot_activity_by_part_of_day(person_1_part_of_day, 'Person 1')
plot_activity_by_part_of_day(person_2_part_of_day, 'Person 2')
plot_activity_by_part_of_day(person_3_part_of_day, 'Person 3')

## Another option: calculating SAM with part-of-day weights

In [None]:
# Parts of the day; each tuple presumably reads (start_hour, end_hour, weight).
# NOTE(review): this dict is not passed to calculate_weighted_sam below, and a
# function imported from another module does not read notebook globals, so the
# helper presumably carries its own copy of these values — confirm they match.
# NOTE(review): "Night" has weight 1 here but 0.5 in the WEIGHTS dict of the next cell.
PARTS_OF_DAY = {
    "Early Morning": (0, 6, 0.5),
    "Morning": (6, 12, 1),
    "Afternoon": (12, 17, 1.5),
    "Evening": (17, 21, 2),
    "Night": (21, 24, 1),
}

# Result of calculate_weighted_sam for every processed frame, keyed like processed_dataframes
weighted_sam_dict = {
    person: calculate_weighted_sam(df)
    for person, df in processed_dataframes.items()
}

weighted_sam_dict

'person_2 >> person_3 >> person_1'

In [None]:
# Hour ranges for each part of the day.
# NOTE(review): PARTS_OF_DAY is not referenced in this cell — only WEIGHTS is passed on.
PARTS_OF_DAY = {
    'early_morning': (0, 6),
    'morning': (6, 12),
    'afternoon': (12, 17),
    'evening': (17, 21),
    'night': (21, 24),
}

# Weight assigned to each part of the day
WEIGHTS = {
    'early_morning': 0.5,
    'morning': 1,
    'afternoon': 1.5,
    'evening': 2,
    'night': 0.5,
}

# Result of the weighted metric for every processed frame, keyed like processed_dataframes
modified_sam_values = {
    person: calculate_social_activity_metric_weighted(df, WEIGHTS)
    for person, df in processed_dataframes.items()
}

# Transpose so that each person becomes a row
modified_sam_df = pd.DataFrame(modified_sam_values).transpose()

modified_sam_df

In [None]:
# Harmonic-mean variant of the modified SAM, computed over all processed frames
# with the part-of-day WEIGHTS defined in the previous cell.
modified_sam_harmonic_df = calculate_social_activity_metric_harmonic_v2(
    processed_dataframes,
    WEIGHTS,
)
modified_sam_harmonic_df

## Check if clients know each other

In [None]:
# Overlap count for every pair (i, j) with i < j, where persons are numbered 1..N
# and their frames are stored under 'person_<n>_df_updated'.
n_persons = len(processed_dataframes)
overlap_counts_v4 = {
    f'Person_{i}_Person_{j}': location_and_time_overlap_count_rtree(
        processed_dataframes[f'person_{i}_df_updated'],
        processed_dataframes[f'person_{j}_df_updated'],
    )
    for i in range(1, n_persons + 1)
    for j in range(i + 1, n_persons + 1)
}

overlap_counts_v4

Persons 2 and 3 definitely know each other: they have 58 overlaps within 1 meter of each other, each lasting at least 1 minute =>
that was not a coincidence of any sort.

If users are related, that could be useful for the banking industry, for example when building
models such as PD/LGD. Users with high credit risk would probably communicate with those who also have high risk, and
vice versa.

## Add external data from OpenStreetMap

In [None]:
# Path to the saved POIs file (relative to the notebook's working directory)
pois_file_path = 'Data/Processed/amsterdam_pois.geojson'

# Read the GeoJSON file into a GeoDataFrame
pois_gdf = gpd.read_file(pois_file_path)

# Preview the first rows; a bare last expression gets the notebook's rich table
# display instead of the plain-text dump produced by print()
pois_gdf.head()


In [None]:
# Country outlines from the bundled naturalearth_lowres dataset.
# NOTE(review): geopandas.datasets was deprecated in GeoPandas 0.14 and removed in 1.0;
# on newer versions this needs a locally downloaded Natural Earth file — confirm the pinned version.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# Keep only the Netherlands outline
netherlands = world[world['name'] == "Netherlands"]

# Draw the outline and overlay the POIs loaded from the GeoJSON file (pois_gdf)
fig, ax = plt.subplots(figsize=(10, 10))
netherlands.plot(ax=ax, color='lightgrey', edgecolor='black')
pois_gdf.plot(ax=ax, markersize=5, color='red')
ax.set_title("POIs in Amsterdam")
plt.show()

In [None]:
# Extract 'amenity' from the 'tags' column into its own 'amenity' column.
# NOTE(review): assumes every 'tags' value is dict-like (supports .get) — confirm
# against what gpd.read_file produces for this GeoJSON.
pois_gdf['amenity'] = pois_gdf['tags'].apply(lambda x: x.get('amenity', None))

# Build a GeoDataFrame with point geometry (longitude, latitude) for every person.
# All three are created here because the enhanced-SAM cells below use
# person_1_gdf, person_2_gdf and person_3_gdf.
person_1_gdf, person_2_gdf, person_3_gdf = (
    gpd.GeoDataFrame(
        processed_dataframes[key],
        geometry=gpd.points_from_xy(
            processed_dataframes[key].longitude,
            processed_dataframes[key].latitude,
        ),
    )
    for key in ('person_1_df_updated', 'person_2_df_updated', 'person_3_df_updated')
)

In [None]:
# Enhanced SAM for each of the three persons against the POI layer,
# with distance_threshold=1.
enhanced_sam_person_1, enhanced_sam_person_2, enhanced_sam_person_3 = (
    calculate_enhanced_sam(person_gdf, pois_gdf, distance_threshold=1)
    for person_gdf in (person_1_gdf, person_2_gdf, person_3_gdf)
)
print(enhanced_sam_person_1, enhanced_sam_person_2, enhanced_sam_person_3)

In [None]:
# Same inputs run through the *_optimized variant (described in the original
# note as less precise), again with distance_threshold=1.
(
    enhanced_sam_person_1_optimized,
    enhanced_sam_person_2_optimized,
    enhanced_sam_person_3_optimized,
) = (
    calculate_enhanced_sam_optimized(person_gdf, pois_gdf, distance_threshold=1)
    for person_gdf in (person_1_gdf, person_2_gdf, person_3_gdf)
)

Based on all these calculations, we can conclude that the second person is more active than the first one,
and that the third one is less active, despite having a life balance quite similar to the second one's.

In [None]:
# Enhanced SAM again, this time with the helper's default distance threshold.
# Note that this rebinds the enhanced_sam_person_N names set by the earlier cell.
enhanced_sam_person_1, enhanced_sam_person_2, enhanced_sam_person_3 = map(
    lambda person_gdf: calculate_enhanced_sam(person_gdf, pois_gdf),
    (person_1_gdf, person_2_gdf, person_3_gdf),
)
print(enhanced_sam_person_1, enhanced_sam_person_2, enhanced_sam_person_3)

But based on the harmonic mean of all our variables, the first person again comes out as the more socially active one.

## Final results
### 1.1 Посещали пользователи одинаковые места? Знают ли они друг друга?
All 3 persons visited the same places, but a relationship could be established only between persons 2 and 3. Nevertheless,
the fact that person 1 visits similar points on the map could be useful for some modelling or marketing actions.
### 1.2 Ввести метрику социальной активности - она должна учитывать количество мест, которое посетил пользователь, как далеко они друг от друга и как долго человек оставался там - метрика должна позволять сравнивать пользователей и находить наиболее активных
The ranking of the most active clients depends on the method we use:
- If we do not use external data, the order is 2, 3, 1.
- If we use external data, the order is 2, 1, 3, because person 3 does not visit points of interest such as bars and cinemas.
- If we use distance as one of the main metrics (I think this is not useful, because he is travelling from a different city to
Amsterdam), the order is 1, 2, 3.