In [None]:
# ! pip install -r ../requirements-dev.txt -q

In [None]:
from math import radians, cos, sin, asin, sqrt, degrees, atan2
from pathlib import Path
from typing import Optional

import geopandas as gpd
import h3
import numpy as np
import pandas as pd
import plotly.express as px
import shapely

In [None]:
# Locations (relative to the notebook's working directory) of the input data
# and of the per-model output folders read further below.
data_dir = Path("..") / "data"
models_dir = Path("..") / "models"

## Загружаем трейн данные

In [None]:
transactions_df = pd.read_parquet(data_dir / "transactions.parquet")
# `std` is missing where there is only a single transaction; treat that as zero spread.
# The result is assigned back instead of calling `fillna(..., inplace=True)` on the
# column selection: that chained in-place form operates on a temporary object, emits a
# FutureWarning on pandas 2.x and leaves the frame unchanged once Copy-on-Write is on.
transactions_df['std'] = transactions_df['std'].fillna(0)
# transactions_df.head()

In [None]:
# Load the target table; later cells read its `h3_09` and `customer_id` columns.
target_df = pd.read_parquet(data_dir / "target.parquet")
# target_df.head()

In [None]:
# Build the list of all available hexes.
def _read_hex_list(path: Path) -> list:
    """Return the non-empty, whitespace-stripped lines of a hex-id list file.

    Works whether or not the file ends with a newline (slicing off the last
    character would corrupt the final id when the trailing newline is absent).
    """
    return [line.strip() for line in path.read_text().splitlines() if line.strip()]


# All geolocations where a cash withdrawal is possible (1658 per the original note);
# these are the locations that have to be labelled in the solution.
# 3 of them have no transactions: set(hexses_target).difference(transactions_df.h3_09)
hexses_target = _read_hex_list(data_dir / "hexses_target.lst")
assert set(hexses_target) == set(target_df.h3_09)

# All h3_09 geolocations present in transactions_df (8154 per the original note).
hexses_data = _read_hex_list(data_dir / "hexses_data.lst")

# sorted() keeps the row order -- and therefore the positional index that the plots
# use as `locations` -- identical between kernel restarts (plain set order is not).
all_hexses = sorted(set(hexses_target) | set(hexses_data))
# all_hexses = pd.DataFrame({"h3_09":all_hexses})
all_hexses = gpd.GeoDataFrame({"h3_09": all_hexses})
# Hexagon outline of every cell, built from the boundary returned in GeoJSON mode.
all_hexses["geometry"] = all_hexses["h3_09"].apply(lambda x: shapely.geometry.Polygon(h3.h3_to_geo_boundary(x, geo_json=True)))
# The two values returned by h3_to_geo are stored as `lat` and `lon`, in that order.
all_hexses[['lat', 'lon']] = all_hexses['h3_09'].apply(lambda x: pd.Series(h3.h3_to_geo(x)))

# Subset of hexes that appear in the target list (same columns as all_hexses).
atm_hexses = all_hexses.merge(pd.DataFrame({"h3_09": hexses_target}))

In [None]:
# Attach the per-hex columns of `all_hexses` (geometry, lat, lon) to both tables.
# NOTE: the frames are rebound in place, so run this cell only once per kernel
# session -- a second run would merge the hex columns in again.
transactions_df = transactions_df.merge(all_hexses, on="h3_09")
target_df = target_df.merge(all_hexses, on="h3_09")

## Посмотрим на выбранного клиента

In [None]:
# Customer id used by the (currently commented-out) activity plot call below.
customer_id = 14235

In [None]:
# See https://towardsdatascience.com/constructing-hexagon-maps-with-h3-and-plotly-a-comprehensive-tutorial-8f37a91573bb

def plot_customer_activity(all_hexses: pd.DataFrame, transactions_df: pd.DataFrame, target_df: pd.DataFrame, customer_id: Optional[int] = None) -> None:
    """Plot the hex grid together with transaction and cash-withdrawal points.

    Args:
        all_hexses: Frame with a ``geometry`` column; drawn as a semi-transparent
            choropleth layer keyed by its index.
        transactions_df: Transactions with ``customer_id``, ``lat``, ``lon`` and
            ``datetime_id`` columns; drawn as blue points.
        target_df: Targets with ``customer_id``, ``lat`` and ``lon`` columns;
            drawn as red points.
        customer_id: When given, only this customer's rows are plotted.
            ``None`` plots every customer. An id of ``0`` is a valid filter.

    The input frames are never modified. Transaction points get a small random
    (unseeded) jitter, so their exact positions differ between calls.
    """
    # `is not None` rather than truthiness: customer id 0 must still filter.
    if customer_id is not None:
        transactions_df = transactions_df.query('customer_id == @customer_id')
        target_df = target_df.query('customer_id == @customer_id')

    # Filter first, copy afterwards: only the (usually small) plotted subset is
    # duplicated, and the jitter below is written to an independent frame
    # instead of a slice of the caller's data.
    transactions_df_plot = transactions_df.copy()
    target_df_plot = target_df

    # Add a small scatter so that overlapping points stay visible on the map.
    transactions_df_plot['lat'] += np.random.normal(0, 0.0003, len(transactions_df_plot))
    transactions_df_plot['lon'] += np.random.normal(0, 0.0003, len(transactions_df_plot))

    fig = px.choropleth_mapbox(
        all_hexses,
        geojson=all_hexses.geometry,
        locations=all_hexses.index,  # Use index as locations to avoid duplicate rows
        center={"lat": 55.7558, "lon": 37.6173},  # Adjust the center as needed
        mapbox_style="open-street-map",
        opacity=0.2,
        height=800,
        zoom=10
    )

    # Transactions layer (blue).
    data = px.scatter_mapbox(transactions_df_plot, lat='lat', lon='lon', color_discrete_sequence=['blue'],
                            hover_data=["datetime_id"]).data
    for trace in data:
        fig.add_trace(trace)

    # Target layer (red).
    data = px.scatter_mapbox(target_df_plot, lat='lat', lon='lon', color_discrete_sequence=['red']).data
    for trace in data:
        fig.add_trace(trace)
    fig.show()

In [None]:
# plot_customer_activity(all_hexses, transactions_df, target_df, customer_id)

## Plot predictions

In [None]:
# See https://towardsdatascience.com/constructing-hexagon-maps-with-h3-and-plotly-a-comprehensive-tutorial-8f37a91573bb
def plot_customer_preds(hexses: pd.DataFrame,
                        transactions_df: pd.DataFrame,
                        target_df: pd.DataFrame,
                        preds: pd.DataFrame,
                        customer_id: int,
                        min_pred: float = 0.01) -> None:
    """Plot one customer's per-hex predictions together with their activity.

    Hexes are coloured by the predicted value; the customer's transactions are
    drawn as blue points and their targets as red points, each carrying the
    prediction of the hex it falls in as hover data.

    Args:
        hexses: Frame with ``h3_09`` and ``geometry`` columns.
        transactions_df: Transactions with ``customer_id``, ``h3_09``, ``lat``,
            ``lon`` and ``datetime_id`` columns.
        target_df: Targets with ``customer_id``, ``h3_09``, ``lat`` and ``lon``
            columns.
        preds: One row per customer (customer id in the index) and one column
            per ``h3_09`` hex.
        customer_id: Customer to plot.
        min_pred: Hexes whose prediction (rounded to 5 decimals) is not above
            this value are left off the map.

    Raises:
        ValueError: If ``preds`` does not contain exactly one row for
            ``customer_id``.

    The input frames are never modified. Transaction points get a small random
    (unseeded) jitter, so their exact positions differ between calls.
    """
    customer_preds = preds[preds.index == customer_id]
    if len(customer_preds) != 1:
        # Fail early with a readable message instead of the "Length mismatch"
        # error that renaming the columns below would otherwise raise.
        raise ValueError(
            f"expected exactly one row of predictions for customer_id={customer_id!r}, "
            f"found {len(customer_preds)}"
        )

    # Turn the single prediction row into a long (h3_09, pred) table.
    atm_preds = customer_preds.T.reset_index()
    atm_preds.columns = ["h3_09", "pred"]
    atm_preds["pred"] = atm_preds["pred"].round(5)

    atm_hexses_preds = hexses.merge(atm_preds, on="h3_09")
    # Drop hexes with a very low predicted value to keep the map readable.
    atm_hexses_preds = atm_hexses_preds[atm_hexses_preds["pred"] > min_pred]

    # Filter to the customer first; the merges return fresh frames, so the
    # caller's data is untouched by the jitter added below.
    transactions_df_plot = transactions_df.query('customer_id == @customer_id')
    transactions_df_plot = pd.merge(transactions_df_plot, atm_preds, how="left", on="h3_09")
    target_df_plot = target_df.query('customer_id == @customer_id')
    target_df_plot = pd.merge(target_df_plot, atm_preds, how="left", on="h3_09")

    # Add a small scatter so that overlapping points stay visible on the map.
    transactions_df_plot['lat'] += np.random.normal(0, 0.0003, len(transactions_df_plot))
    transactions_df_plot['lon'] += np.random.normal(0, 0.0003, len(transactions_df_plot))

    fig = px.choropleth_mapbox(
        atm_hexses_preds,
        geojson=atm_hexses_preds.geometry,
        locations=atm_hexses_preds.index,  # Use index as locations to avoid duplicate rows
        hover_data=["h3_09","pred"],
        color="pred",
        color_continuous_scale="BuGn",
        center={"lat": 55.7558, "lon": 37.6173},  # Adjust the center as needed
        mapbox_style="carto-positron",
        opacity=0.7,
        height=800,
        zoom=10
    )

    # Transactions layer (blue).
    data = px.scatter_mapbox(transactions_df_plot, lat='lat', lon='lon', color_discrete_sequence=['blue'], opacity=0.5,
                            hover_data=["datetime_id","pred"]).data
    for trace in data:
        fig.add_trace(trace)

    # Target layer (red).
    data = px.scatter_mapbox(target_df_plot, lat='lat', lon='lon', color_discrete_sequence=['red'], opacity=0.5,
                            hover_data=["pred"]).data
    for trace in data:
        fig.add_trace(trace)
    fig.show()

def analyse_customer(all_hexses, transactions_df, target_df, preds, row_scores, customer_id):
    """Print score statistics for a model and plot one customer's predictions.

    Args:
        all_hexses, transactions_df, target_df, preds: Passed through unchanged
            to ``plot_customer_preds``.
        row_scores: Frame indexed by customer id with a ``score`` column.
        customer_id: Customer whose score is printed and whose map is drawn.

    Raises:
        KeyError: If ``customer_id`` is not present in ``row_scores``.
    """
    scores = row_scores["score"]
    # Look the customer's score up by column name rather than by position, so
    # extra columns in `row_scores` cannot change which value is reported.
    customer_scores = scores[row_scores.index == customer_id]
    if customer_scores.empty:
        raise KeyError(f"customer_id {customer_id!r} not found in row_scores")

    print(
        f"Scores: mean={round(scores.mean(), 5)}, min={round(scores.min(), 5)}, "
        f"max={round(scores.max(), 5)}, user=",
        round(customer_scores.iloc[0], 5),
    )
    plot_customer_preds(all_hexses, transactions_df, target_df, preds, customer_id)

In [None]:
# Outputs of the first model ("dummy_mean"): score tables indexed by customer
# and by hex, plus the prediction matrix indexed by customer.
model_1_dir = models_dir / "dummy_mean"
row_scores_1 = pd.read_csv(model_1_dir / "row_scores.csv", index_col="customer_id")
col_scores_1 = pd.read_csv(model_1_dir / "col_scores.csv", index_col="h3_09")
preds_1 = pd.read_parquet(model_1_dir / "preds.parquet").set_index("customer_id")

In [None]:
def _compare_scores(scores_1: pd.DataFrame, scores_2: pd.DataFrame) -> pd.DataFrame:
    """Join two single-column score tables on their index and add the difference.

    The result has the columns ``score_1``, ``score_2`` and ``score_diff``
    (``score_2 - score_1``). Only index values present in both inputs are kept
    (default inner merge).
    """
    compare = pd.merge(scores_1, scores_2, left_index=True, right_index=True)
    compare.columns = ["score_1", "score_2"]
    compare["score_diff"] = compare["score_2"] - compare["score_1"]
    return compare


# Outputs of the second model ("catboost_plain_5iter").
model_2_dir = models_dir / "catboost_plain_5iter"
row_scores_2 = pd.read_csv(model_2_dir/"row_scores.csv").set_index("customer_id")
col_scores_2 = pd.read_csv(model_2_dir/"col_scores.csv").set_index("h3_09")
preds_2 = pd.read_parquet(model_2_dir/"preds.parquet").set_index("customer_id")

# If the two models were scored on different customers, align model 1 first:
# row_scores_1 = row_scores_1[row_scores_1.index.isin(row_scores_2.index)]
# preds_1 = preds_1[preds_1.index.isin(preds_2.index)]
assert preds_1.shape == preds_2.shape, (
    f"prediction matrices differ in shape: {preds_1.shape} vs {preds_2.shape}"
)

# Per-customer and per-hex score comparison of model 2 against model 1.
row_scores_compare = _compare_scores(row_scores_1, row_scores_2)
col_scores_compare = _compare_scores(col_scores_1, col_scores_2)

In [None]:
# Column means (score_1, score_2, score_diff) of the per-customer comparison.
display(row_scores_compare.mean())
# Values noted from an earlier run:
# 15.237166 dummy_mean
# 10.860466 catboost_plain_5iter
# Column means of the per-hex comparison.
display(col_scores_compare.mean())
# Values noted from an earlier run:
# 212.532565 dummy_mean
# 151.485038 catboost_plain_5iter

In [None]:
# row_scores_compare.score_diff.hist()

In [None]:
# Customers whose model-1 score is below 600, ordered by score difference (ascending).
row_scores_compare.loc[lambda scores: scores["score_1"] < 600].sort_values(by="score_diff")

In [None]:
# Customer to inspect in the cells below, and their row of the score comparison.
customer_id = 10742
row_scores_compare.loc[row_scores_compare.index == customer_id]

In [None]:
# Score summary and prediction map for the selected customer, model 1.
analyse_customer(all_hexses, transactions_df, target_df, preds_1, row_scores_1, customer_id)

In [None]:
# Score summary and prediction map for the same customer, model 2.
analyse_customer(all_hexses, transactions_df, target_df, preds_2, row_scores_2, customer_id)

In [None]:
# Number of distinct hexes that occur in the target table.
len(target_df.h3_09.unique())

In [None]:
# (rows, columns) of the per-hex score table of model 1, for comparison with the count above.
col_scores_1.shape

In [None]:
# Score comparison for one specific hex.
col_scores_compare.loc[col_scores_compare.index == "8911aa7b283ffff"]

## Еще файлы

In [None]:
# Additional dataset; the cells below use its `id`, `lat`, `lon` and `tags` columns.
moscow_df = pd.read_parquet(data_dir / "moscow.parquet")

In [None]:
# Preview the first rows.
moscow_df.head()

In [None]:
# Print the first 20 non-null `tags` values to see what they look like.
for tag in moscow_df["tags"].dropna().head(20):
    print(tag)

In [None]:
# Scatter the first 10,000 rows on a map, with id and tags shown on hover.
fig = px.scatter_mapbox(
    moscow_df.head(10000),
    lat="lat",
    lon="lon",
    hover_name="id",
    hover_data=["tags"],
    zoom=10,
)
fig.update_layout(
    mapbox_style="open-street-map",
    height=1000,
    title="Plot of Points",
    hovermode="closest",
)
fig.show()

## Мусор