In [1]:
import pandas as pd
import numpy as np
from scipy.special import softmax
import torch

In [2]:
data = pd.read_csv("data/POIdata_cityA.csv")
data.head()

Unnamed: 0,x,y,category,POI_count
0,1,1,74,4
1,1,1,48,4
2,1,1,79,2
3,1,1,69,2
4,1,1,73,1


In [3]:
x_scaled = np.ceil(data[["x", ]] / 10)
x_scaled = x_scaled.astype("int")
y_scaled = np.ceil(data[["y", ]] / 10)
y_scaled = y_scaled.astype("int")

data["x"] = x_scaled
data["y"] = y_scaled

In [4]:
checked = []
city = pd.DataFrame(columns = data.columns)

for x, y in zip(data["x"], data["y"]):
    if (x, y) in checked:
        continue

    print(f"Aggregating ({x}, {y}) coordinates.")
    filtered_data = data[(data["x"] == x) & (data["y"] == y)]
    aggregated_data = filtered_data.groupby("category", as_index = False)["POI_count"].sum()
    aggregated_data["x"] = [x] * len(aggregated_data)
    aggregated_data["y"] = [y] * len(aggregated_data)

    city = pd.concat([city, aggregated_data[city.columns]], ignore_index = True)
    
    checked.append((x, y))

city

Aggregating (1, 1) coordinates.
Aggregating (1, 2) coordinates.
Aggregating (1, 3) coordinates.
Aggregating (1, 4) coordinates.
Aggregating (1, 5) coordinates.
Aggregating (1, 6) coordinates.
Aggregating (1, 7) coordinates.
Aggregating (1, 8) coordinates.
Aggregating (1, 9) coordinates.
Aggregating (1, 10) coordinates.
Aggregating (2, 1) coordinates.
Aggregating (2, 2) coordinates.
Aggregating (2, 3) coordinates.
Aggregating (2, 4) coordinates.
Aggregating (2, 10) coordinates.
Aggregating (2, 11) coordinates.
Aggregating (2, 12) coordinates.
Aggregating (2, 13) coordinates.
Aggregating (2, 14) coordinates.
Aggregating (3, 1) coordinates.
Aggregating (3, 2) coordinates.
Aggregating (3, 11) coordinates.
Aggregating (3, 12) coordinates.
Aggregating (3, 14) coordinates.
Aggregating (3, 13) coordinates.
Aggregating (3, 15) coordinates.
Aggregating (3, 16) coordinates.
Aggregating (3, 17) coordinates.
Aggregating (4, 1) coordinates.
Aggregating (4, 12) coordinates.
Aggregating (4, 13) coordi

Unnamed: 0,x,y,category,POI_count
0,1,1,4,21
1,1,1,5,9
2,1,1,6,1
3,1,1,7,1
4,1,1,8,1
...,...,...,...,...
17350,20,20,81,125
17351,20,20,82,22
17352,20,20,83,1
17353,20,20,84,23


In [5]:
city = city.sort_values(by = ["x", "y", "category"], ignore_index = True)
city

Unnamed: 0,x,y,category,POI_count
0,1,1,4,21
1,1,1,5,9
2,1,1,6,1
3,1,1,7,1
4,1,1,8,1
...,...,...,...,...
17350,20,20,81,125
17351,20,20,82,22
17352,20,20,83,1
17353,20,20,84,23


In [6]:
compressed = {"x" : [], "y" : []}
for i in range(1, 86):
    compressed[i] = []

compressed_df = pd.DataFrame(compressed)

checked = []

for x, y in zip(city["x"], city["y"]):
    if (x, y) in checked:
        continue

    filtered_vector = city[(city["x"] == x) & (city["y"] == y)]
    red_vector = [0] * 85

    for idx, row in filtered_vector.iterrows():
        category = row["category"]
        poi_count = row["POI_count"]
        
        red_vector[category - 1] = poi_count

    red_vector_softmax = softmax(red_vector)
    compressed_row = [x, y]
    compressed_row.extend(red_vector_softmax)

    compressed_df.loc[len(compressed_df)] = compressed_row

    checked.append((x, y))

In [7]:
compressed_df["x"] = compressed_df["x"].astype("int")
compressed_df["y"] = compressed_df["y"].astype("int")
compressed_df

Unnamed: 0,x,y,1,2,3,4,5,6,7,8,...,76,77,78,79,80,81,82,83,84,85
0,1,1,1.589134e-28,1.589134e-28,1.589134e-28,2.095775e-19,1.287689e-24,4.319715e-28,4.319715e-28,4.319715e-28,...,7.709926e-20,4.319715e-28,3.191861e-27,6.676287e-03,4.319715e-28,2.763941e-10,3.838546e-21,4.319715e-28,4.737142e-25,1.589134e-28
1,1,2,3.221340e-27,8.756511e-27,3.221340e-27,2.610279e-23,6.470235e-26,8.756511e-27,8.756511e-27,3.221340e-27,...,2.862519e-20,8.756511e-27,6.470235e-26,1.154822e-17,1.758792e-25,1.522998e-08,3.532629e-24,6.470235e-26,1.053062e-20,8.756511e-27
2,1,3,2.483533e-23,9.136408e-24,2.483533e-23,5.470346e-19,1.001929e-20,9.136408e-24,1.355963e-21,2.483533e-23,...,4.432667e-15,9.136408e-24,1.355963e-21,4.736959e-02,3.685890e-21,8.676042e-04,1.486994e-18,1.835097e-22,3.275323e-14,9.136408e-24
3,1,4,1.688912e-48,1.688912e-48,4.590938e-48,1.333615e-34,2.506567e-46,1.688912e-48,1.247946e-47,4.590938e-48,...,4.780892e-25,4.590938e-48,3.392270e-47,6.914399e-13,3.720075e-44,9.999998e-01,8.985825e-37,1.852116e-45,2.442600e-36,4.590938e-48
4,1,5,2.639245e-66,2.406680e-69,1.778309e-68,2.227089e-39,1.580225e-61,2.406680e-69,7.174211e-66,4.833945e-68,...,4.905489e-35,1.314003e-67,3.916986e-64,3.532193e-24,1.314003e-67,1.233946e-04,3.817026e-54,5.301064e-65,6.375083e-59,1.778309e-68
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332,20,16,5.949087e-05,5.949087e-05,5.949087e-05,5.949087e-05,5.949087e-05,5.949087e-05,5.949087e-05,5.949087e-05,...,4.395813e-04,5.949087e-05,5.949087e-05,4.395813e-04,1.617129e-04,2.400033e-02,4.395813e-04,1.617129e-04,5.949087e-05,5.949087e-05
333,20,17,2.061118e-09,2.061118e-09,2.061118e-09,1.522972e-08,2.061118e-09,2.061118e-09,2.061118e-09,2.061118e-09,...,5.602700e-09,2.061118e-09,2.061118e-09,2.061118e-09,2.061118e-09,1.125332e-07,1.522972e-08,2.061118e-09,2.061118e-09,2.061118e-09
334,20,18,3.139133e-17,3.139133e-17,3.139133e-17,4.658886e-15,8.533047e-17,3.139133e-17,8.533047e-17,3.139133e-17,...,4.658886e-15,3.139133e-17,8.533047e-17,9.357622e-14,8.533047e-17,4.139937e-08,8.533047e-17,8.533047e-17,2.319523e-16,3.139133e-17
335,20,19,2.748785e-43,2.748785e-43,5.521082e-42,4.780892e-25,3.305700e-37,7.471971e-43,5.521082e-42,5.521082e-42,...,2.170522e-29,5.521082e-42,3.014408e-40,9.357622e-14,1.500786e-41,9.999999e-01,3.625141e-34,8.194012e-40,6.639676e-36,2.031092e-42


In [8]:
matrix_768x85 = torch.load("phrase_embeddings.pt")
vectors = compressed_df.drop(columns = ["x", "y"])
orange_vector_dict = {}

for idx, row in vectors.iterrows():
    vector_85x1 = torch.tensor(list(row))
    orange_vector = torch.matmul(vector_85x1, matrix_768x85)
    orange_vector_list = orange_vector.tolist()
    
    for i in range(len(orange_vector_list)):
        tostring = str(i + 1)
        key = "orange" + tostring
        
        if orange_vector_dict.get(key, "na") == "na":
            orange_vector_dict[key] = [orange_vector_list[i]]
        else:
            orange_vector_dict[key].append(orange_vector_list[i])

  matrix_768x85 = torch.load("phrase_embeddings.pt")


In [9]:
orange_vector_df = pd.DataFrame(orange_vector_dict)
final_df = pd.concat([compressed_df, orange_vector_df], axis = 1)
final_df.to_csv("cityA_red_orange.csv", index = False)