In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
import warnings
import geobleu

# Suppress warnings of category UserWarning for the rest of the session
warnings.filterwarnings("ignore", category=UserWarning)

def count_data_within_m_hours(uid, d_t_combined, m):
    """Count one user's records lying in the window that ends at ``d_t_combined``.

    Reads the module-level ``data`` frame (it is not passed in), which must
    provide the columns ``uid`` and ``d_t_combined``.

    A record is counted when it belongs to ``uid``, its ``d_t_combined`` is
    not later than the given one, and the difference between the two is at
    most ``m``.
    NOTE(review): the function name suggests ``m`` is in hours; the unit of
    ``d_t_combined`` is not visible here -- confirm.

    Returns:
        The number of matching records as an ``int``.
    """
    stamps = data['d_t_combined']
    in_window = (
        (data['uid'] == uid)
        & (stamps <= d_t_combined)
        & ((d_t_combined - stamps) <= m)
    )
    return int(in_window.sum())

def _am_pm_label(t):
    """Return 'AM' for 0 <= t <= 24, 'PM' for 25 <= t <= 48, NaN otherwise."""
    if 0 <= t <= 24:
        return 'AM'
    if 25 <= t <= 48:
        return 'PM'
    return np.nan


def _activity_label(t):
    """Map a time-slot value to one of the four activity bands (NaN if none match)."""
    if (18 <= t <= 22) or (33 <= t <= 37):
        return 'act'
    if 23 <= t <= 32:
        return 'high_act'
    if (13 <= t <= 17) or (38 <= t <= 43):
        return 'rest'
    if (0 <= t <= 12) or (44 <= t <= 48):
        return 'deep_rest'
    return np.nan


def _kmeans_labels(features, n_clusters):
    """Fit K-means (fixed random_state=456) on ``features`` and return the labels."""
    kmeans = KMeans(n_clusters=n_clusters, random_state=456)
    kmeans.fit(features)
    return kmeans.labels_


def featurecreation(data, pcnum, bcnum, cell_path='cell_POIcat.csv'):
    """Add calendar, time-of-day, POI-frequency and cluster features to ``data``.

    Steps, in order:
      1. Cyclic sin/cos encodings of ``d`` (period 7) and ``t`` (period 48),
         day-of-week dummies (``day_*``), ``weekday_flag``, AM/PM dummies and
         activity-band dummies (``act``, ``high_act``, ``rest``, ``deep_rest``).
      2. POI counts per cell are read from ``cell_path``, pivoted to one
         column per category and joined on the cell key ``"<x>_<y>"``.
         Remaining NaN values in the whole frame are then replaced by 0.
      3. Per ``(uid, activetime)`` group, the number of rows with a non-zero
         count is taken for every POI category and normalised by the group's
         total, giving ``freq_POICategory_<i>`` columns.
      4. K-means on the frequency columns -> ``PCluster_*`` dummies.
      5. K-means on the basic time features -> ``GCluster_*`` dummies.

    Args:
        data: DataFrame with at least the columns ``uid``, ``d``, ``t``,
            ``x`` and ``y``. The first columns (``d_sin``, ``d_cos``,
            ``t_sin``, ``t_cos``, ``day_of_week``) are written onto the
            passed-in frame itself before a new frame is created.
        pcnum: Number of clusters for the POI-frequency K-means.
        bcnum: Number of clusters for the basic-feature K-means.
        cell_path: CSV file with the columns ``x``, ``y``, ``POIcategory``
            and ``POI_count``. Defaults to the file name used originally.

    Returns:
        A DataFrame containing the input columns plus all generated
        features. The raw ``POIcategory_*`` count columns are removed; the
        label column ``Cluster`` of the second clustering is kept.
    """
    # --- basic calendar / time-of-day features -------------------------------
    # Cyclic encoding of the day index (period 7) and the time slot (period 48).
    data['d_sin'] = np.sin(2 * np.pi * data['d'] / 7)
    data['d_cos'] = np.cos(2 * np.pi * data['d'] / 7)
    data['t_sin'] = np.sin(2 * np.pi * data['t'] / 48)
    data['t_cos'] = np.cos(2 * np.pi * data['t'] / 48)

    # Translate d % 7 into a weekday name and one-hot encode it.
    day_of_week_mapping = {0: 'Sun', 1: 'Mon', 2: 'Tue', 3: 'Wed', 4: 'Thu', 5: 'Fri', 6: 'Sat'}
    data['day_of_week'] = data['d'] % 7
    data['day_of_week'] = data['day_of_week'].map(day_of_week_mapping)
    data = pd.concat([data, pd.get_dummies(data['day_of_week'], prefix='day')], axis=1)

    # weekday_flag is 1 by default and 0 for 'Sat' / 'Sun'.
    data['weekday_flag'] = 1
    data.loc[data['day_of_week'].isin(['Sat', 'Sun']), 'weekday_flag'] = 0

    # Half-of-day label and activity band, both derived from the time slot.
    data['AM_PM'] = data['t'].apply(_am_pm_label)
    data['activetime'] = data['t'].apply(_activity_label)
    data = pd.concat([data, pd.get_dummies(data['AM_PM'])], axis=1)
    data = pd.concat([data, pd.get_dummies(data['activetime'])], axis=1)

    print("complete basic feature creation")

    # --- POI counts per cell --------------------------------------------------
    cell = pd.read_csv(cell_path)
    cell['POIcategory'] = cell['POIcategory'].astype(str)
    pivot_df = cell.pivot_table(
        index=['x', 'y'], columns='POIcategory', values='POI_count', aggfunc='sum'
    ).fillna(0).astype(int)
    pivot_df.columns = ['POIcategory_' + col for col in pivot_df.columns]

    # One row per (x, y) cell carrying all category counts.
    cell = cell[['x', 'y']].merge(pivot_df, on=['x', 'y'], how='left').drop_duplicates().reset_index(drop=True)

    # Build the same "<x>_<y>" key on both frames and pull the counts into data.
    cell['x_y'] = cell['x'].astype(str) + '_' + cell['y'].astype(str)
    data['x_y'] = data['x'].astype(str) + '_' + data['y'].astype(str)
    poi_cols = list(cell.filter(like="POIcategory_").columns)
    data = data.join(cell.set_index('x_y')[poi_cols], on='x_y')

    # Rows whose cell has no POI entry (and any other NaN) become 0.
    data = data.fillna(0)

    # --- POI visit frequencies per (uid, activetime) ---------------------------
    # Count, per group, the rows with a non-zero count for each category.
    visited = data[poi_cols].ne(0).astype(int)
    visited[['uid', 'activetime']] = data[['uid', 'activetime']]
    freq = visited.groupby(['uid', 'activetime'])[poi_cols].sum().reset_index()

    # Rename by position; the number of columns follows the data instead of a
    # hard-coded 85.
    # NOTE: the numeric suffix is positional and need not equal the category
    # label found in the CSV.
    freq_cols = [f'freq_POICategory_{i}' for i in range(1, len(poi_cols) + 1)]
    freq.columns = ['uid', 'activetime'] + freq_cols

    # Normalise by the total over ALL frequency columns. (The previous slice
    # iloc[:, 1:85] included the 'activetime' column and left out the last
    # two categories.) Groups with a zero total get ratios of 0 instead of
    # NaN so that the clustering below receives finite values.
    total = freq[freq_cols].sum(axis=1)
    freq[freq_cols] = freq[freq_cols].div(total.where(total != 0), axis=0).fillna(0)

    data = data.merge(freq, on=['uid', 'activetime'], how='left')

    # The raw per-cell counts are no longer needed.
    data = data.drop(columns=poi_cols)

    print("complete POI Category frequency feature creation")

    # --- clusters of the POI frequency profile ---------------------------------
    data["Cluster"] = _kmeans_labels(data[freq_cols], pcnum)
    data = pd.concat([data, pd.get_dummies(data['Cluster'], prefix='PCluster')], axis=1)
    data = data.drop("Cluster", axis=1)

    print("complete cluster feature of POI Category frequency feature creation")

    # --- clusters of the basic time features -----------------------------------
    X = data[['d_sin', 'd_cos', 't_sin', 't_cos', 'AM', 'PM', 'act', 'high_act', 'rest', 'deep_rest']]
    print(len(data))

    labels = _kmeans_labels(X, bcnum)
    print(len(labels))

    # The 'Cluster' label column of this second clustering stays in the result.
    data["Cluster"] = labels
    data = pd.concat([data, pd.get_dummies(data['Cluster'], prefix='GCluster')], axis=1)

    print("complete cluster feature of basic feature creation")

    return data


def model_xy(uid, data, col_list):
    """Fit two RBF-kernel SVR models that predict ``x`` and ``y`` for one user.

    Only rows of ``uid`` whose ``x`` and ``y`` are both different from 999
    are used for training (999 is the value this file treats as "to be
    predicted").

    Args:
        uid: User id to select from ``data['uid']``.
        data: DataFrame holding ``uid``, ``x``, ``y`` and the feature columns.
        col_list: Names of the feature columns.

    Returns:
        ``(model_x, model_y)``: the fitted SVR for ``x`` and for ``y``.

    Raises:
        ValueError: If the user has no row with known ``x``/``y``.
    """
    # Keep only this user's rows with known coordinates.
    observed = data[(data['uid'] == uid) & (data['x'] != 999) & (data['y'] != 999)]
    if observed.empty:
        # Fail with an explicit message before handing an empty set to SVR.
        raise ValueError(f"no training rows with known x/y for uid {uid}")

    X = observed[col_list]

    # One independent regressor per coordinate.
    model_x = SVR(kernel='rbf')
    model_y = SVR(kernel='rbf')

    model_x.fit(X, observed['x'])
    model_y.fit(X, observed['y'])

    return model_x, model_y

# Progress message printed when this cell runs, right after model_xy is defined.
print('reading learning function...')


# Prediction helper
def predict_xy(features, model_x, model_y):
    """Return ``(pred_x, pred_y)``: each model's ``predict`` applied to ``features``.

    ``model_x`` is evaluated first, then ``model_y``; both receive the same
    ``features`` object unchanged.
    """
    return tuple(model.predict(features) for model in (model_x, model_y))


# Progress message printed when this cell runs, right after predict_xy is defined.
print('reading predicting function...')



def _mean_std(values):
    """Return the mean and the population standard deviation of ``values``."""
    mean = sum(values) / len(values)
    std = (sum((v - mean) ** 2 for v in values) / len(values)) ** 0.5
    return mean, std


def evaluation(data, reference_data, col_list):
    """Predict the masked positions of every user and print GEO-BLEU / DTW statistics.

    For each user in ``data`` that has rows with ``x == 999`` and ``y == 999``:
      * per-user models are trained with ``model_xy``,
      * all masked rows are predicted in one call to ``predict_xy``,
      * predictions are rounded and limited to the range 1..200,
      * each prediction is paired with the first row of ``reference_data``
        that has the same ``uid``, ``d`` and ``t``,
      * ``geobleu.calc_geobleu`` and ``geobleu.calc_dtw`` are computed on the
        paired sequences of ``(d, t, x, y)`` tuples.

    Predictions without a matching reference row are left out so that the
    generated and reference sequences always have the same length and the
    same time stamps. Users with no masked rows, or with no matching
    reference rows at all, do not contribute to the statistics.

    Args:
        data: Feature frame with ``uid``, ``d``, ``t``, ``x``, ``y`` and the
            columns named in ``col_list``.
        reference_data: Frame with the true ``uid``, ``d``, ``t``, ``x``, ``y``.
        col_list: Names of the feature columns passed to the models.

    Returns:
        None. The mean and population standard deviation of both metrics are
        printed; a notice is printed instead when no user could be evaluated.
    """
    geobleu_vals = []
    dtw_vals = []

    for uid in data['uid'].unique():
        print(f"uid: {uid}")

        # Rows of this user whose position has to be predicted.
        masked = data[(data['uid'] == uid) & (data['x'] == 999) & (data['y'] == 999)]
        if masked.empty:
            continue

        model_x, model_y = model_xy(uid, data, col_list)

        # Predict every masked row at once instead of one call per row.
        pred_x, pred_y = predict_xy(masked[col_list], model_x, model_y)

        # Round to integers and keep the result inside 1..200.
        pred_x = np.clip(np.round(np.asarray(pred_x)), 1, 200).astype(int)
        pred_y = np.clip(np.round(np.asarray(pred_y)), 1, 200).astype(int)

        # (d, t) -> (x, y) for this user; the first occurrence wins.
        ref_user = reference_data[reference_data['uid'] == uid]
        ref_lookup = {}
        for ref_d, ref_t, ref_x, ref_y in zip(ref_user['d'], ref_user['t'], ref_user['x'], ref_user['y']):
            ref_lookup.setdefault((ref_d, ref_t), (ref_x, ref_y))

        generated = []
        reference = []
        for d, t, gx, gy in zip(masked['d'], masked['t'], pred_x, pred_y):
            truth = ref_lookup.get((d, t))
            if truth is None:
                # No ground truth for this time stamp: nothing to compare with.
                continue
            generated.append((d, t, int(gx), int(gy)))
            reference.append((d, t, truth[0], truth[1]))

        if not generated:
            continue

        # Metric computation for this user.
        geobleu_vals.append(geobleu.calc_geobleu(generated, reference, processes=3))
        dtw_vals.append(geobleu.calc_dtw(generated, reference, processes=3))

    if not geobleu_vals:
        # Avoids a ZeroDivisionError when there is nothing to average.
        print("No users with masked rows and matching reference rows; nothing to evaluate.")
        return

    # Mean and standard deviation over the evaluated users.
    average_geobleu, std_geobleu = _mean_std(geobleu_vals)
    average_dtw, std_dtw = _mean_std(dtw_vals)

    print("Average geobleu:", average_geobleu)
    print("Standard deviation geobleu:", std_geobleu)
    print("Average dtw:", average_dtw)
    print("Standard deviation dtw:", std_dtw)



In [None]:
# Load the table to be completed and the ground-truth table.
# Both paths are placeholders that have to be filled in before running.
data = pd.read_csv('<your path>')
reference_data = pd.read_csv('<your path>')

# Derive all features, using 5 clusters for each of the two clusterings.
data = featurecreation(data=data, pcnum=5, bcnum=5)

# Collect the generated column groups by their name fragments.
freq_col = [c for c in data.columns if "freq_POICategory_" in str(c)]
freq_cluster_col = [c for c in data.columns if "PCluster_" in str(c)]
basic_cluster_col = [c for c in data.columns if "GCluster_" in str(c)]
basic_col = [
    'd_sin', 'd_cos', 't_sin', 't_cos',
    'day_Sun', 'day_Mon', 'day_Tue', 'day_Wed', 'day_Thu', 'day_Fri', 'day_Sat',
    'weekday_flag',
    'AM', 'PM',
    'act', 'high_act', 'rest', 'deep_rest',
]

# Features given to the models: basic features plus both sets of cluster dummies.
col_list = [*basic_col, *basic_cluster_col, *freq_cluster_col]
evaluation(data, reference_data, col_list)