In [20]:
import os
import random
import numpy as np
import torch
import tensorflow as tf
from transformers.trainer_utils import set_seed

def set_random_seed(seed: int = 42):
    """Seed Python hashing, `random`, NumPy, PyTorch (CPU and CUDA) and TensorFlow.

    Also switches cuDNN to deterministic mode and turns off its benchmark mode.

    Args:
        seed: Value handed to every random number generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Seed each library's global generator in turn.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        tf.random.set_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Fixed cuDNN settings for reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_random_seed(42)

In [21]:
import torch
import tensorflow as tf
import gc

def report_memory():
    """
    Check GPU visibility for PyTorch and TensorFlow.

    Prints a message for each framework that detects no GPU. When TensorFlow
    GPUs are present, their memory info is queried once per device; the
    returned values are not printed.
    """
    if not torch.cuda.is_available():
        print("PyTorchでGPUが検出されませんでした。")

    tf_gpus = tf.config.experimental.list_physical_devices('GPU')
    if not tf_gpus:
        print("TensorFlowでGPUが検出されませんでした。")
        return

    for device_idx in range(len(tf_gpus)):
        # Devices are addressed by position, e.g. "GPU:0".
        tf.config.experimental.get_memory_info(f"GPU:{device_idx}")

def cleanup_gpu_memory():
    """
    Release cached GPU memory held by PyTorch and TensorFlow, then call report_memory().
    """
    # Drop unreachable Python objects first.
    gc.collect()

    # PyTorch: empty the CUDA cache and reset its memory statistics.
    if torch.cuda.is_available():
        torch_resets = (
            torch.cuda.empty_cache,
            torch.cuda.reset_peak_memory_stats,
            torch.cuda.reset_accumulated_memory_stats,
        )
        for reset in torch_resets:
            reset()

    # TensorFlow: clear the Keras session, then reset stats on each visible GPU.
    tf.keras.backend.clear_session()
    visible_gpus = tf.config.experimental.list_physical_devices('GPU')
    for device_idx in range(len(visible_gpus)):
        # The device name uses the "GPU:0" form.
        tf.config.experimental.reset_memory_stats(f"GPU:{device_idx}")

    report_memory()

In [22]:
import torch.nn as nn
import torch_geometric.nn as pyg_nn
import torch.nn.functional as F
from torch_geometric.utils import degree

class Swish(nn.Module):
    """Swish activation: multiplies the input element-wise by its own sigmoid."""

    def forward(self, x):
        gate = torch.sigmoid(x)
        return x * gate
        
class STGCNModel(nn.Module):
    """Two Chebyshev graph convolutions followed by an LSTM and a linear head.

    Args:
        in_channels: Number of input features per node.
        hidden_channels: Output width of both graph convolutions and of the LSTM.
        num_classes: Number of logits produced per node.
    """

    def __init__(self, in_channels, hidden_channels, num_classes):
        super().__init__()

        # Kept so existing code reading `model.device` keeps working. forward()
        # does not rely on it, because it only reflects CUDA availability at
        # construction time, not where the parameters actually are.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.conv1 = pyg_nn.ChebConv(in_channels, hidden_channels, K=3)  # K: order of the polynomial filter
        self.conv2 = pyg_nn.ChebConv(hidden_channels, hidden_channels, K=3)

        # The LSTM is unidirectional, so its output width is hidden_channels,
        # which is what the linear head below expects.
        self.lstm = nn.LSTM(hidden_channels, hidden_channels, batch_first=True)
        self.swish = Swish()
        self.fc = nn.Linear(hidden_channels, num_classes)

    def forward(self, data):
        """Return `(logits, embedding)` for every node of `data`.

        Args:
            data: Object exposing `x` (node features) and `edge_index` tensors.

        Returns:
            Tuple of the linear head's output and the LSTM output it was computed from.
        """
        # Move the inputs to wherever the parameters live, so a model kept on
        # CPU still works on a machine where CUDA is available.
        device = next(self.parameters()).device
        x, edge_index = data.x.to(device), data.edge_index.to(device)

        # Graph convolutions
        x = self.conv1(x, edge_index)
        x = self.swish(x)
        x = self.conv2(x, edge_index)

        # LSTM: each node is fed as a sequence of length one.
        x = x.unsqueeze(1)
        lstm_out, _ = self.lstm(x)

        # Output of the last (and only) time step.
        embedding = lstm_out[:, -1, :]

        # Output layer
        logits = self.fc(embedding)

        return logits, embedding

In [23]:
def featurecreation(data):
    """Add cyclic day/time features and a binary time-of-day flag to `data`.

    The columns are added to `data` in place and the same object is returned.

    Args:
        data: DataFrame with a day column 'd' and a time-slot column 't'.

    Returns:
        `data`, with the extra columns 'd_sin', 'd_cos', 't_sin', 't_cos'
        and 'time_point'.
    """
    print('start create feature...')

    # Encode the day with a period of 7 (day of week) as sin/cos.
    day_angle = 2 * np.pi * data['d'] / 7
    data['d_sin'] = np.sin(day_angle)
    data['d_cos'] = np.cos(day_angle)

    # Encode the time slot with a period of 48 as sin/cos.
    slot_angle = 2 * np.pi * data['t'] / 48
    data['t_sin'] = np.sin(slot_angle)
    data['t_cos'] = np.cos(slot_angle)

    # time_point is 0 for slots 0-14 and 38-48 (per the original note: 0-7h and
    # 19-24h) and 1 otherwise. Computed as a vectorized mask instead of a
    # per-row Python lambda.
    slot = data['t']
    off_hours = ((slot >= 0) & (slot <= 14)) | ((slot >= 38) & (slot <= 48))
    data['time_point'] = (~off_hours).astype('int64')

    return data

In [24]:
def create_edge_index(data):
    """Build a chain graph linking each row of `data` to the next one.

    Args:
        data: Any sized object; `len(data)` is the number of nodes.

    Returns:
        Long tensor of shape (2, max(len(data) - 1, 0)); row 0 holds the edge
        sources `i` and row 1 the edge targets `i + 1`.
    """
    edge_count = max(len(data) - 1, 0)
    sources = torch.arange(edge_count, dtype=torch.long)  # edge start nodes
    targets = sources + 1  # edge end nodes
    return torch.stack([sources, targets])

In [25]:
# トレーニングプロセス
def train_model(data, labels, model, optimizer, criterion, thresholds, epochs=300, patience=10):
    """Train `model` full-batch with early stopping and restore its best weights.

    Args:
        data: Input passed to `model`; the model must return `(output, extra)`.
        labels: Targets passed to `criterion` together with the model output.
        model: Module to train (modified in place).
        optimizer: Optimizer built over `model`'s parameters.
        criterion: Loss function called as `criterion(output, labels)`.
        thresholds: If the loss of the very first epoch is at or above this
            value, training is abandoned immediately.
        epochs: Maximum number of epochs.
        patience: Number of consecutive non-improving epochs tolerated before
            stopping early.

    Returns:
        Tuple `(results, best_model_state, loss)`:
        - results: list holding one dict with the epoch-0 loss.
        - best_model_state: an independent copy of the weights that produced
          the lowest loss, or None when training was abandoned or never ran.
        - loss: detached loss tensor of the last evaluated epoch (None when
          `epochs` is 0).
    """
    results = []

    model.train()
    set_random_seed(42)
    best_loss = float('inf')
    patience_counter = 0
    best_model_state = None  # snapshot of the best weights seen so far
    loss = None  # stays None only when epochs == 0

    for epoch in range(epochs):
        optimizer.zero_grad()
        out, _ = model(data)
        loss = criterion(out, labels)
        loss_value = loss.item()  # read once; every .item() call copies the value to the host

        if epoch == 0:
            results.append({
                'epoch': epoch,
                'loss': loss_value
            })
            if loss_value >= thresholds:
                # Too hard to fit: give up before the first update.
                return results, None, loss.detach()

        # Record the best loss. The snapshot is taken BEFORE the optimizer step
        # so the stored weights are exactly the ones that produced `best_loss`,
        # and every tensor is cloned because state_dict() returns references to
        # the live parameters, which later steps would overwrite.
        if loss_value < best_loss:
            best_loss = loss_value
            patience_counter = 0  # improvement: reset the counter
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            patience_counter += 1  # no improvement this epoch

        loss.backward()
        optimizer.step()

        # Show the loss every 100 epochs.
        if epoch % 100 == 0:
            print(f'Epoch {epoch}, Loss: {loss_value}')

        # Early stopping condition.
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch} with best loss {best_loss:.4f}")
            break

    # Put the best weights back into the model.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        print(f"Best model restored with loss {best_loss:.4f}")

    # Detach so the caller does not keep the autograd graph alive.
    final_loss = loss.detach() if loss is not None else None
    return results, best_model_state, final_loss

def add_embedding_to_dataframe(df, embedding):
    """Return a frame holding `df['x_y']` next to one column per embedding dimension.

    Args:
        df: DataFrame with an 'x_y' column; its index is discarded.
        embedding: 2-D tensor with one row per row of `df`.

    Returns:
        DataFrame with columns 'x_y', 'dim_0', 'dim_1', ... on a fresh 0..n-1 index.
    """
    values = embedding.cpu().numpy()
    dim_names = [f'dim_{k}' for k in range(values.shape[1])]
    embedding_frame = pd.DataFrame(values, columns=dim_names)
    xy_labels = df['x_y'].reset_index(drop=True)
    return pd.concat([xy_labels, embedding_frame], axis=1)

# 重みの初期化関数
def initialize_weights(m):
    """Re-seed all RNGs, then Xavier-initialise `m` when it is a linear layer.

    Intended for `model.apply(...)`: the seed is reset on every call, whatever
    the module type; only `nn.Linear` modules have their parameters changed.
    """
    set_random_seed(42)
    if not isinstance(m, nn.Linear):
        return

    # Weights: Xavier uniform. Bias (when present): zeros.
    nn.init.xavier_uniform_(m.weight)
    if m.bias is not None:
        m.bias.data.zero_()

In [28]:
import torch
from torch_geometric.data import Data
import torch_geometric.transforms as T
from sklearn.preprocessing import LabelEncoder
import geobleu_main.geobleu as geobleu
import time
import itertools
import csv
from sklearn.svm import SVR, SVC
import pandas as pd
import numpy as np
from tensorflow import keras
import geobleu_main.geobleu as geobleu
import warnings
from sklearn.preprocessing import MinMaxScaler, StandardScaler 

# Silence every warning category process-wide for the rest of the session.
warnings.filterwarnings("ignore")

def _fit_predict_svc(train_features, train_labels, test_features):
    """Fit an RBF-kernel SVC on the training rows and predict all test rows in one call."""
    classifier = SVC(kernel='rbf', random_state=42)
    classifier.fit(train_features, train_labels)
    return classifier.predict(test_features)


def _decode_xy_labels(xy_labels, lower=1, upper=200):
    """Split 'x_y' label strings into integer x and y arrays clipped to [lower, upper]."""
    parts = np.array([label.split('_') for label in xy_labels])
    # Go through float first so labels written like '12.0_7.0' still parse.
    xs = parts[:, 0].astype(float).astype(int)
    ys = parts[:, 1].astype(float).astype(int)
    return np.clip(xs, lower, upper), np.clip(ys, lower, upper)


def create_data(df_target, thresholds, save_dir, city_name):
    """Predict the locations of days >= 60 for every user and save them as CSV.

    For each `uid`, rows with `d < 60` are used for training and rows with
    `d >= 60` are predicted:

    * users seen at a single location get that location for every row;
    * otherwise an STGCNModel is trained on the user's rows. If `train_model`
      returns no model state, or the returned loss is at or above
      `thresholds`, an SVC on the raw time features is used; else an SVC is
      fitted on the model's embeddings.

    The result is written to `{save_dir}/mukumuku_city{city_name}_humob24.csv`
    and to the same path with a `.gz` suffix (gzip-compressed).

    Args:
        df_target: DataFrame with columns 'uid', 'd', 't', 'x', 'y'. Feature
            columns and an 'x_y' column are added to it in place.
        thresholds: Loss value at or above which the raw-feature SVC is used.
        save_dir: Existing directory that receives the output files.
        city_name: City identifier inserted into the output file name.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Feature engineering
    print(df_target.shape)
    df = featurecreation(df_target)
    print(df.shape)

    df['x_y'] = df['x'].astype(str) + '_' + df['y'].astype(str)

    use_col = ['d_sin', 'd_cos', 't_sin', 't_cos', 'time_point']
    generated_all = []

    start = time.perf_counter()

    # Process the data one user at a time.
    for uid, group in df.groupby('uid'):
        print(f"Processing uid: {uid}")

        group = group.reset_index(drop=True)
        train_data = group[group['d'] < 60]
        test_data = group[group['d'] >= 60]

        if len(train_data['x_y'].unique()) == 1:
            # Only one location was ever observed: repeat it for every row to predict.
            const_x = train_data['x'].iloc[0]
            const_y = train_data['y'].iloc[0]
            if not test_data.empty:
                print(const_x, const_y)
            generated_all.extend(
                (uid, d, t, const_x, const_y)
                for d, t in zip(test_data['d'], test_data['t'])
            )
            continue

        if test_data.empty:
            # Nothing to predict for this user; decoding an empty prediction
            # array further down would raise.
            continue

        # Training graph: node features plus chain edges between consecutive rows.
        data_tr = Data(
            x=torch.tensor(train_data[use_col].values, dtype=torch.float),
            edge_index=create_edge_index(train_data),
        ).to(device)

        # Encode the 'x_y' labels as class indices. No column is written back
        # to `train_data`, which avoids assigning on a slice of `group`.
        label_encoder = LabelEncoder()
        encoded_labels = label_encoder.fit_transform(train_data['x_y'])
        labels = torch.tensor(encoded_labels, dtype=torch.long).to(device)
        num_classes = len(label_encoder.classes_)

        # Model, loss and optimizer. The model is moved to the device before
        # the optimizer is built over its parameters.
        model = STGCNModel(in_channels=len(use_col), hidden_channels=64, num_classes=num_classes)
        model.apply(initialize_weights)
        model = model.to(device)

        criterion = nn.CrossEntropyLoss()  # classification over location labels
        optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)

        _, best_model_state, loss = train_model(
            data_tr, labels, model, optimizer, criterion, thresholds, epochs=500, patience=15
        )

        use_fallback = best_model_state is None or bool(loss >= thresholds)
        del loss, labels, optimizer

        if use_fallback:
            # The network did not fit well enough: classify on the raw features.
            predicted_x_y = _fit_predict_svc(
                train_data[use_col], train_data['x_y'], test_data[use_col]
            )
        else:
            # Make sure the best weights are the ones in use, then embed both splits.
            model.load_state_dict(best_model_state)
            model.eval()

            data_ts = Data(
                x=torch.tensor(test_data[use_col].values, dtype=torch.float),
                edge_index=create_edge_index(test_data),
            ).to(device)

            with torch.no_grad():
                _, train_embedding = model(data_tr)
                _, test_embedding = model(data_ts)

            train_svc = add_embedding_to_dataframe(train_data, train_embedding)
            test_svc = add_embedding_to_dataframe(test_data, test_embedding).drop('x_y', axis=1)
            del data_ts, train_embedding, test_embedding

            predicted_x_y = _fit_predict_svc(
                train_svc.drop('x_y', axis=1), train_svc['x_y'], test_svc
            )

        del model, data_tr, best_model_state
        cleanup_gpu_memory()

        # Turn the predicted 'x_y' labels back into integer coordinates within 1..200.
        avg_x, avg_y = _decode_xy_labels(predicted_x_y)

        # One (uid, d, t, x, y) tuple per predicted row.
        generated_all.extend(
            zip(test_data['uid'], test_data['d'], test_data['t'], avg_x, avg_y)
        )

    df_generated = pd.DataFrame(generated_all, columns=['uid', 'd', 't', 'x', 'y'])

    filename = f"{save_dir}/mukumuku_city{city_name}_humob24.csv"

    # Save gzip-compressed.
    df_generated.to_csv(f'{filename}.gz', index=False, compression="gzip")

    # Save as plain CSV.
    df_generated.to_csv(filename, index=False)

    print("ファイル保存完了")

    end = time.perf_counter()
    print('{:.2f}分'.format((end-start)/60))

In [30]:
import os

# ⇒上記の結果から閾値を以下のように設定
# cityB: 5.1
# cityC: 5.2
# cityD: 5.3

city_list = ['B', 'C', 'D']
thresholds_list = [5.1, 5.2, 5.3]  # one loss threshold per entry of city_list
save_dir = 'submit_file_GNN'

# Create the output directory when missing; a no-op when it already exists.
os.makedirs(save_dir, exist_ok=True)

# Pair every city with its threshold instead of indexing two lists by position.
for i, (c, thresholds) in enumerate(zip(city_list, thresholds_list)):

    print(f'City Name: city{c}')
    print(f'Threshold: {thresholds}')

    df_target = pd.read_csv(f'HuMob_Data/city{c}_challengedata_target.csv')

    # For a quick trial run, restrict df_target to a handful of uids before this call.
    create_data(df_target, thresholds, save_dir, c)

City Name: cityB
Threshold: 5.1
(2399892, 5)
start create feature...
(2399892, 10)
Processing uid: 22000
Epoch 0, Loss: 3.8010895252227783
Epoch 100, Loss: 1.6108635663986206
Epoch 200, Loss: 1.0982691049575806
Epoch 300, Loss: 0.7103473544120789
Epoch 400, Loss: 0.48730239272117615
Best model restored with loss 0.3662
Processing uid: 22001
Epoch 0, Loss: 4.438796520233154
Epoch 100, Loss: 2.092865228652954
Epoch 200, Loss: 1.7626616954803467
Epoch 300, Loss: 1.4297770261764526
Epoch 400, Loss: 1.1591169834136963
Best model restored with loss 0.9635
Processing uid: 22002
Epoch 0, Loss: 4.941259384155273
Epoch 100, Loss: 2.957155227661133
Epoch 200, Loss: 2.268467664718628
Epoch 300, Loss: 1.8036857843399048
Epoch 400, Loss: 1.5228379964828491
Best model restored with loss 1.3416
Processing uid: 22003
Epoch 0, Loss: 3.5065600872039795
Epoch 100, Loss: 1.2708261013031006
Epoch 200, Loss: 1.1212379932403564
Epoch 300, Loss: 0.9522225260734558
Epoch 400, Loss: 0.7780984044075012
Best model

In [33]:
# Sanity check: reload the saved city C submission and display its (rows, columns) shape.
df = pd.read_csv('submit_file_GNN/mukumuku_cityC_humob24.csv')
df.shape

(449308, 5)