In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import ast
import time

In [3]:
data_path = '/content/drive/My Drive/big-data-analytics-2025/dtw_test.csv'
source_df = pd.read_csv(data_path)

In [4]:

def dtw(x, y):

    x = np.array(x)
    y = np.array(y)

    n = len(x)
    m = len(y)

    dtw_matrix = np.full((n + 1, m + 1), np.inf)

    dtw_matrix[0, 0] = 0

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = np.linalg.norm(x[i-1] - y[j-1])
            prev_min = min(dtw_matrix[i-1, j], dtw_matrix[i, j-1], dtw_matrix[i-1, j-1])
            dtw_matrix[i, j] = cost + prev_min

    return dtw_matrix[n, m]



In [5]:
source_df['series_a'] = source_df['series_a'].apply(ast.literal_eval)
source_df['series_b'] = source_df['series_b'].apply(ast.literal_eval)

In [6]:


start_time = time.time()
dtw_distances = []
for idx, row in source_df.iterrows():
    dtw_distance = dtw(row['series_a'], row['series_b'])
    dtw_distances.append(dtw_distance)

end_time = time.time()
source_df['DTW distance'] = dtw_distances

total_time_minutes = (end_time - start_time) / 60
avg_time_per_pair_minutes = total_time_minutes / len(source_df)

print(f"Total execution time: {total_time_minutes:.2f} minutes")
print(f"Average time per pair: {avg_time_per_pair_minutes:.4f} minutes")
print(f"Number of pairs processed: {len(source_df)}")

print("\nFirst few results:")
print(source_df[['id', 'DTW distance']].head())

Total execution time: 47.09 minutes
Average time per pair: 0.0470 minutes
Number of pairs processed: 1002

First few results:
   id  DTW distance
0   0       18.8752
1   1       16.4397
2   2       12.0723
3   3       17.0150
4   4        8.7369


In [7]:
source_df[['id', 'DTW distance']].to_csv('dtw.csv', index=False)

In [8]:
source_df[['id', 'DTW distance']].to_csv('/content/drive/My Drive/dtw.csv', index=False)