## Python 2

In [7]:
import pandas as pd
import time
import datetime

In [8]:
# Load the accelerometer readings (columns shown in the preview: time, pid, x, y, z)
# and display the first five rows as a sanity check.
my_data = pd.read_csv(filepath_or_buffer="all_accelerometer_data_pids_13.csv")
my_data.head(5)

Unnamed: 0,time,pid,x,y,z
0,0,JB3156,0.0,0.0,0.0
1,0,CC6740,0.0,0.0,0.0
2,1493733882409,SA0297,0.0758,0.0273,-0.0102
3,1493733882455,SA0297,-0.0359,0.0794,0.0037
4,1493733882500,SA0297,-0.2427,-0.0861,-0.0163


In [9]:
# (rows, columns) of the loaded frame.
my_data.shape

(14057567, 5)

In [10]:
# `df` is a second name for the same DataFrame object as `my_data`
# (plain assignment, no copy is made), so the shape is identical.
df = my_data
df.shape

(14057567, 5)

In [11]:
def change_time(dataframe):
    dataframe2 = dataframe['time'].apply(lambda x: pd.to_datetime(x, unit='ns'))
    return dataframe2

### Without Multiprocessing

In [None]:
# Baseline: wall-clock time of the conversion run directly in the notebook process.
start_time = time.time()
df2 = change_time(df)
end_time = time.time()

time_elapsed = end_time - start_time
print('The runtime without multiprocess is %s seconds' % time_elapsed)

### With Multiprocessing (one child process)

In [12]:
import multiprocessing as mp
from multiprocessing import Pool
from multiprocessing import cpu_count
import numpy as np

In [13]:
# Prepare (but do not yet start) one child process that will call
# change_time on the full frame.
time_process = mp.Process(target=change_time, kwargs={'dataframe': df})

In [14]:
# Time one run of change_time inside a single child process.
# join() returns only once the child has exited, so no terminate() call is
# needed afterwards. The value returned by change_time in the child is not
# sent back to the notebook; only the duration is measured here.
# A Process object can be started only once: re-create time_process before
# running this cell again.
start_time = time.time()
time_process.start()
time_process.join()
end_time = time.time()
time_elapsed = end_time-start_time
print('The runtime with multiprocess is %s seconds' %(time_elapsed))

The runtime with multiprocess is 2170.46045804 seconds


### Multiprocessing with a Pool

In [15]:
num_partitions = 10 #number of partitions to split dataframe
num_cores = 4 #number of cores on your machine

def parallelize_dataframe(df, func, partitions=None, cores=None):
    """Apply ``func`` to row-wise chunks of ``df`` in a worker pool.

    The frame is cut into ``partitions`` consecutive row slices (sizes differ
    by at most one row, larger slices first), each slice is passed to
    ``func`` in a pool of ``cores`` worker processes, and the per-slice
    results are concatenated in their original order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Picklable callable taking one chunk and returning a pandas object
        that ``pd.concat`` can combine.
    partitions : int, optional
        Number of chunks; defaults to the module-level ``num_partitions``.
    cores : int, optional
        Number of worker processes; defaults to the module-level ``num_cores``.

    Returns
    -------
    pandas object
        ``pd.concat`` of the results of ``func`` over all chunks.

    Raises
    ------
    ValueError
        If ``partitions`` is smaller than 1.
    """
    if partitions is None:
        partitions = num_partitions
    if cores is None:
        cores = num_cores
    if partitions < 1:
        raise ValueError('number of partitions must be larger than 0')

    # Slice by position with iloc instead of calling np.array_split on the
    # DataFrame, which goes through the deprecated DataFrame.swapaxes.
    base_size, remainder = divmod(len(df), partitions)
    chunks = []
    start = 0
    for position in range(partitions):
        stop = start + base_size + (1 if position < remainder else 0)
        chunks.append(df.iloc[start:stop])
        start = stop

    pool = Pool(cores)
    try:
        results = pool.map(func, chunks)
    finally:
        # Always shut the workers down, even when func raises in a worker.
        pool.close()
        pool.join()
    return pd.concat(results)

# Time the pooled conversion end to end: splitting the frame, starting the
# pool and concatenating the results are all part of the measurement.
start_time = time.time()
parallelize_dataframe(df, change_time)  # result is discarded; only the duration is used
end_time = time.time()

time_elapsed = end_time - start_time
print('The runtime with multiprocess and pool is %s seconds' % time_elapsed)

The runtime with multiprocess and pool is 977.007727146 seconds
