In [1]:
import pandas as pd
import glob
from datetime import datetime
import time

#### Preprocessing of the raw collected data into a table in which each row represents one period of time that a bike spent at the same location (station)

In [2]:
# Semicolon-separated, gzip-compressed export of the cleaned nextbike records
# for 2020-10 (compression is inferred by pandas from the ".gz" suffix).
url = "/bigdata/jelicicna/output_datasets/cleaned_data_nextbike_2020-10.csv.gz"
df = pd.read_csv(
    url,
    sep=';',
    parse_dates=True,
    low_memory=False,
)

In [3]:
# strptime layout of the timestamp text once fractional seconds are removed,
# e.g. "2020-10-01 00:00:02".
fmt = "%Y-%m-%d %H:%M:%S"

In [4]:
def parse_timestamp(el):
    """Convert a timestamp string into a ``datetime``.

    Parameters
    ----------
    el : str or datetime
        Text laid out as described by the module-level ``fmt``, optionally
        followed by a ``.`` and a fractional-second part, which is discarded.
        A value that is already a ``datetime`` (``pandas.Timestamp`` is a
        subclass of it) is returned unchanged.

    Returns
    -------
    datetime
        The parsed timestamp, or ``el`` itself when it was already parsed.

    Raises
    ------
    ValueError
        If the text before the first ``.`` does not match ``fmt``.
    """
    # Already parsed: this lets the cell that maps the function over the time
    # columns be re-run without failing on ``.split``.
    if isinstance(el, datetime):
        return el
    # ``fmt`` has no ``%f`` directive, so keep only the part before the
    # fractional seconds.
    whole_seconds = el.split('.')[0]
    return datetime.strptime(whole_seconds, fmt)

In [5]:
# Turn the textual begin/end timestamps into datetime values, column by column.
for time_column in ('time_begin', 'time_end'):
    df[time_column] = df[time_column].map(parse_timestamp)

In [6]:
# Summarize the loaded frame: per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99846 entries, 0 to 99845
Data columns (total 25 columns):
lat                   99846 non-null float64
lng                   99846 non-null float64
time_begin            99846 non-null datetime64[ns]
time_end              99846 non-null datetime64[ns]
name                  99846 non-null object
address               621 non-null object
bike                  99846 non-null bool
bike_racks            99846 non-null int64
bikes                 99846 non-null int64
booked_bikes          99846 non-null int64
free_racks            99846 non-null int64
free_special_racks    99846 non-null int64
maintenance           99846 non-null bool
number                99846 non-null int64
place_type            99846 non-null int64
rack_locks            99846 non-null bool
special_racks         99846 non-null int64
spot                  99846 non-null bool
terminal_type         0 non-null float64
uid                   99846 non-null int64
city           

In [7]:
# Empty frame that fixes the column layout of the parked-time table.
new_df = pd.DataFrame(
    columns=[
        # where and when the bike was parked
        'lat', 'lng',
        'parking_begin', 'parking_end', 'parked_time',
        # identification of the record
        'uid', 'name', 'address',
        # attributes carried over from the source rows
        'available_bikes', 'maintenance', 'bike',
        'bike_racks', 'bikes', 'booked_bikes',
        'free_racks', 'free_special_racks',
        'number', 'place_type', 'rack_locks',
        'special_racks', 'spot', 'terminal_type',
        # operator and locale information
        'city', 'country_code', 'company', 'timezone',
    ]
)

In [8]:
# Build the whole parked-time table in one vectorized pass instead of appending
# one row per iteration: DataFrame.append copies the entire frame on every call
# (quadratic in the number of rows) and no longer exists in pandas >= 2.0.

# Truncate both ends to whole seconds before subtracting.
begin_s = pd.to_datetime(df['time_begin']).dt.floor('s')
end_s = pd.to_datetime(df['time_end']).dt.floor('s')

# Parked duration in minutes, rounded to a whole number (half-to-even, like the
# built-in round()). Subtracting the timestamps directly keeps the result
# independent of the kernel's local timezone; going through time.mktime would
# shift any period that spans a daylight-saving change by an hour.
parked_minutes = ((end_s - begin_s).dt.total_seconds() / 60).round(0)

# Column layout comes from the empty frame created earlier; selecting with
# .loc raises KeyError if the source frame lacks one of the expected columns.
column_order = list(new_df.columns)
new_df = (
    df.rename(columns={'time_begin': 'parking_begin', 'time_end': 'parking_end'})
      .assign(parked_time=parked_minutes)
      .loc[:, column_order]
      .reset_index(drop=True)
)


In [9]:
# Preview the first rows of the reshaped table.
new_df.head()

Unnamed: 0,lat,lng,parking_begin,parking_end,parked_time,uid,name,address,available_bikes,maintenance,...,number,place_type,rack_locks,special_racks,spot,terminal_type,city,country_code,company,timezone
0,50.91756,6.940145,2020-10-01 00:00:02,2020-10-01 12:49:02,769.0,37174790,BIKE 22754,,1275,False,...,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin
1,50.904742,6.96317,2020-10-01 15:01:02,2020-10-01 15:01:02,0.0,37199571,BIKE 22754,,1310,False,...,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin
2,50.904745,6.963179,2020-10-01 15:02:03,2020-10-01 15:09:02,7.0,37199571,BIKE 22754,,1308,False,...,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin
3,50.904874,6.962288,2020-10-01 15:10:02,2020-10-01 15:10:02,0.0,37206818,BIKE 22754,,1309,False,...,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin
4,50.904932,6.962707,2020-10-01 15:11:02,2020-10-01 17:58:02,167.0,37206870,BIKE 22754,,1309,False,...,0,12,False,0,False,,Köln,DE,Nextbike,Europe/Berlin


In [10]:
# Change the date in the output name when processing a different month.
new_df.to_csv(
    '/bigdata/jelicicna/output_datasets/parked_time_data_nextbike_2020-10.csv',
    sep=';',
    index=False,
)