In [1]:
import pandas as pd
import numpy as np

In [2]:
# import data set
df_la = pd.read_csv("la_2019.csv", parse_dates=["start_time","end_time"], low_memory=False)

In [3]:
# data overview
df_la.head(3)

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name
0,2019-01-01 00:07:00,2019-01-01 00:14:00,3046,3051,6468,Walk-up,2nd & Hill,7th & Broadway
1,2019-01-01 00:08:00,2019-01-01 00:14:00,3046,3051,12311,Walk-up,2nd & Hill,7th & Broadway
2,2019-01-01 00:18:00,2019-01-01 00:50:00,3030,3075,5992,Walk-up,Main & 1st,Broadway & 9th


In [4]:
print(df_la.count())
print(df_la.info())

start_time            290342
end_time              290342
start_station_id      290342
end_station_id        290342
bike_id               290342
user_type             290342
start_station_name    290342
end_station_name      290342
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290342 entries, 0 to 290341
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   start_time          290342 non-null  datetime64[ns]
 1   end_time            290342 non-null  datetime64[ns]
 2   start_station_id    290342 non-null  int64         
 3   end_station_id      290342 non-null  int64         
 4   bike_id             290342 non-null  object        
 5   user_type           290342 non-null  object        
 6   start_station_name  290342 non-null  object        
 7   end_station_name    290342 non-null  object        
dtypes: datetime64[ns](2), int64(2), object(4)
memory usage: 17.7+ MB
None


In [5]:
# check if there are null values
df_la.isnull()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
290337,False,False,False,False,False,False,False,False
290338,False,False,False,False,False,False,False,False
290339,False,False,False,False,False,False,False,False
290340,False,False,False,False,False,False,False,False


In [6]:
# drop rows which contain missing values
df_la.dropna(axis=0, inplace=True)

In [7]:
df_la["bike_id"].size

290342

In [8]:
# add the duration of the trip
df_la["duration"] = (df_la["end_time"] - df_la["start_time"]).astype("timedelta64[m]")

In [9]:
# remove data with same start and end station, that are shorter or equal to one minute
df_la = df_la[~((df_la["start_station_name"] == df_la["end_station_name"]) & (df_la["duration"] <= 1.0))]

In [10]:
df_la["bike_id"].size

284417

In [11]:
# remove data with same start and end station, that are longer or equal to 24h
df_la = df_la[ ~ (df_la["duration"] >= 1440.0) ]

In [12]:
df_la["bike_id"].size

283135

In [13]:
df_la["user_type"].unique()

array(['Walk-up', 'Monthly Pass', 'Annual Pass', 'One Day Pass',
       'Flex Pass', 'Testing'], dtype=object)

In [14]:
wu = df_la[df_la["user_type"]=="Walk-up"]
print(wu["user_type"].size)
mp = df_la[df_la["user_type"]=="Monthly Pass"]
print(mp["user_type"].size)
ap = df_la[df_la["user_type"]=="Annual Pass"]
print(ap["user_type"].size)
odp = df_la[df_la["user_type"]=="One Day Pass"]
print(odp["user_type"].size)
fp = df_la[df_la["user_type"]=="Flex Pass"]
print(fp["user_type"].size)
test = df_la[df_la["user_type"]=="Testing"]
print(test["user_type"].size)
77857

77857
169677
21078
14128
313
82


77857

In [15]:
# new columns 
df_la["date"] = df_la["start_time"].apply(lambda x: x.date())
df_la['month']= df_la["start_time"].apply(lambda x: x.month)
df_la["weekday"]= df_la["start_time"].apply(lambda x: x.weekday())
df_la["day"] = df_la["start_time"].apply(lambda x: x.day)
df_la["hour"]= df_la["start_time"].apply(lambda x: x.hour)
df_la.head(3)

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name,duration,date,month,weekday,day,hour
0,2019-01-01 00:07:00,2019-01-01 00:14:00,3046,3051,6468,Walk-up,2nd & Hill,7th & Broadway,7.0,2019-01-01,1,1,1,0
1,2019-01-01 00:08:00,2019-01-01 00:14:00,3046,3051,12311,Walk-up,2nd & Hill,7th & Broadway,6.0,2019-01-01,1,1,1,0
2,2019-01-01 00:18:00,2019-01-01 00:50:00,3030,3075,5992,Walk-up,Main & 1st,Broadway & 9th,32.0,2019-01-01,1,1,1,0


In [16]:
# saving data set
df_la.to_csv("prepared_data.csv")

In [17]:
df_weather = pd.read_csv("weather_hourly_la.csv" ,parse_dates=["date_time"])
df_weather.head(3)

Unnamed: 0,date_time,max_temp,min_temp,precip
0,2015-01-02 01:00:00,11.7,11.7,0.0
1,2015-01-02 02:00:00,11.1,11.1,0.0
2,2015-01-02 03:00:00,11.1,11.1,0.0


In [18]:
df_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43848 entries, 0 to 43847
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date_time  43756 non-null  datetime64[ns]
 1   max_temp   43756 non-null  float64       
 2   min_temp   43756 non-null  float64       
 3   precip     43758 non-null  float64       
dtypes: datetime64[ns](1), float64(3)
memory usage: 1.3 MB


In [19]:
# keep only data from 2019
df_weather = df_weather[df_weather["date_time"].apply(lambda x: x.year == 2019)]
df_weather = df_weather.dropna()
df_weather.head(3)

Unnamed: 0,date_time,max_temp,min_temp,precip
35040,2019-01-01 01:00:00,15.6,15.6,0.0
35041,2019-01-01 02:00:00,15.0,15.0,0.0
35042,2019-01-01 03:00:00,15.0,15.0,0.0


In [20]:
# df_weather["temp"] = (df_weather["max_temp"]+df_weather["min_temp"])/2

In [21]:
# add month, day, hour
df_weather["date"] = df_weather["date_time"].apply(lambda x:x.date())
df_weather["month"] = df_weather["date_time"].apply(lambda x:x.month)
df_weather["day"] = df_weather["date_time"].apply(lambda x:x.day)
df_weather["hour"] = df_weather["date_time"].apply(lambda x:x.hour)
df_weather.sort_values(by="date_time")
df_weather.head(1)

Unnamed: 0,date_time,max_temp,min_temp,precip,date,month,day,hour
35040,2019-01-01 01:00:00,15.6,15.6,0.0,2019-01-01,1,1,1


In [22]:
# saving data set
df_weather.to_csv("prepared_data_weather.csv")

In [23]:
# merge weather and bikesharing data
merged_data = df_la.merge(df_weather[["max_temp","min_temp","precip","date","month","day","hour"]],left_on=["date","month","day","hour"],right_on=["date","month","day","hour"])
merged_data.head(1)

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name,duration,date,month,weekday,day,hour,max_temp,min_temp,precip
0,2019-01-01 00:07:00,2019-01-01 00:14:00,3046,3051,6468,Walk-up,2nd & Hill,7th & Broadway,7.0,2019-01-01,1,1,1,0,15.6,15.6,0.0


In [24]:
# saving data set
merged_data.to_csv("merged_data.csv")