## Research Questions

-Identify the most popular hour of the day for picking up a shared electric car (Bluecar) in the city of Paris over the month of April 2018.

-


# Importing our Modules

In [1]:
# Importing pandas
#
import pandas as pd



# Importing our CSV files and creating our dataset.

In [2]:
url = "./dataset/autolib.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,Address,Cars,Bluecar counter,Utilib counter,Utilib 1.4 counter,Charge Slots,Charging Status,City,Displayed comment,ID,...,Scheduled at,Slots,Station type,Status,Subscription status,year,month,day,hour,minute
0,2 Avenue de Suffren,0,0,0,0,0,nonexistent,Paris,,paris-suffren-2,...,,2,station,ok,nonexistent,2018,4,8,11,43
1,145 Rue Raymond Losserand,6,6,0,0,0,operational,Paris,,paris-raymondlosserand-145,...,,0,station,ok,nonexistent,2018,4,6,7,24
2,2 Avenue John Fitzgerald Kennedy,3,3,0,2,0,operational,Le Bourget,,lebourget-johnfitzgeraldkennedy-2,...,,1,station,ok,nonexistent,2018,4,3,20,14
3,51 Rue EugÃ¨ne OudinÃ©,3,3,1,0,1,operational,Paris,,paris-eugeneoudine-51,...,,2,station,ok,nonexistent,2018,4,4,4,37
4,6 avenue de la Porte de Champerret,3,3,0,0,0,nonexistent,Paris,,paris-portedechamperret-6,...,,3,station,ok,nonexistent,2018,4,8,17,23


# View our data info

In [3]:
# Getting information from our dataset.
#
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Address              5000 non-null   object
 1   Cars                 5000 non-null   int64 
 2   Bluecar counter      5000 non-null   int64 
 3   Utilib counter       5000 non-null   int64 
 4   Utilib 1.4 counter   5000 non-null   int64 
 5   Charge Slots         5000 non-null   int64 
 6   Charging Status      5000 non-null   object
 7   City                 5000 non-null   object
 8   Displayed comment    111 non-null    object
 9   ID                   5000 non-null   object
 10  Kind                 5000 non-null   object
 11  Geo point            5000 non-null   object
 12  Postal code          5000 non-null   int64 
 13  Public name          5000 non-null   object
 14  Rental status        5000 non-null   object
 15  Scheduled at         47 non-null     object
 16  Slots 

In [4]:
# Describing our data allows us to get a general overview of how the data is.
#
df.describe()

Unnamed: 0,Cars,Bluecar counter,Utilib counter,Utilib 1.4 counter,Charge Slots,Postal code,Slots,year,month,day,hour,minute
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,2.3336,2.3336,0.0596,0.123,0.2534,82634.8784,1.9324,2018.0,4.0,4.9416,11.5092,29.27
std,2.035274,2.035274,0.246698,0.356506,0.546304,8835.865721,1.905402,0.0,0.0,2.597063,6.893549,17.231741
min,0.0,0.0,0.0,0.0,0.0,75001.0,0.0,2018.0,4.0,1.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,75012.0,0.0,2018.0,4.0,3.0,6.0,14.0
50%,2.0,2.0,0.0,0.0,0.0,75019.0,1.0,2018.0,4.0,5.0,11.0,29.0
75%,4.0,4.0,0.0,0.0,0.0,92320.0,3.0,2018.0,4.0,7.0,18.0,44.0
max,7.0,7.0,2.0,3.0,2.0,95880.0,7.0,2018.0,4.0,9.0,23.0,59.0


# Data Preparation/Cleaning


># 1.1 Validity

In [5]:
# Procedure 1:
# Data Cleaning Action: Droping the irrelevant Scheduled at column
# Explanation: We are droping it since its not useful in our analysis.
#
df_new = df.drop(['Scheduled at'], axis=1, inplace=True)
df_new

In [6]:
# Procedure 2:
# Data Cleaning Action: Droping the irrelevant Subscription status at column
# Explanation: We are droping it since its not useful in our analysis.
#
df_new = df.drop(['Subscription status'], axis=1, inplace=True)
df_new

In [7]:
# Procedure 3:
# Data Cleaning Action: Droping the irrelevant Subscription status at column
# Explanation: We are droping it since its not useful in our analysis.
#
df_new = df.drop(['Status'], axis=1, inplace=True)
df_new

In [8]:
# Procedure 3:
# Data Cleaning Action: Droping the Displayed comment column
# Explanation: We are droping it since its not useful in our analysis.
#
df_new = df.drop(['Displayed comment'], axis=1, inplace=True)
df_new

># 1.2 Accuracy

In [9]:
# Procedure 1:
# Data Cleaning Action: Checking for null values in our dataset.
# Explanation: This is to allows us to determin the columns that have null values.
# 
df_null = df.isnull().any()
df_null

Address               False
Cars                  False
Bluecar counter       False
Utilib counter        False
Utilib 1.4 counter    False
Charge Slots          False
Charging Status       False
City                  False
ID                    False
Kind                  False
Geo point             False
Postal code           False
Public name           False
Rental status         False
Slots                 False
Station type          False
year                  False
month                 False
day                   False
hour                  False
minute                False
dtype: bool

># 1.3 Completeness

In [10]:
# Procedure 1:
# Data Cleaning Action: Check for duplicate values.
# Explanation: This is to allow us determine data that is duplicated.
#
df_dup = df.duplicated().sum()
df_dup

0

># 1.4 Consistency

In [11]:
# Procedure 1:
# Data Cleaning Action: Dropping null values if they exist.
# Explanation: This is to allow us determine data that is duplicated.
#
df_new = df.dropna(how="all")
df_new.head()

Unnamed: 0,Address,Cars,Bluecar counter,Utilib counter,Utilib 1.4 counter,Charge Slots,Charging Status,City,ID,Kind,...,Postal code,Public name,Rental status,Slots,Station type,year,month,day,hour,minute
0,2 Avenue de Suffren,0,0,0,0,0,nonexistent,Paris,paris-suffren-2,STATION,...,75015,Paris/Suffren/2,operational,2,station,2018,4,8,11,43
1,145 Rue Raymond Losserand,6,6,0,0,0,operational,Paris,paris-raymondlosserand-145,STATION,...,75014,Paris/Raymond Losserand/145,operational,0,station,2018,4,6,7,24
2,2 Avenue John Fitzgerald Kennedy,3,3,0,2,0,operational,Le Bourget,lebourget-johnfitzgeraldkennedy-2,STATION,...,93350,Le Bourget/John Fitzgerald Kennedy/2,operational,1,station,2018,4,3,20,14
3,51 Rue EugÃ¨ne OudinÃ©,3,3,1,0,1,operational,Paris,paris-eugeneoudine-51,STATION,...,75013,Paris/EugÃ¨ne OudinÃ©/51,operational,2,station,2018,4,4,4,37
4,6 avenue de la Porte de Champerret,3,3,0,0,0,nonexistent,Paris,paris-portedechamperret-6,PARKING,...,75017,Paris/Porte de Champerret/6,operational,3,station,2018,4,8,17,23


># 1.5 Uniformity

In [12]:
# Procedure 1:
# Data Cleaning Action: Converting column names to lowercase characters.
# Explanation: This is to allow for a uniform columns in the dataframe.
#
df.columns = df.columns.str.rstrip().str.lower()
df.head(3)

Unnamed: 0,address,cars,bluecar counter,utilib counter,utilib 1.4 counter,charge slots,charging status,city,id,kind,...,postal code,public name,rental status,slots,station type,year,month,day,hour,minute
0,2 Avenue de Suffren,0,0,0,0,0,nonexistent,Paris,paris-suffren-2,STATION,...,75015,Paris/Suffren/2,operational,2,station,2018,4,8,11,43
1,145 Rue Raymond Losserand,6,6,0,0,0,operational,Paris,paris-raymondlosserand-145,STATION,...,75014,Paris/Raymond Losserand/145,operational,0,station,2018,4,6,7,24
2,2 Avenue John Fitzgerald Kennedy,3,3,0,2,0,operational,Le Bourget,lebourget-johnfitzgeraldkennedy-2,STATION,...,93350,Le Bourget/John Fitzgerald Kennedy/2,operational,1,station,2018,4,3,20,14


In [13]:
# Procedure 2:
# Data Cleaning Action: Converting row values to lowercase characters.
# Explanation: This is to allow for rows to have uniform values which are in lowercase in the dataframe.
#
df = df.astype(str).apply(lambda x: x.str.lower())
df.head(3)

Unnamed: 0,address,cars,bluecar counter,utilib counter,utilib 1.4 counter,charge slots,charging status,city,id,kind,...,postal code,public name,rental status,slots,station type,year,month,day,hour,minute
0,2 avenue de suffren,0,0,0,0,0,nonexistent,paris,paris-suffren-2,station,...,75015,paris/suffren/2,operational,2,station,2018,4,8,11,43
1,145 rue raymond losserand,6,6,0,0,0,operational,paris,paris-raymondlosserand-145,station,...,75014,paris/raymond losserand/145,operational,0,station,2018,4,6,7,24
2,2 avenue john fitzgerald kennedy,3,3,0,2,0,operational,le bourget,lebourget-johnfitzgeraldkennedy-2,station,...,93350,le bourget/john fitzgerald kennedy/2,operational,1,station,2018,4,3,20,14


In [14]:
df.columns = df.columns.str.replace(" ", "_", regex=True)
df.head(2)

Unnamed: 0,address,cars,bluecar_counter,utilib_counter,utilib_1.4_counter,charge_slots,charging_status,city,id,kind,...,postal_code,public_name,rental_status,slots,station_type,year,month,day,hour,minute
0,2 avenue de suffren,0,0,0,0,0,nonexistent,paris,paris-suffren-2,station,...,75015,paris/suffren/2,operational,2,station,2018,4,8,11,43
1,145 rue raymond losserand,6,6,0,0,0,operational,paris,paris-raymondlosserand-145,station,...,75014,paris/raymond losserand/145,operational,0,station,2018,4,6,7,24


In [15]:
# Splitting the geopoint column into two.
#


# Exporting our clean csv file.

In [16]:
df.to_csv("autolib_clean.csv")

# Answering Questions


#  *Using Blue cars*

> # *Challenge 1*
> 1. What is the most popular hour for returning cars?

In [17]:
# Challenge 1
# Finding the most popular hour for returning cars?.
#
url = "autolib_clean.csv"
df = pd.read_csv(url)
df_x = df[["bluecar_counter","day", "hour", "minute"]]
df_car = df_x[df_x["bluecar_counter"] == 0]
df_car['hour'].max()

23

> # *Challenge 2*
>  What station is the most popular?
      > * Overall?

In [18]:
# Challenge 2
# Finding the most popular station( Overall) from our dataset.
#
df_p = df.groupby(['address'])['bluecar_counter'].max().sort_values(ascending=False)

df_p.head(1)

address
194 rue de fontenay    7
Name: bluecar_counter, dtype: int64

> # *Challenge 3*
> 3. What station is the most popular?
      > * At the most popular picking hour?

In [19]:
# Challenge 3
# Finding the most popular station (At the most popular picking hour) from our dataset.
#
df_p = df.groupby(['address'])['hour'].max().sort_values(ascending=False)

df_p.head(1)

address
11 rue robert lavergne    23
Name: hour, dtype: int64

> # *Challenge 4*
  > 4. What postal code is the most popular for picking up Blue cars? Does the most popular station belong to that postal code?
   > * Overall?

In [20]:
# Challenge 4
# Finding the What postal code is the most popular for picking up Blue cars? 
# Does the most popular station belong to that postal code? from our dataset.
#
df_station = df[['address', 'postal_code', 'bluecar_counter']]
df_postal = df_station.sort_values('bluecar_counter', ascending=False)
df_postal.head(1)

Unnamed: 0,address,postal_code,bluecar_counter
1985,99 rue charles-de-gaulle,91330,7


 > # *Challenge 5*
   > * At the most popular picking hour?

In [21]:
# Challenge 5
# Finding the What postal code is the most popular for picking up Blue cars? 
# Does the most popular station belong to that postal code? from our dataset.
#
df_station = df[['address', 'postal_code', 'hour']]
df_postal = df_station.sort_values('hour', ascending=False)
df_postal.head(1)

Unnamed: 0,address,postal_code,hour
581,65 boulevard de picpus,75012,23


> # *Challenge 6*
> 5. Sorting the non-existent, operational and broken charging status in Descending Order.


In [22]:
# Challenge 4
# Finding the What charging status has the highest value for picking up Blue cars from our dataset.
#
df_city = df.groupby(['charging_status'])['bluecar_counter'].sum().sort_values(ascending=False)
df_city.head(16)

charging_status
nonexistent    7269
operational    4359
broken           40
Name: bluecar_counter, dtype: int64

> # Using Utilib counter and Utilib 1.4 counter

> # *Challenge 1*
>  What station is the most popular?
      > * Overall?

> a). utilib_counter

In [23]:
# Challenge 1.1
# Finding the most popular station( Overall) from our dataset.
#
df_p = df.groupby(['address'])['utilib_counter'].max().sort_values(ascending=False)
df_p.head(1)

address
161 avenue de suffren    2
Name: utilib_counter, dtype: int64

> b). utilib_1.4_counter

In [24]:
# Challenge 1.2
# Finding the most popular station( Overall) from our dataset.
#
df_p = df.groupby(['address'])['utilib_1.4_counter'].max().sort_values(ascending=False)
df_p.head(1)

address
123 grande rue    3
Name: utilib_1.4_counter, dtype: int64

> # *Challenge 2*
  > 2. What postal code is the most popular for picking up Utilib_counter and Utilib_1.4_counter? Does the most popular station belong to that postal code?
   > * Overall?

> b). utilib_1.4_counter

In [25]:
# Challenge 2.1
# Finding the What postal code is the most popular for picking up Blue cars? 
# Does the most popular station belong to that postal code? from our dataset.
#
df_station = df[['address', 'postal_code', 'utilib_counter']]
df_postal = df_station.sort_values('utilib_counter', ascending=False)
df_postal.head(1)

Unnamed: 0,address,postal_code,utilib_counter
3329,19 rue de chateaubriand,75008,2


> b). utilib_1.4_counter

In [26]:
# Challenge 2.2
# Finding the What postal code is the most popular for picking up Blue cars? 
# Does the most popular station belong to that postal code? from our dataset.
#
df_station = df[['address', 'postal_code', 'utilib_1.4_counter']]
df_postal = df_station.sort_values('utilib_1.4_counter', ascending=False)
df_postal.head(1)

Unnamed: 0,address,postal_code,utilib_1.4_counter
2731,123 grande rue,92310,3


> # *Challenge 3*
> 5. Sorting the non-existent, operational and broken charging status in Descending Order.


> b). utilib_1.4_counter

In [27]:
# Challenge 3.1
# Finding the What charging status has the highest value for picking up utilib cars from our dataset.
#
df_city = df.groupby(['charging_status'])['utilib_counter'].sum().sort_values(ascending=False)
df_city

charging_status
nonexistent    178
operational    120
broken           0
Name: utilib_counter, dtype: int64

> b). utilib_1.4_counter

In [28]:
# Challenge 4
# Finding the What charging status has the highest value for picking up Blue cars from our dataset.
#
df_city = df.groupby(['charging_status'])['utilib_1.4_counter'].sum().sort_values(ascending=False)
df_city

charging_status
nonexistent    363
operational    248
broken           4
Name: utilib_1.4_counter, dtype: int64

# Research Question

> * Identify the most popular hour of the day for picking up a shared electric car (Bluecar) in the city of Paris over the month of April 2018.*

In [29]:
df_p = df[['city', 'bluecar_counter', 'hour']]
df_paris = df[df['city'] == 'paris']
df_p = df_paris.groupby(['bluecar_counter'])['hour'].max().sort_values(ascending=False)
df_p

bluecar_counter
6    23
5    23
4    23
3    23
2    23
1    23
0    23
7    17
Name: hour, dtype: int64