-
Notifications
You must be signed in to change notification settings - Fork 1
/
Mahyar_Bixi_Part1_SQL.sql
294 lines (235 loc) · 12.4 KB
/
Mahyar_Bixi_Part1_SQL.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
/*
Mahyar Sabouniaghdam
Bixi Project Deliverable 1
2022-11-18
mahyar_sabooni@yahoo.com
*/
-- ------------------------------------------------------ QUESTION 1 --------------------------------------------------------------
-- 1.1
SELECT COUNT(YEAR(start_date)) AS number_of_2016_trips
FROM bixi.trips
WHERE YEAR(start_date) = 2016;
-- 1.2
SELECT COUNT(YEAR(start_date)) AS number_of_2017_trips
FROM bixi.trips
WHERE YEAR(start_date) = 2017;
-- 1.3
SELECT MONTH(start_date) AS months_of_2016, COUNT(*) AS number_of_trips
FROM (
SELECT *, YEAR(start_date)
FROM bixi.trips
WHERE YEAR(start_date) = 2016 ) AS 2016_trips
GROUP BY months_of_2016;
-- 1.4
SELECT MONTH(start_date) AS months_of_2017, COUNT(*) AS number_of_trips
FROM (
SELECT *, YEAR(start_date)
FROM bixi.trips
WHERE YEAR(start_date) = 2017 ) AS 2017_trips
GROUP BY months_of_2017;
-- 1.5
SELECT year(start_date) AS year_, MONTH(start_date) AS month_, MAX(DAY(start_date)) AS number_of_active_days, CAST(( COUNT(*) / MAX(DAY(start_date))) AS DECIMAL(12,2)) AS average_number_of_trips_per_day
FROM (
SELECT *, YEAR(start_date)
FROM bixi.trips
WHERE YEAR(start_date) = 2016 )
AS 2016_trips
GROUP BY month_
UNION
SELECT year(start_date) AS year_, MONTH(start_date) AS month_, MAX(DAY(start_date)) AS number_of_active_days, CAST(( COUNT(*) / MAX(DAY(start_date))) AS DECIMAL(12,2)) AS average_number_of_trips_per_day
FROM (
SELECT *, YEAR(start_date)
FROM bixi.trips
WHERE YEAR(start_date) = 2017 )
AS 2017_trips
GROUP BY month_;
-- In this question, on the 11th month of the year (November), for both 2016 and 2017, 15 days are considered in the dataset.
-- To make average calculation more accurate, I decided to consider 15 days for the count of days on November.
-- I reach to the number of days on each month by writing MAX query to make the number of days dynamic.
-- 1.6
CREATE TABLE IF NOT EXISTS working_table1
( id INT AUTO_INCREMENT, PRIMARY KEY(id), number_of_active_days TINYINT, average_number_of_trips_per_day DECIMAL(12,2)) AS
SELECT *
FROM (
SELECT year(start_date) AS year_, MONTH(start_date) AS month_, MAX(DAY(start_date)) AS number_of_active_days, CAST(( COUNT(*) / MAX(DAY(start_date))) AS DECIMAL(12,2)) AS average_number_of_trips_per_day
FROM (
SELECT *, YEAR(start_date)
FROM bixi.trips
WHERE YEAR(start_date) = 2016 )
AS 2016_trips
GROUP BY month_
UNION
SELECT year(start_date) AS year_, MONTH(start_date) AS month_, MAX(DAY(start_date)) AS number_of_active_days, CAST(( COUNT(*) / MAX(DAY(start_date))) AS DECIMAL(12,2)) AS average_number_of_trips_per_day
FROM (
SELECT *, YEAR(start_date)
FROM bixi.trips
WHERE YEAR(start_date) = 2017 )
AS 2017_trips
GROUP BY month_) AS year_month_trip_table;
-- ------------------------------------------------------ QUESTION 2 --------------------------------------------------------------
-- 2.1
SELECT YEAR(start_date), is_member, COUNT(*) AS total_number_of_trips
FROM (
SELECT *, YEAR(start_date)
FROM bixi.trips
WHERE YEAR(start_date) = 2017 )
AS 2017_trips
GROUP BY is_member;
-- 2.2
WITH t1 AS
(
SELECT MONTH(start_date) AS months_of_2017, COUNT(*) AS number_of_trips_for_members
FROM (
SELECT *, YEAR(start_date)
FROM bixi.trips
WHERE YEAR(start_date) = 2017 AND is_member = 1 )
AS 2017_trips
GROUP BY months_of_2017 )
SELECT *, CONCAT( ( number_of_trips_for_members * 100 / (SELECT SUM(number_of_trips_for_members) FROM t1)), ' ', '%' ) AS percentage_of_trips_for_members
FROM t1
GROUP BY months_of_2017;
-- ------------------------------------------------------ QUESTION 3 --------------------------------------------------------------
-- 3.1 on JULY (7th month) demand for Bixi bikes are at its peak with 17.38 %
WITH t1 AS
(
SELECT MONTH(start_date) AS months_of_2017, COUNT(*) AS number_of_trips_for_members
FROM (
SELECT *, YEAR(start_date)
FROM bixi.trips
WHERE YEAR(start_date) = 2017 AND is_member = 1 )
AS 2017_trips
GROUP BY months_of_2017
ORDER BY COUNT(*) DESC )
SELECT *, CONCAT( ( number_of_trips_for_members * 100 / (SELECT SUM(number_of_trips_for_members) FROM t1)), ' ', '%' ) AS percentage_of_trips_for_members
FROM t1
GROUP BY months_of_2017;
-- 3.2
-- I am going to offer my promotion to non-members on the 4th and 5th months of the year.
-- Since we have our peak at rhe month 7, it is a good opportunity to encourage people to use Bixi bikes on 4th and 5th months of the year when the weather is going to get warm.
-- (Knowing that because of the cold weather, they did not have the opportunity to ride a bike for couple of months)
-- And after that try to convert them to members before the peak season on months 7 and 8.
-- The promotion can be three partial discounts in one week (for example 20%, 15% and 10%) for their first three rides in one week,
-- And after that winning a free ride if they become a member.
-- This type of promotion would increase the total number of users at the beginning of the season when the demand is low.
-- In this way people are encouraged to try Bixi bikes and after that with one free ride they are more likely to become a member before seasonal peak.
-- ------------------------------------------------------ QUESTION 4 --------------------------------------------------------------
-- 4.1
SELECT trips.start_station_code, stations.name, COUNT(*) AS number_of_trips_started_form_the_station
FROM bixi.trips
JOIN bixi.stations
ON trips.start_station_code = stations.code
GROUP BY start_station_code
ORDER BY COUNT(*) DESC
LIMIT 5;
-- 4.2
SELECT start_station_code, stations.name, number_of_trips_started_form_the_station
FROM (
SELECT start_station_code, COUNT(*) AS number_of_trips_started_form_the_station
FROM trips
GROUP BY start_station_code
ORDER BY COUNT(*) DESC
LIMIT 5) AS top_5_starting_stations
JOIN stations
ON top_5_starting_stations.start_station_code = stations.code;
-- Q1 has 20 sec runtime but Q2 has 9 sec. We saw that by using a subquery the runtime became even less than half.
-- The reason is because we are joining fewer records when using subquery. We are somehow downsizing the table
-- and decreasing the number of records in the first place before performing the JOIN statement.
-- So, fewer records will take part in the JOIN statement leading to decreasing the runtime.
-- ------------------------------------------------------ QUESTION 5 --------------------------------------------------------------
-- 5.1
-- start_station
SELECT start_station_code, COUNT(*) AS number_of_starting_trips_from_Mackay , CASE
WHEN HOUR(start_date) BETWEEN 7 AND 11 THEN "morning"
WHEN HOUR(start_date) BETWEEN 12 AND 16 THEN "afternoon"
WHEN HOUR(start_date) BETWEEN 17 AND 21 THEN "evening"
ELSE "night"
END AS "time_of_day"
FROM trips
WHERE start_station_code = 6100
GROUP BY time_of_day
ORDER BY number_of_starting_trips_from_Mackay DESC;
-- end station
SELECT end_station_code, COUNT(*) AS number_of_ending_trips_from_Mackay , CASE
WHEN HOUR(start_date) BETWEEN 7 AND 11 THEN "morning"
WHEN HOUR(start_date) BETWEEN 12 AND 16 THEN "afternoon"
WHEN HOUR(start_date) BETWEEN 17 AND 21 THEN "evening"
ELSE "night"
END AS "time_of_day"
FROM trips
WHERE end_station_code = 6100
GROUP BY time_of_day
ORDER BY number_of_ending_trips_from_Mackay DESC;
-- 5.2
-- By exploring the google map, we can see that Mackay / de Maisonneuve station is one of the busiest places and streets in Montreal.
-- The Montreal Museum of Fine Arts, Concordia University, Hall Building Auditorium and many popular buildings and offices are located here.
-- In the dataset we can see that the number of starting trips in the mornings are relatively low comparing to ending trips.
-- I expect that because people are traveling all over the city to Mackay, so low starting trips and high ending trips is reasonable in Mackay.
-- In the evenings and at nights the number of starting trips from here are more than ending trips. The reason is obvious.
-- Because people are going back to their places and homes from Mackay station.
-- In the afternoons the number of starting trips and ending trips are the same because people maybe go for lunch or commute near that neighborhood for meetings and then return back to their works.
-- In total, the number of both starting and ending trips in evenings and afternoons are higher than mornings and nights.
-- I think it is maybe because of the weather conditions. evenings and afternoons have warmer weather than nights and early mornings,
-- especially in a city like Montreal with cold weather.
-- Another reason is that evenings and afternoons are rush hours and people commute more, especially in a busy place like Mackay.
-- ------------------------------------------------------ QUESTION 6 --------------------------------------------------------------
-- 6.1
SELECT start_station_code, COUNT(*) AS number_of_starting_trips
FROM trips
GROUP BY start_station_code;
-- 6.2
SELECT start_station_code AS station_code, COUNT(*) AS number_of_round_trips
FROM trips
WHERE start_station_code = end_station_code
GROUP BY start_station_code;
-- 6.3
SELECT total_round_trips.station_code, number_of_starting_trips, number_of_round_trips, (number_of_round_trips / number_of_starting_trips ) AS fraction_of_round_trips_to_total_starting_trips
FROM (
SELECT start_station_code, COUNT(*) AS number_of_starting_trips
FROM trips
GROUP BY start_station_code ) AS total_trips
JOIN (
SELECT start_station_code AS station_code, COUNT(*) AS number_of_round_trips
FROM trips
WHERE start_station_code = end_station_code
GROUP BY start_station_code ) AS total_round_trips
ON total_trips.start_station_code = total_round_trips.station_code;
-- 6.4
SELECT total_round_trips.station_code, number_of_starting_trips, number_of_round_trips, (number_of_round_trips / number_of_starting_trips ) AS fraction_of_round_trips_to_total_starting_trips
FROM (
SELECT start_station_code, COUNT(*) AS number_of_starting_trips
FROM trips
GROUP BY start_station_code ) AS total_trips
JOIN (
SELECT start_station_code AS station_code, COUNT(*) AS number_of_round_trips
FROM trips
WHERE start_station_code = end_station_code
GROUP BY start_station_code ) AS total_round_trips
ON total_trips.start_station_code = total_round_trips.station_code
WHERE number_of_starting_trips >= 500 AND (number_of_round_trips / number_of_starting_trips ) >= 0.1;
-- 6.5
-- As it can be seen below, Question 6.5 query, stations Métro Jean-Drapeau, Métro Angrignon, Berlioz, LaSalle and Basile-Routhier are
-- top 5 stations with highest fraction of round trips to starting trips. This means that in these stations the number of people
-- who had round trips compared to starting trips were relatively high. By examining these stations we will find out that they are all
-- Train stations. Now it is reasonable because it can be understood that people get out of the station,
-- ride Bixi bikes to their destination (their workplace for example), ride back from the destination to the train station,
-- and then go back to their homes by the train again. It is rational that people are using trains for long distances and Bixi bikes for shorter distances.
-- So, main Metro stations have the highest number of round trips because so many people are using Bixi bikes for round trips starting
-- from train stations for short distances and then they get back to train stations to use trains for traveling long distances.
SELECT fraction_table.* , stations.name
FROM (
SELECT total_round_trips.station_code, number_of_starting_trips, number_of_round_trips, (number_of_round_trips / number_of_starting_trips ) AS fraction_of_round_trips_to_total_starting_trips
FROM (
SELECT start_station_code, COUNT(*) AS number_of_starting_trips
FROM trips
GROUP BY start_station_code ) AS total_trips
JOIN (
SELECT start_station_code AS station_code, COUNT(*) AS number_of_round_trips
FROM trips
WHERE start_station_code = end_station_code
GROUP BY start_station_code ) AS total_round_trips
ON total_trips.start_station_code = total_round_trips.station_code
WHERE number_of_starting_trips >= 500 AND (number_of_round_trips / number_of_starting_trips ) >= 0.1
) AS fraction_table
JOIN stations
ON fraction_table.station_code = stations.code
ORDER BY fraction_of_round_trips_to_total_starting_trips DESC;