In [1]:
import pandas as pd

# Location of the hourly data set, relative to this notebook.
PATH_FILE = '../Data/Hour.csv'

# Read the CSV into a DataFrame, then preview its first five rows
# (five is the default row count for .head()).
hour = pd.read_csv(PATH_FILE)
print(hour.head())


   instant      dteday  season  yr  mnth  hr  holiday  weekday  workingday  \
0        1  2011-01-01       1   0     1   0        0        6           0   
1        2  2011-01-01       1   0     1   1        0        6           0   
2        3  2011-01-01       1   0     1   2        0        6           0   
3        4  2011-01-01       1   0     1   3        0        6           0   
4        5  2011-01-01       1   0     1   4        0        6           0   

   weathersit  temp   atemp   hum  windspeed  casual  registered  count  
0           1  0.24  0.2879  0.81        0.0       3          13     16  
1           1  0.22  0.2727  0.80        0.0       8          32     40  
2           1  0.22  0.2727  0.80        0.0       5          27     32  
3           1  0.24  0.2879  0.75        0.0       3          10     13  
4           1  0.24  0.2879  0.75        0.0       0           1      1  


In [3]:
# 1. How many rows and columns are in the dataset?
# .shape holds a (number_of_rows, number_of_columns) tuple.
dataset_shape = hour.shape
print(dataset_shape)


(17379, 17)


In [4]:
# .mean() gives the arithmetic average: the sum of all the values divided by
# how many values there are.
# Example: the mean of [1, 2, 3, 4, 5] is (1+2+3+4+5)/5 = 3.0.
count_values = hour['count']
print(count_values.mean())


189.46308763450142


In [2]:
# .median() gives the middle value of the data once it is sorted.
# With an odd number of observations it is the single middle value; with an
# even number it is the average of the two middle values.
# Example: the median of [1, 2, 3, 4, 5] is 3, and the median of [1, 2, 3, 4]
# is (2+3)/2 = 2.5.
median_count = hour['count'].median()
print(median_count)

142.0


In [5]:
# .std() gives the standard deviation: a measure of how much the values in
# the data vary, i.e. how dispersed they are.
std_count = hour['count'].std()
print(std_count)

181.38759909186527


In [6]:
# .describe() summarises the data column by column: count, mean, standard
# deviation, minimum, 25th / 50th / 75th percentiles, and maximum.
summary_stats = hour.describe()
print(summary_stats)

          instant        season            yr          mnth            hr  \
count  17379.0000  17379.000000  17379.000000  17379.000000  17379.000000   
mean    8690.0000      2.501640      0.502561      6.537775     11.546752   
std     5017.0295      1.106918      0.500008      3.438776      6.914405   
min        1.0000      1.000000      0.000000      1.000000      0.000000   
25%     4345.5000      2.000000      0.000000      4.000000      6.000000   
50%     8690.0000      3.000000      1.000000      7.000000     12.000000   
75%    13034.5000      3.000000      1.000000     10.000000     18.000000   
max    17379.0000      4.000000      1.000000     12.000000     23.000000   

            holiday       weekday    workingday    weathersit          temp  \
count  17379.000000  17379.000000  17379.000000  17379.000000  17379.000000   
mean       0.028770      3.003683      0.682721      1.425283      0.496987   
std        0.167165      2.005771      0.465431      0.639357      0.

## Analyzing Subsets of Data

In [7]:
# .loc[] selects rows and columns by label (or with a boolean array).
# The selection is written inside square brackets as [<row>, <column>].
# Here: the value of the 'count' column in the row labelled 3.
count_at_row_3 = hour.loc[3, 'count']
print(count_at_row_3)

13


In [9]:
# Select values of the 'registered' column for rows 2 through 4.
# A label slice in .loc includes both endpoints, so 2:4 returns three
# values: 27, 10, and 1.
registered_rows_2_to_4 = hour.loc[2:4, 'registered']
print(registered_rows_2_to_4)

2    27
3    10
4     1
Name: registered, dtype: int64


In [11]:
# Mean of 'registered' for the rows whose 'hr' value is below 5:
#   1. build a boolean mask marking the rows where 'hr' < 5,
#   2. use .loc to keep only the 'registered' column of those rows,
#   3. take the mean of the values that remain.
hr_below_5 = hour['hr'] < 5
print(hour.loc[hr_below_5, 'registered'].mean())

20.7881427367238


In [None]:
# Filter on two conditions at once by joining them with & (element-wise AND):
# keep the rows where 'hr' is below 5 and 'temp' is below 0.50, select the
# 'count' column of those rows, and take its mean.
# Each condition needs its own parentheses because & binds tighter than <.
print(
    hour.loc[
        (hour['hr'] < 5) & (hour['temp'] < .50),
        'count',
    ].mean()
)

In [13]:
# Same idea as the previous cell, but the two conditions are joined with
# | (element-wise OR): a row is kept when 'temp' is above 0.5, or 'hum' is
# above 0.5, or both. The mean of 'count' is then taken over the kept rows.
temp_above_half = hour['temp'] > 0.5
hum_above_half = hour['hum'] > 0.5
print(hour.loc[temp_above_half | hum_above_half, 'count'].mean())

193.36764103264582


## Seasonal Data

Seasonal data refers to data that exhibits a recurring pattern or trend over a specific time period. This pattern repeats itself at regular intervals, such as daily, weekly, monthly, or yearly. Seasonal data is commonly observed in various domains, including sales, weather, stock market, and tourism. Analyzing seasonal data helps identify and understand the underlying patterns, trends, and seasonality effects, which can be useful for forecasting, planning, and decision-making.
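In this dataset, the season is recorded in the `season` column (values 1–4), so grouping by that column lets us compare the average `count` from one season to the next.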

In [14]:
# groupby() splits the rows of `hour` into groups that share the same
# 'season' value; the mean of 'count' is then computed within each group.
count_by_season = hour.groupby(['season'])['count']
print(count_by_season.mean())


season
1    111.114569
2    208.344069
3    236.016237
4    198.868856
Name: count, dtype: float64


In [16]:
# Grouping by two columns, 'season' and 'holiday', produces one group per
# (season, holiday) combination; the mean of 'count' is reported for each.
group_keys = ['season', 'holiday']
print(hour.groupby(group_keys)['count'].mean())

season  holiday
1       0          112.685875
        1           72.042683
2       0          208.428472
        1          204.552083
3       0          235.976818
        1          237.822917
4       0          199.965998
        1          167.722222
Name: count, dtype: float64


## Visualizing Data with Matplotlib

### Drawing and Displaying a Simple Plot

In [None]:
import matplotlib.pyplot as plt

# plt.subplots() creates a new figure with a single subplot and returns two
# objects: fig (the whole figure) and ax (the subplot).
# figsize=(10, 6) makes the figure 10 inches wide and 6 inches tall.
fig, ax = plt.subplots(figsize=(10, 6))

# Draw a scatter plot on ax: the first argument is the x-axis data and the
# second is the y-axis data, so the 'instant' column of `hour` goes on the
# x-axis and its 'count' column on the y-axis.
ax.scatter(hour['instant'], hour['count'])

# Display the plot.
plt.show()

### Clarifying Plots with Titles and Labels