# Exercise 01. Basic operations:

#### - Turn-in directory: `ex01/`.
#### - Files to turn in: `basic_operations.ipynb`.
#### - Allowed functions: `import pandas as pd`.

In [1]:
import pandas as pd

####
## 1. Create a dataframe called `views` with two columns: `datetime` and `user` by reading `feed-views.log`:
   - Convert the `datetime` to the `datetime64[ns]` `Dtype`.
   - Extract the year, month, day, hour, minute, and second from the values of the `datetime` column to new columns.

In [2]:
# The log is tab-separated and the column names are supplied explicitly,
# so no line of the file is consumed as a header.
file_path = '../data/feed-views.log'
views = pd.read_csv(
    file_path,
    delimiter='\t',
    header=None,
    names=['datetime', 'user'],
)
views

Unnamed: 0,datetime,user
0,2020-04-17 12:01:08.463179,artem
1,2020-04-17 12:01:23.743946,artem
2,2020-04-17 12:27:30.646665,artem
3,2020-04-17 12:35:44.884757,artem
4,2020-04-17 12:35:52.735016,artem
...,...,...
1071,2020-05-21 18:45:20.441142,valentina
1072,2020-05-21 23:03:06.457819,maxim
1073,2020-05-21 23:23:49.995349,pavel
1074,2020-05-21 23:49:22.386789,artem


In [3]:
# Convert the raw strings to datetime64[ns] so that the `.dt` accessor is available.
# (pd.to_datetime(views['datetime'], errors='coerce') is the tolerant alternative
# for inconsistent date formatting: unparseable values become NaT.)
views['datetime'] = views['datetime'].astype('datetime64[ns]')

# One new column per date/time component, created in this order.
for component in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    views[component] = getattr(views['datetime'].dt, component)
views

Unnamed: 0,datetime,user,year,month,day,hour,minute,second
0,2020-04-17 12:01:08.463179,artem,2020,4,17,12,1,8
1,2020-04-17 12:01:23.743946,artem,2020,4,17,12,1,23
2,2020-04-17 12:27:30.646665,artem,2020,4,17,12,27,30
3,2020-04-17 12:35:44.884757,artem,2020,4,17,12,35,44
4,2020-04-17 12:35:52.735016,artem,2020,4,17,12,35,52
...,...,...,...,...,...,...,...,...
1071,2020-05-21 18:45:20.441142,valentina,2020,5,21,18,45,20
1072,2020-05-21 23:03:06.457819,maxim,2020,5,21,23,3,6
1073,2020-05-21 23:23:49.995349,pavel,2020,5,21,23,23,49
1074,2020-05-21 23:49:22.386789,artem,2020,5,21,23,49,22


####
## 2. Create the new column `daytime`:
   - Assign a particular time of day value if an hour is within a particular interval. For example, assign "afternoon" if the hour is greater than or equal to 11 and less than 17.
   - 0–3:59 = night, 4–6:59 = early morning, 7–10:59 = morning, 11–16:59 = afternoon, 17–19:59 = early evening, 20–23:59 = evening.
   - Use the method `cut` to solve this subtask.
   - Assign the column `user` as the index.

In [4]:
# Hour intervals are half-open [left, right):
# 0-3 night, 4-6 early morning, 7-10 morning, 11-16 afternoon,
# 17-19 early evening, 20-23 evening.
bins = [0, 4, 7, 11, 17, 20, 24]
labels = ['night', 'early morning', 'morning', 'afternoon', 'early evening', 'evening']

# right=False makes every bin left-closed, so hour 0 already lands in "night";
# include_lowest is therefore unnecessary and has been dropped.
# The `hour` column extracted earlier is reused instead of recomputing it.
views['daytime'] = pd.cut(views['hour'], bins=bins, labels=labels, right=False)

# Guard the re-indexing so the cell can be re-run: once `user` is the index it
# is no longer a column, and a second set_index('user') would raise KeyError.
if 'user' in views.columns:
    views = views.set_index('user')

views

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
artem,2020-04-17 12:01:08.463179,2020,4,17,12,1,8,afternoon
artem,2020-04-17 12:01:23.743946,2020,4,17,12,1,23,afternoon
artem,2020-04-17 12:27:30.646665,2020,4,17,12,27,30,afternoon
artem,2020-04-17 12:35:44.884757,2020,4,17,12,35,44,afternoon
artem,2020-04-17 12:35:52.735016,2020,4,17,12,35,52,afternoon
...,...,...,...,...,...,...,...,...
valentina,2020-05-21 18:45:20.441142,2020,5,21,18,45,20,early evening
maxim,2020-05-21 23:03:06.457819,2020,5,21,23,3,6,evening
pavel,2020-05-21 23:23:49.995349,2020,5,21,23,23,49,evening
artem,2020-05-21 23:49:22.386789,2020,5,21,23,49,22,evening


####
## 3. Calculate the number of elements in your dataframe:
   - Use the method `count()`.
   - Calculate the number of elements in each time-of-day category using the `value_counts()` method.

In [5]:
# number of non-null elements in each column of the dataframe
views.count()

datetime    1076
year        1076
month       1076
day         1076
hour        1076
minute      1076
second      1076
daytime     1076
dtype: int64

In [6]:
# number of rows falling into each category of the daytime column
views['daytime'].value_counts()

daytime
evening          509
afternoon        252
early evening    145
night            129
morning           36
early morning      5
Name: count, dtype: int64

####
## 4. Sort the values in your dataframe by hour, minute, and second in ascending order simultaneously, not one by one:

In [7]:
# A single multi-key sort: rows are ordered by hour, ties broken by minute,
# remaining ties by second. Sorting by `datetime` instead would order the rows
# by calendar date first, which is not what the task asks for.
time_of_day_keys = ['hour', 'minute', 'second']
views = views.sort_values(time_of_day_keys, ascending=True)
views

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
valentina,2020-05-15 00:00:13.222265,2020,5,15,0,0,13,night
valentina,2020-05-15 00:01:05.153738,2020,5,15,0,1,5,night
pavel,2020-05-12 00:01:27.764025,2020,5,12,0,1,27,night
pavel,2020-05-12 00:01:38.444917,2020,5,12,0,1,38,night
pavel,2020-05-12 00:01:55.395042,2020,5,12,0,1,55,night
...,...,...,...,...,...,...,...,...
artem,2020-05-21 23:49:22.386789,2020,5,21,23,49,22,evening
anatoliy,2020-05-09 23:53:55.599821,2020,5,9,23,53,55,evening
pavel,2020-05-09 23:54:54.260791,2020,5,9,23,54,54,evening
valentina,2020-05-14 23:58:56.754866,2020,5,14,23,58,56,evening


####
## 5. Calculate the minimum and maximum for the hours and the mode for the daytime categories:
   - Calculate the maximum hour for the rows where the time of day is night.
   - Calculate the minimum hour for rows where the time of day is morning.
   - In addition, find out who visited the page during those hours and provide one example.
   - Calculate the mode for the hour and daytime.

In [8]:
# minimum value of the hour column over all rows
views['hour'].min()

0

In [9]:
# maximum value of the hour column over all rows
views['hour'].max()

23

In [10]:
# mode (the most frequent value) of the daytime column;
# mode() returns a Series, [0] takes its first entry
views['daytime'].mode()[0]

'evening'

In [11]:
# Largest hour value among the rows whose daytime category is "night".
visitor_hour_max = views.loc[views['daytime'] == 'night', 'hour'].max()
visitor_hour_max

3

In [12]:
# Smallest hour value among the rows whose daytime category is "morning".
visitor_hour_min = views.loc[views['daytime'] == 'morning', 'hour'].min()
visitor_hour_min

8

In [13]:
# Example visitor: the first row that is in the "night" category and whose hour
# equals the maximum night hour (visitor_hour_max). The user name is the index.
night_at_max_hour = (views['daytime'] == 'night') & (views['hour'] == visitor_hour_max)
visitor = views.loc[night_at_max_hour]
visitor.index[0]

'konstantin'

In [14]:
# Example visitor: the first row that is in the "morning" category and whose hour
# equals the minimum morning hour (visitor_hour_min). The user name is the index.
morning_at_min_hour = (views['daytime'] == 'morning') & (views['hour'] == visitor_hour_min)
visitor = views.loc[morning_at_min_hour]
visitor.index[0]

'alexander'

In [15]:
# Mode of hour and daytime taken together: count the rows for every
# (hour, daytime) pair and return the pair with the largest count.
# `daytime` is categorical, so observed=True is passed explicitly: only pairs
# that actually occur are counted, and pandas does not emit the FutureWarning
# about the changing default of `observed`. The winning pair is unaffected,
# because unobserved pairs have a count of zero.
views.groupby(['hour', 'daytime'], observed=True).size().idxmax()

(22, 'evening')

####
## 6. Show the three earliest and latest hours of the day and their corresponding usernames using `nsmallest()` and `nlargest()`:

In [16]:
# The three rows with the smallest hour values; the user names are the index.
# Ties on `hour` are resolved by row order (keep='first' is the default).
views.nsmallest(n=3, columns='hour', keep='first')

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
valentina,2020-05-15 00:00:13.222265,2020,5,15,0,0,13,night
valentina,2020-05-15 00:01:05.153738,2020,5,15,0,1,5,night
pavel,2020-05-12 00:01:27.764025,2020,5,12,0,1,27,night


In [17]:
# The three rows with the largest hour values; the user names are the index.
# Ties on `hour` are resolved by row order (keep='first' is the default).
views.nlargest(n=3, columns='hour', keep='first')

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ekaterina,2020-05-14 23:02:11.327532,2020,5,14,23,2,11,evening
ekaterina,2020-05-14 23:02:14.494985,2020,5,14,23,2,14,evening
ekaterina,2020-05-14 23:02:15.588808,2020,5,14,23,2,15,evening


####
## 7. Use the `describe()` method to get the basic statistics for the columns:
   - To find the most popular visiting interval, calculate the interquartile range for the hour by extracting values from the result of the `describe()` method and storing them in the variable `iqr`.

In [18]:
# Summary statistics for the columns describe() covers by default; kept in
# `stats` so that individual values can be read back by row label and column name.
stats = views.describe()
stats

Unnamed: 0,datetime,year,month,day,hour,minute,second
count,1076,1076.0,1076.0,1076.0,1076.0,1076.0,1076.0
mean,2020-05-10 09:00:41.211420672,2020.0,4.870818,13.552974,16.249071,29.629182,29.500929
min,2020-04-17 12:01:08.463179,2020.0,4.0,1.0,0.0,0.0,0.0
25%,2020-05-10 01:13:49.857472,2020.0,5.0,11.0,13.0,14.0,14.0
50%,2020-05-11 22:48:35.302552832,2020.0,5.0,13.0,19.0,29.0,30.0
75%,2020-05-14 14:44:34.749530624,2020.0,5.0,15.0,22.0,46.0,45.0
max,2020-05-22 10:36:14.662600,2020.0,5.0,30.0,23.0,59.0,59.0
std,,0.0,0.335557,4.906567,6.95549,17.689388,17.405506


In [19]:
# Interquartile range of the visiting hour: the distance between the 75% and
# 25% rows that describe() reported for the `hour` column.
hour_stats = stats['hour']
iqr = hour_stats['75%'] - hour_stats['25%']
iqr

9.0

In [20]:
# NOTE(review): this cell contains only commented-out code: a function-wrapped
# copy of the loading, datetime-extraction and daytime steps that the cells above
# already run. Nothing here executes; consider deleting the cell.
# import pandas as pd

# def base_op():
#   try:
#       file_path = '../data/feed-views.log'
#       views = pd.read_csv(file_path, sep='\t', names=['datetime', 'user'])
      
#       views['datetime'] = views['datetime'].astype('datetime64[ns]')
#       # views['datetime'] = pd.to_datetime(views['datetime'], errors='coerce') # better method for handling inconsistent date formatting
      
#       views['year'] = views['datetime'].dt.year
#       views['month'] = views['datetime'].dt.month
#       views['day'] = views['datetime'].dt.day
#       views['hour'] = views['datetime'].dt.hour
#       views['minute'] = views['datetime'].dt.minute
#       views['second'] = views['datetime'].dt.second

#       bins = [0, 4, 7, 11, 17, 20, 24]
#       labels = ['night', 'early morning', 'morning', 'afternoon', 'early evening', 'evening']
#       views['daytime'] = pd.cut(
#           views['datetime'].dt.hour,
#           bins=bins,
#           labels=labels,
#           right=False,
#           include_lowest=True
#       )

#       views = views.set_index('user')
          
#       return views
#   except Exception as e:
#       print(e)

# if __name__ == '__main__':
#   views = base_op()

In [21]:
# Sanity check: column dtypes and non-null counts of the final dataframe.
views.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1076 entries, valentina to alexander
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   datetime  1076 non-null   datetime64[ns]
 1   year      1076 non-null   int32         
 2   month     1076 non-null   int32         
 3   day       1076 non-null   int32         
 4   hour      1076 non-null   int32         
 5   minute    1076 non-null   int32         
 6   second    1076 non-null   int32         
 7   daytime   1076 non-null   category      
dtypes: category(1), datetime64[ns](1), int32(6)
memory usage: 43.3+ KB


In [22]:
# Re-check: number of non-null elements in each column.
views.count()

datetime    1076
year        1076
month       1076
day         1076
hour        1076
minute      1076
second      1076
daytime     1076
dtype: int64

In [23]:
# Re-check: number of rows in each daytime category.
views['daytime'].value_counts()

daytime
evening          509
afternoon        252
early evening    145
night            129
morning           36
early morning      5
Name: count, dtype: int64

In [24]:
# Index label (user name) of the first row holding the largest hour among "night" rows.
night_hours = views.loc[views['daytime'] == 'night', 'hour']
night_hours.idxmax()

'konstantin'

In [25]:
# Index label (user name) of the first row holding the smallest hour among "morning" rows.
morning_hours = views.loc[views['daytime'] == 'morning', 'hour']
morning_hours.idxmin()

'alexander'

In [26]:
# Most frequent value(s) of the hour column, returned as a Series.
views['hour'].mode()

0    22
Name: hour, dtype: int32

In [27]:
# Most frequent value(s) of the daytime column, returned as a Series.
views['daytime'].mode()

0    evening
Name: daytime, dtype: category
Categories (6, object): ['night' < 'early morning' < 'morning' < 'afternoon' < 'early evening' < 'evening']

In [28]:
# Re-display the interquartile range of the hour column computed earlier.
iqr

9.0