In [1]:
import pandas as pd

<h2><center>Функция создания dataframe</center></h2>

In [2]:
def create_df():
    df = pd.read_csv('../data/feed-views.log', sep='\t', names=['datetime', 'user'])
    df["datetime"] = pd.to_datetime(df["datetime"])
    df["year"] = df["datetime"].dt.year
    df["month"] = df["datetime"].dt.month
    df["day"] = df["datetime"].dt.day
    df["hour"] = df["datetime"].dt.hour
    df["minute"] = df["datetime"].dt.minute
    df["second"] = df["datetime"].dt.second
    return df

In [3]:
views = create_df()

In [4]:
views.datetime

0      2020-04-17 12:01:08.463179
1      2020-04-17 12:01:23.743946
2      2020-04-17 12:27:30.646665
3      2020-04-17 12:35:44.884757
4      2020-04-17 12:35:52.735016
                  ...            
1071   2020-05-21 18:45:20.441142
1072   2020-05-21 23:03:06.457819
1073   2020-05-21 23:23:49.995349
1074   2020-05-21 23:49:22.386789
1075   2020-05-22 10:36:14.662600
Name: datetime, Length: 1076, dtype: datetime64[ns]

In [5]:
views.head()

Unnamed: 0,datetime,user,year,month,day,hour,minute,second
0,2020-04-17 12:01:08.463179,artem,2020,4,17,12,1,8
1,2020-04-17 12:01:23.743946,artem,2020,4,17,12,1,23
2,2020-04-17 12:27:30.646665,artem,2020,4,17,12,27,30
3,2020-04-17 12:35:44.884757,artem,2020,4,17,12,35,44
4,2020-04-17 12:35:52.735016,artem,2020,4,17,12,35,52


<h2><center>Создание колонки daytime и назначение индекса</center></h2>

In [6]:
def def_daytime(df):
    df["daytime"] = pd.cut(df["hour"], [-1, 3, 6, 10, 16, 19, 23], labels=["night", "early morning", "morning", "afternoon", "early evening", "evening"])
    df = df.set_index('user')
    return df

In [7]:
views = def_daytime(views)

In [8]:
views.head(20)

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
artem,2020-04-17 12:01:08.463179,2020,4,17,12,1,8,afternoon
artem,2020-04-17 12:01:23.743946,2020,4,17,12,1,23,afternoon
artem,2020-04-17 12:27:30.646665,2020,4,17,12,27,30,afternoon
artem,2020-04-17 12:35:44.884757,2020,4,17,12,35,44,afternoon
artem,2020-04-17 12:35:52.735016,2020,4,17,12,35,52,afternoon
oksana,2020-04-17 12:36:21.401412,2020,4,17,12,36,21,afternoon
oksana,2020-04-17 12:36:22.023355,2020,4,17,12,36,22,afternoon
artem,2020-04-17 13:55:19.129243,2020,4,17,13,55,19,afternoon
artem,2020-04-17 15:00:33.138530,2020,4,17,15,0,33,afternoon
artem,2020-04-17 15:14:09.581054,2020,4,17,15,14,9,afternoon


<h2><center>Calculating</center></h2>

In [9]:
def calculate(df):
    nums = df.count()
    daytimes = df.daytime.value_counts()
    return nums, daytimes

In [10]:
nums, daytimes = calculate(views)

In [11]:
print(nums)

datetime    1076
year        1076
month       1076
day         1076
hour        1076
minute      1076
second      1076
daytime     1076
dtype: int64


In [12]:
print(daytimes)

evening          509
afternoon        252
early evening    145
night            129
morning           36
early morning      5
Name: daytime, dtype: int64


<h2><center>Сортировка</center></h2>

In [13]:
def sorting(df):
    df = df.sort_values(by=["hour", "minute", "second"])
    return df

In [14]:
views = sorting(views)

In [15]:
views.head(130)

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
valentina,2020-05-15 00:00:13.222265,2020,5,15,0,0,13,night
valentina,2020-05-15 00:01:05.153738,2020,5,15,0,1,5,night
pavel,2020-05-12 00:01:27.764025,2020,5,12,0,1,27,night
pavel,2020-05-12 00:01:38.444917,2020,5,12,0,1,38,night
pavel,2020-05-12 00:01:55.395042,2020,5,12,0,1,55,night
...,...,...,...,...,...,...,...,...
konstantin,2020-05-10 02:15:45.523919,2020,5,10,2,15,45,night
konstantin,2020-04-19 03:23:35.471598,2020,4,19,3,23,35,night
konstantin,2020-04-19 03:23:55.473926,2020,4,19,3,23,55,night
konstantin,2020-04-19 03:33:07.757714,2020,4,19,3,33,7,night


<h2><center>Подсчет минимума, максимума и моды</center></h2>

In [16]:
views[views["daytime"] == 'night']['hour'].max()

3

In [17]:
views[views["daytime"] == 'morning']['hour'].min()

8

In [18]:
views[views['hour'] == 8]

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
alexander,2020-05-15 08:16:03.918402,2020,5,15,8,16,3,morning
alexander,2020-05-15 08:35:01.471463,2020,5,15,8,35,1,morning


In [19]:
views[views['hour'] == 3]

Unnamed: 0_level_0,datetime,year,month,day,hour,minute,second,daytime
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
konstantin,2020-04-19 03:23:35.471598,2020,4,19,3,23,35,night
konstantin,2020-04-19 03:23:55.473926,2020,4,19,3,23,55,night
konstantin,2020-04-19 03:33:07.757714,2020,4,19,3,33,7,night


In [20]:
views['hour'].mode()

0    22
dtype: int64

In [21]:
views['daytime'].mode()

0    evening
Name: daytime, dtype: category
Categories (6, object): ['night' < 'early morning' < 'morning' < 'afternoon' < 'early evening' < 'evening']

<h2><center>Ранние пташки и поздние совы</center></h2>

In [22]:
views[views['daytime'] == 'morning'].nsmallest(3, 'hour').hour

user
alexander    8
alexander    8
alexander    9
Name: hour, dtype: int64

In [23]:
views[views['daytime'] == 'morning'].nlargest(3, 'hour').hour

user
konstantin    10
maxim         10
maxim         10
Name: hour, dtype: int64

<h2><center>Базовая статистика</center></h2>

In [24]:
views.describe()

Unnamed: 0,year,month,day,hour,minute,second
count,1076.0,1076.0,1076.0,1076.0,1076.0,1076.0
mean,2020.0,4.870818,13.552974,16.249071,29.629182,29.500929
std,0.0,0.335557,4.906567,6.95549,17.689388,17.405506
min,2020.0,4.0,1.0,0.0,0.0,0.0
25%,2020.0,5.0,11.0,13.0,14.0,14.0
50%,2020.0,5.0,13.0,19.0,29.0,30.0
75%,2020.0,5.0,15.0,22.0,46.0,45.0
max,2020.0,5.0,30.0,23.0,59.0,59.0


In [25]:
iqr = views.describe()['50%':'50%'].hour

In [26]:
print(iqr)

50%    19.0
Name: hour, dtype: float64
