In [1]:
# Данные тут
# статья https://statmodeling.stat.columbia.edu/2012/06/14/cool-ass-signal-processing-using-gaussian-processes/
# данные https://raw.githubusercontent.com/jakevdp/data-CDCbirths/master/births.csv


import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import datetime
from datetime import datetime

In [2]:
# Download the data, inspect it, and look for missing values
path = 'https://raw.githubusercontent.com/jakevdp/data-CDCbirths/master/births.csv'
df = pd.read_csv(path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15547 entries, 0 to 15546
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    15547 non-null  int64  
 1   month   15547 non-null  int64  
 2   day     15067 non-null  float64
 3   gender  15547 non-null  object 
 4   births  15547 non-null  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 607.4+ KB


In [3]:
# Rows where the `day` column is missing
df.loc[df['day'].isna()]

Unnamed: 0,year,month,day,gender,births
15067,1989,1,,F,156749
15068,1989,1,,M,164052
15069,1989,2,,F,146710
15070,1989,2,,M,154047
15071,1989,3,,F,165889
...,...,...,...,...,...
15542,2008,10,,M,183219
15543,2008,11,,F,158939
15544,2008,11,,M,165468
15545,2008,12,,F,173215


In [4]:
# Drop rows with any missing value, then turn day == 99 into NaN and
# drop those rows as well.
df_cor = df.dropna().replace({'day': 99}, np.nan).dropna()

In [5]:
# ATTENTION! This needs to be revisited!
# Rows that report fewer than 10 births.
df_cor.loc[df_cor['births'].lt(10)]

Unnamed: 0,year,month,day,gender,births
892,1970,2,31.0,F,8
893,1970,2,31.0,M,6
1656,1971,2,29.0,F,4
1657,1971,2,30.0,M,2
1658,1971,2,31.0,M,6
...,...,...,...,...,...
7253,1978,6,31.0,M,3
7443,1978,9,31.0,F,2
7444,1978,9,31.0,M,3
7571,1978,11,31.0,F,2


In [6]:
# переводим данные в удобный формат (столбики по отдельности)

In [9]:
# One row per (year, month, day) with the F and M birth counts as separate columns
births_by_gender = df_cor.pivot(
    index=['year', 'month', 'day'],
    columns='gender',
    values='births',
)
dff = births_by_gender.reset_index()
dff

gender,year,month,day,F,M
0,1969,1,1.0,4046.0,4440.0
1,1969,1,2.0,4454.0,4548.0
2,1969,1,3.0,4548.0,4994.0
3,1969,1,4.0,4440.0,4520.0
4,1969,1,5.0,4192.0,4198.0
...,...,...,...,...,...
7363,1988,12,27.0,5633.0,5895.0
7364,1988,12,28.0,5858.0,5989.0
7365,1988,12,29.0,5760.0,5944.0
7366,1988,12,30.0,5742.0,6095.0


In [10]:
# Соберем временные индексы и добавим столбцы

In [11]:
# Build a daily PeriodIndex from the year / month / day columns
dff.index = pd.PeriodIndex(year = dff['year'],month = dff['month'], day = dff['day'],freq='D')
# Only the last expression of a cell is rendered automatically, so the
# low-count check has to be displayed explicitly to be visible at all.
display(dff[dff['F']<10])
dff

gender,year,month,day,F,M
1969-01-01,1969,1,1.0,4046.0,4440.0
1969-01-02,1969,1,2.0,4454.0,4548.0
1969-01-03,1969,1,3.0,4548.0,4994.0
1969-01-04,1969,1,4.0,4440.0,4520.0
1969-01-05,1969,1,5.0,4192.0,4198.0
...,...,...,...,...,...
1988-12-27,1988,12,27.0,5633.0,5895.0
1988-12-28,1988,12,28.0,5858.0,5989.0
1988-12-29,1988,12,29.0,5760.0,5944.0
1988-12-30,1988,12,30.0,5742.0,6095.0


In [None]:
# Визуализируем 1980 год (делаем красиво)

In [None]:
# Scratch check: parse a date string whose day is not zero-padded
datetime.strptime('1978-02-3','%Y-%m-%d')

In [None]:
# Assemble a 'year-month-day' string per row; each column goes through int
# first so that a float day such as 1.0 is rendered as '1'.
date_parts = [dff[column].astype(int).astype(str) for column in ('year', 'month', 'day')]
dff['datetime'] = date_parts[0] + '-' + date_parts[1] + '-' + date_parts[2]
def day_to_day(name):
    """Parse a 'YYYY-MM-DD' style string into a datetime.

    Returns the parsed ``datetime`` on success, or ``np.nan`` when
    ``strptime`` raises ``ValueError`` (for example for a day that does
    not exist in the given month).
    """
    try:
        return datetime.strptime(name, '%Y-%m-%d')
    except ValueError:
        # Not a real calendar date: signal it with NaN so the row can be dropped later
        return np.nan

# Parse every assembled date string; strings that are not real calendar
# dates come back as NaN and their rows are dropped.
dff['datetime'] = dff.datetime.map(day_to_day)
dff = dff.dropna(subset = ['datetime']).copy()

# `DataFrame.set_index` returns a new frame, so the former bare call
# `dff.set_index(['datetime'])` had no effect. Assign the index explicitly:
# the `datetime` column is kept and the index stays unnamed.
dff.index = pd.DatetimeIndex(dff['datetime'].to_numpy())

# Total births per day and the weekday number (Monday = 0 ... Sunday = 6)
dff['sum'] = dff['F']+dff['M']
dff['week'] = dff.index.dayofweek

dff.head(5)

In [None]:
# Агрегируем недельные данные внутри года (делаем красиво)

In [None]:
# Mean daily births per weekday, restricted to 1988
births_1988 = dff.loc['1988']
week = births_1988.groupby('week')['sum'].mean()
# Replace the weekday index with Russian abbreviations, in the listed order
days = ['ПН', 'ВТ', 'СР', 'ЧТ', 'ПТ', 'СБ', 'ВС']
week.index = days
week.plot();

In [None]:
# Mean daily births per weekday over the whole data set
weekday_means = dff.groupby('week')['sum'].mean()
weekday_means.plot()

In [None]:
# Look at the data one year at a time (here 1980) and search for dips
births_1980 = dff.loc['1980']
births_1980['sum'].plot()

In [None]:
# Add public holidays
import holidays
holi1980 = pd.Series(holidays.US(years = [1980]))
# The mapping is keyed by plain `datetime.date` objects, which gives an
# object-dtype index whose entries lack pandas attributes such as
# `.day_of_year`. Convert it to a DatetimeIndex so each entry is a Timestamp.
holi1980.index = pd.to_datetime(holi1980.index)
holi1980

In [None]:
# Daily totals for 1980 with a red vertical line at every entry of holi1980
plt.figure(figsize=(15, 5))
births_1980_total = dff.loc['1980']['sum']
plt.plot(births_1980_total)
for holiday_date in holi1980.index:
    plt.axvline(holiday_date, color='red')

In [None]:
# Preview the first rows of dff
dff.head()

In [None]:
plt.figure(figsize =(20,5))
# Mean daily births for every (month, day) pair across all years
df1 = dff.groupby([dff.index.month,dff.index.day])['sum'].mean()
plt.plot(df1.values)

# Tick labels come from df1's own (month, day) index, so this cell does not
# depend on a variable that is only defined further down the notebook.
day_labels = [f'{month}-{day}' for month, day in df1.index]
tick_positions = range(len(df1))[::10]
plt.xticks(tick_positions, day_labels[::10], rotation = 90)

# The x axis is positional (0 .. len(df1) - 1), so each holiday is drawn at
# the position of its (month, day) pair rather than at a date coordinate.
for holiday in holi1980.index:
    plt.axvline(df1.index.get_loc((holiday.month, holiday.day)), color = 'red')

In [None]:
# Wrap the entry in pd.Timestamp: a plain `datetime.date` has no
# `day_of_year` attribute, while a Timestamp does.
pd.Timestamp(holi1980.index[1]).day_of_year

In [None]:
# 'month-day' label for every row of 1980
rows_1980 = dff.loc['1980']
month_text = rows_1980.month.astype(str)
day_text = rows_1980.day.astype(int).astype(str)
s_serday = month_text + '-' + day_text

In [None]:
# Обрабатываем "единичные" выбросы, используя межквартильный размах.