In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## describe()
Generate descriptive statistics.

Descriptive statistics include those that summarize the central
tendency, dispersion and shape of a
dataset's distribution, excluding NaN values.

Analyzes both numeric and object series, as well
as DataFrame column sets of mixed data types. The output
will vary depending on what is provided. Refer to the notes
below for more detail.

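`describe(percentiles=None, include=None, exclude=None, datetime_is_numeric=False)`

**Note:** `datetime_is_numeric` exists only in pandas 1.1–1.5. It was removed in pandas 2.0, where datetime data is always summarized as numeric.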

Parameters:
* `percentiles` : list-like of numbers, optional. The percentiles to include in the output. All should fall between 0 and 1. The default is `[.25, .5, .75]`, which returns the 25th, 50th, and 75th percentiles.
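* `include` : 'all', list-like of dtypes or None (default), optional. A white list of data types to include in the result. Ignored for Series. Here are the options:
  * `'all'` : all columns of the input are included in the output.
  * a list-like of dtypes : limits the result to the given data types, e.g. `np.number` for numeric columns, `object` (or `'O'`) for object columns (`df.describe(include=['O'])`), `'category'` for categorical columns.
  * `None` (default) : the result includes only the numeric columns.
* `exclude` : list-like of dtypes or None (default), optional. A black list of data types to omit from the result. Ignored for Series.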




**Describing a numeric Series.**

In [2]:
# For numeric data describe() reports count, mean, std, min, the quartiles and max.
s = pd.Series(data=[1, 2, 3])
s.describe()

count    3.0
mean     2.0
std      1.0
min      1.0
25%      1.5
50%      2.0
75%      2.5
max      3.0
dtype: float64

In [3]:
# For string (object) data describe() switches to count, unique, top and freq.
s = pd.Series(list("aabc"))
s.describe()

count     4
unique    3
top       a
freq      2
dtype: object

**Datetime**

In [4]:
# Three timestamps, two of them identical, so the median equals the maximum.
s = pd.Series([
    np.datetime64("2000-01-01"),
    np.datetime64("2010-01-01"),
    np.datetime64("2010-01-01"),
])
# `datetime_is_numeric` only exists in pandas 1.1-1.5. pandas 2.0 removed it
# (datetimes are always summarized numerically there), so passing it
# unconditionally raises TypeError on current releases. Pass it only where it
# is still needed to get the numeric-style summary.
describe_kwargs = (
    {} if int(pd.__version__.split(".")[0]) >= 2 else {"datetime_is_numeric": True}
)
s.describe(**describe_kwargs)

count                      3
mean     2006-09-01 08:00:00
min      2000-01-01 00:00:00
25%      2004-12-31 12:00:00
50%      2010-01-01 00:00:00
75%      2010-01-01 00:00:00
max      2010-01-01 00:00:00
dtype: object

In [7]:
# Small demo frame with one column per dtype family: categorical, numeric
# and object (plain strings).
df = pd.DataFrame(
    {
        "categorical": pd.Categorical(list("def")),
        "numeric": [1, 2, 3],
        "object": list("abc"),
    }
)
df

Unnamed: 0,categorical,numeric,object
0,d,1,a
1,e,2,b
2,f,3,c


Describing a DataFrame. By default only numeric fields
are returned.

In [16]:
# Add the 5th and 95th percentiles to the default quartiles. With no `include`
# argument only the numeric column is summarized.
df.describe(percentiles=[0.05, 0.25, 0.50, 0.75, 0.95])

Unnamed: 0,numeric
count,3.0
mean,2.0
std,1.0
min,1.0
5%,1.1
25%,1.5
50%,2.0
75%,2.5
95%,2.9
max,3.0


NB! Мы создали датасет с типами category, numeric, object

**`describe()` создал статистику только для numeric**

Example below: Describing all columns of a DataFrame regardless of data type.

In [6]:
# include="all" summarizes every column; statistics that do not apply to a
# column's dtype are shown as NaN.
df.describe(include="all")

Unnamed: 0,categorical,numeric,object
count,3,3.0,3
unique,3,,3
top,f,,b
freq,1,,1
mean,,2.0,
std,,1.0,
min,,1.0,
25%,,1.5,
50%,,2.0,
75%,,2.5,
max,,3.0,


In [9]:
# Describing a single column selected with attribute access (df.numeric).
# The result is a Series of statistics named after the column.
df.numeric.describe()

count    3.0
mean     2.0
std      1.0
min      1.0
25%      1.5
50%      2.0
75%      2.5
max      3.0
Name: numeric, dtype: float64

In [10]:
# Restrict the summary to numeric columns. The "number" alias selects the same
# dtypes as np.number.
df.describe(include=["number"])

Unnamed: 0,numeric
count,3.0
mean,2.0
std,1.0
min,1.0
25%,1.5
50%,2.0
75%,2.5
max,3.0


In [11]:
# Restrict the summary to object-dtype (string) columns.
df.describe(include=["object"])

Unnamed: 0,object
count,3
unique,3
top,b
freq,1


In [12]:
# Restrict the summary to categorical columns.
df.describe(include=["category"])

Unnamed: 0,categorical
count,3
unique,3
top,f
freq,1


In [13]:
# Summarize everything except the numeric columns. The "number" alias selects
# the same dtypes as np.number.
df.describe(exclude=["number"])

Unnamed: 0,categorical,object
count,3,3
unique,3,3
top,f,b
freq,1,1


In [14]:
# Summarize everything except the object-dtype columns.
df.describe(exclude=["object"])

Unnamed: 0,categorical,numeric
count,3,3.0
unique,3,
top,f,
freq,1,
mean,,2.0
std,,1.0
min,,1.0
25%,,1.5
50%,,2.0
75%,,2.5
max,,3.0
