# Toplulaştırma ve Gruplama (Aggregation & Grouping)

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
?sns.load_dataset

[1;31mSignature:[0m [0msns[0m[1;33m.[0m[0mload_dataset[0m[1;33m([0m[0mname[0m[1;33m,[0m [0mcache[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m [0mdata_home[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m [1;33m**[0m[0mkws[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Load an example dataset from the online repository (requires internet).

This function provides quick access to a small number of example datasets
that are useful for documenting seaborn or generating reproducible examples
for bug reports. It is not necessary for normal usage.

Note that some of the datasets have a small amount of preprocessing applied
to define a proper ordering for categorical variables.

Use :func:`get_dataset_names` to see a list of available datasets.

Parameters
----------
name : str
    Name of the dataset (``{name}.csv`` on
    https://github.com/mwaskom/seaborn-data).
cache : boolean, optional
    If True, try to load from the local cache first, and save to the cache
 

In [3]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [4]:
df = sns.load_dataset('planets')
df.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [5]:
df.size

6210

In [6]:
df.shape

(1035, 6)

In [7]:
df.mean()

  df.mean()


number               1.785507
orbital_period    2002.917596
mass                 2.638161
distance           264.069282
year              2009.070531
dtype: float64

In [8]:
df['number'].mean()

1.7855072463768116

In [9]:
df['orbital_period'].mean()

2002.9175960947584

In [10]:
df['mass'].mean()

2.6381605847953233

In [11]:
df.count()

method            1035
number            1035
orbital_period     992
mass               513
distance           808
year              1035
dtype: int64

In [12]:
df['number'].count()

1035

In [13]:
df['orbital_period'].count()

992

In [14]:
df['mass'].count()

513

In [15]:
?df.first

[1;31mSignature:[0m [0mdf[0m[1;33m.[0m[0mfirst[0m[1;33m([0m[0moffset[0m[1;33m)[0m [1;33m->[0m [1;34m'NDFrameT'[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Select initial periods of time series data based on a date offset.

When having a DataFrame with dates as index, this function can
select the first few rows based on a date offset.

Parameters
----------
offset : str, DateOffset or dateutil.relativedelta
    The offset length of the data that will be selected. For instance,
    '1M' will display all the rows having their index within the first month.

Returns
-------
Series or DataFrame
    A subset of the caller.

Raises
------
TypeError
    If the index is not  a :class:`DatetimeIndex`

See Also
--------
last : Select final periods of time series based on a date offset.
at_time : Select values at a particular time of the day.
between_time : Select values between particular times of the day.

Examples
--------
>>> i = pd.date_range('2018-04-09', periods=4, f

In [16]:
df.max()

method            Transit Timing Variations
number                                    7
orbital_period                     730000.0
mass                                   25.0
distance                             8500.0
year                                   2014
dtype: object

In [17]:
df.min()

method            Astrometry
number                     1
orbital_period      0.090706
mass                  0.0036
distance                1.35
year                    1989
dtype: object

In [18]:
df.std()

  df.std()


number                1.240976
orbital_period    26014.728304
mass                  3.818617
distance            733.116493
year                  3.972567
dtype: float64

In [19]:
df.var()

  df.var()


number            1.540022e+00
orbital_period    6.767661e+08
mass              1.458183e+01
distance          5.374598e+05
year              1.578129e+01
dtype: float64

In [20]:
df.sum()

method            Radial VelocityRadial VelocityRadial VelocityR...
number                                                         1848
orbital_period                                       1986894.255326
mass                                                     1353.37638
distance                                                  213367.98
year                                                        2079388
dtype: object

In [21]:
type(df)

pandas.core.frame.DataFrame

In [22]:
df

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.300000,7.10,77.40,2006
1,Radial Velocity,1,874.774000,2.21,56.95,2008
2,Radial Velocity,1,763.000000,2.60,19.84,2011
3,Radial Velocity,1,326.030000,19.40,110.62,2007
4,Radial Velocity,1,516.220000,10.50,119.47,2009
...,...,...,...,...,...,...
1030,Transit,1,3.941507,,172.00,2006
1031,Transit,1,2.615864,,148.00,2007
1032,Transit,1,3.191524,,174.00,2007
1033,Transit,1,4.125083,,293.00,2008


In [23]:
df.describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,1035.0,992.0,513.0,808.0,1035.0
mean,1.785507,2002.917596,2.638161,264.069282,2009.070531
std,1.240976,26014.728304,3.818617,733.116493,3.972567
min,1.0,0.090706,0.0036,1.35,1989.0
25%,1.0,5.44254,0.229,32.56,2007.0
50%,1.0,39.9795,1.26,55.25,2010.0
75%,2.0,526.005,3.04,178.5,2012.0
max,7.0,730000.0,25.0,8500.0,2014.0


In [24]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
number,1035.0,1.785507,1.240976,1.0,1.0,1.0,2.0,7.0
orbital_period,992.0,2002.917596,26014.728304,0.090706,5.44254,39.9795,526.005,730000.0
mass,513.0,2.638161,3.818617,0.0036,0.229,1.26,3.04,25.0
distance,808.0,264.069282,733.116493,1.35,32.56,55.25,178.5,8500.0
year,1035.0,2009.070531,3.972567,1989.0,2007.0,2010.0,2012.0,2014.0


In [25]:
# veri seti içerisindeki eksik gözlemleri kaldırıp temel analizlere ulaşmak için;
df.dropna().describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
number,498.0,1.73494,1.17572,1.0,1.0,1.0,2.0,6.0
orbital_period,498.0,835.778671,1469.128259,1.3283,38.27225,357.0,999.6,17337.5
mass,498.0,2.50932,3.636274,0.0036,0.2125,1.245,2.8675,25.0
distance,498.0,52.068213,46.596041,1.35,24.4975,39.94,59.3325,354.0
year,498.0,2007.37751,4.167284,1989.0,2005.0,2009.0,2011.0,2014.0


In [29]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [33]:
anagram = sns.load_dataset('anagrams')
anagram

Unnamed: 0,subidr,attnr,num1,num2,num3
0,1,divided,2,4.0,7
1,2,divided,3,4.0,5
2,3,divided,3,5.0,6
3,4,divided,5,7.0,5
4,5,divided,4,5.0,8
5,6,divided,5,5.0,6
6,7,divided,5,4.5,6
7,8,divided,5,7.0,8
8,9,divided,2,3.0,7
9,10,divided,6,5.0,6


In [34]:
anagram.mean()

  anagram.mean()


subidr    10.500
num1       5.350
num2       5.975
num3       6.550
dtype: float64

In [35]:
anagram.describe()

Unnamed: 0,subidr,num1,num2,num3
count,20.0,20.0,20.0,20.0
mean,10.5,5.35,5.975,6.55
std,5.91608,1.843195,1.67391,1.099043
min,1.0,2.0,3.0,5.0
25%,5.75,4.75,5.0,6.0
50%,10.5,5.5,5.5,6.0
75%,15.25,6.25,7.25,7.0
max,20.0,8.0,9.0,9.0


In [36]:
anagram.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
subidr,20.0,10.5,5.91608,1.0,5.75,10.5,15.25,20.0
num1,20.0,5.35,1.843195,2.0,4.75,5.5,6.25,8.0
num2,20.0,5.975,1.67391,3.0,5.0,5.5,7.25,9.0
num3,20.0,6.55,1.099043,5.0,6.0,6.0,7.0,9.0
