# Quartis

In [53]:
import pandas as pd
import numpy as np
import math
import statistics
from scipy.stats import stats
from scipy.stats.mstats import gmean
from scipy import ndimage

In [2]:
dados = [150, 151, 152, 152, 153, 154, 155, 155, 155, 155]  

# base deve estar ordanada

In [3]:
df = pd.DataFrame(dados)

In [4]:
mediana = np.median(dados)
posicao_mediana = math.floor(len(dados)/2)
esquerda = dados[:posicao_mediana]
direita = dados[posicao_mediana:]

## Q1

In [5]:
np.median(esquerda)

152.0

In [6]:
np.quantile(dados, 0.25)

152.0

In [7]:
stats.scoreatpercentile(dados, 25)

152.0

## Q1

In [8]:
mediana

153.5

In [9]:
np.quantile(dados, 0.5)

153.5

In [10]:
stats.scoreatpercentile(dados, 50)

153.5

## Q3

In [11]:
np.median(direita)

155.0

In [12]:
np.quantile(dados, 0.75)

155.0

In [13]:
stats.scoreatpercentile(dados, 75)

155.0

## Q4

In [14]:
dados[-1]

155

In [15]:
np.quantile(dados, 1)

155

In [16]:
stats.scoreatpercentile(dados, 100)

155.0

## Pandas

In [17]:
df.quantile([0.25,0.5,0.75,1])

Unnamed: 0,0
0.25,152.0
0.5,153.5
0.75,155.0
1.0,155.0


In [18]:
df.describe()

Unnamed: 0,0
count,10.0
mean,153.2
std,1.873796
min,150.0
25%,152.0
50%,153.5
75%,155.0
max,155.0


## Dados agrupados

In [19]:
dados_agr = {'inferior': [150, 154, 158, 162, 166, 170], 
             'superior': [154, 158, 162, 166, 170, 174],
             'fi': [5, 9, 11, 7, 5, 3]}

dados_agr = pd.DataFrame(dados_agr)
dados_agr['xi'] = (dados_agr['superior'] + dados_agr['inferior'])/2
dados_agr['fi.xi'] = dados_agr['fi']*dados_agr['xi']
dados_agr['Fi'] = [dados_agr.iloc[:(i+1),2:3].sum()['fi'] for i in range(dados_agr.shape[0])]

print(dados_agr.shape)
dados_agr

(6, 6)


Unnamed: 0,inferior,superior,fi,xi,fi.xi,Fi
0,150,154,5,152.0,760.0,5
1,154,158,9,156.0,1404.0,14
2,158,162,11,160.0,1760.0,25
3,162,166,7,164.0,1148.0,32
4,166,170,5,168.0,840.0,37
5,170,174,3,172.0,516.0,40


In [20]:
def get_quartil(dataframe, q1=True):
    if q1 == True:
        fi_4 = dataframe['fi'].sum()/4
    else:
        fi_4 = (3 * dataframe['fi'].sum())/4
        
    limite_inferior, frequencia_classe, id_frequencia_anterior = 0, 0, 0
    for linha in dataframe.iterrows():
        limite_inferior = linha[1][0]
        frequencia_classe = linha[1][2]
        id_frequencia_anterior = linha[0]
        if linha[1][5] >= fi_4:
            id_frequencia_anterior -= 1
            break
    Fi_anterior = dataframe.iloc[[id_frequencia_anterior]]['Fi'].values[0]
    q = limite_inferior + ((fi_4 - Fi_anterior) * 4) / frequencia_classe
    
    return q

In [21]:
get_quartil(dados_agr),get_quartil(dados_agr, q1=False)  #q1 e q3

(156.22222222222223, 164.85714285714286)

## Percentis

In [22]:
dataset = np.array([160, 165, 167, 164, 160, 166, 160, 161, 150, 152, 173, 160, 155,
                    164, 168, 162, 161, 168, 163, 156, 155, 169, 151, 170, 164,
                    155, 152, 163, 160, 155, 157, 156, 158, 158, 161, 154, 161, 156, 172, 153])
len(dataset)

40

In [23]:
np.median(dataset)

160.0

In [24]:
np.quantile(dataset, 0.05) # 5%

151.95000000000002

In [25]:
np.percentile(dataset,5)   # 5%

151.95000000000002

In [26]:
stats.scoreatpercentile(dataset, 5)  # 5%

151.95000000000002

## EXERCICIO

In [27]:
df_census = pd.read_csv('Bases de dados/census.csv')
print(df_census.shape)
df_census.head()

(32561, 15)


Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [28]:
df_census.describe(include='all')

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
count,32561.0,32561,32561.0,32561,32561.0,32561,32561,32561,32561,32561,32561.0,32561.0,32561.0,32561,32561
unique,,9,,16,,7,15,6,5,2,,,,42,2
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,22696,,10501,,14976,4140,13193,27816,21790,,,,29170,24720
mean,38.581647,,189778.4,,10.080679,,,,,,1077.648844,87.30383,40.437456,,
std,13.640433,,105550.0,,2.57272,,,,,,7385.292085,402.960219,12.347429,,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117827.0,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178356.0,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,237051.0,,12.0,,,,,,0.0,0.0,45.0,,


In [29]:
print(df_census['age'].min())
print(df_census['age'].max())

17
90


In [30]:
stats.hmean(df_census['age'])

33.91874139089839

In [31]:
stats.gmean(df_census['age'])

36.210879158177256

In [32]:
statistics.mode(df_census['age'])

36

In [33]:
np.median(df_census['age'])

37.0

In [34]:
df_census['age'].quantile([0.25,0.5,0.75,1])

0.25    28.0
0.50    37.0
0.75    48.0
1.00    90.0
Name: age, dtype: float64

## Amplitude total

In [35]:
dataset

array([160, 165, 167, 164, 160, 166, 160, 161, 150, 152, 173, 160, 155,
       164, 168, 162, 161, 168, 163, 156, 155, 169, 151, 170, 164, 155,
       152, 163, 160, 155, 157, 156, 158, 158, 161, 154, 161, 156, 172,
       153])

In [36]:
amplit = dataset.max() - dataset.min()
amplit 

23

## Diferença interquartil

In [37]:
q1 = np.quantile(dataset, 0.25)
q3 = np.quantile(dataset, 0.75)
dif_q = q3 - q1
dif_q

8.25

### Limites de outliers

In [38]:
inferior = q1 - (1.5 * dif_q)
inferior

143.375

In [39]:
superior = q3 + (1.5 * dif_q)
superior

176.375

## Variância e desvio padrâo

In [45]:
dados = np.array(dados)
dados

array([150, 151, 152, 152, 153, 154, 155, 155, 155, 155])

### Variancia

In [51]:
media = sum(dados)/len(dados)
desvio = ((dados - media) ** 2).sum()/len(dados)
desvio

3.16

In [50]:
np.var(dados)

3.16

In [52]:
statistics.variance(dados)

3

In [54]:
ndimage.variance(dados)

3.16

### Desvio Padrão

In [55]:
desvio_padrao = math.sqrt(np.var(dados))
desvio_padrao

1.7776388834631178

In [None]:
stat