# Testes de hipóteses para a média (multidimensional)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import random
random.seed(123)

In [3]:
df = pd.read_csv('banco.csv', decimal='.', sep=',', index_col=0)
df.head()

Unnamed: 0_level_0,Sexo,Idade,CartaodeCredito,ChequeEspecial,Renda,LimiteCartaodeCredito,LimiteChequeEspecial,Devedor,SaldoDevedor
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,26,0,0,247151,0,0,0,0
2,1,25,1,1,278190,417285,278190,0,0
3,0,29,0,0,256738,0,0,0,0
4,1,29,1,0,332950,499424,0,0,0
5,1,19,0,1,207408,0,207408,100,588733


In [4]:
from scipy.stats import f

In [11]:
def T2Hotelling(df, mu0, n, p):
    Xbarra=df.mean()
    S = df.cov()
    S_inv = np.linalg.inv(S)
    T2Hotelling = n*np.array(Xbarra-mu0).T.dot(S_inv).dot(np.array(Xbarra-mu0))
    qf = f.ppf(0.95, p , n-p, loc=0, scale=1)
    teste = T2Hotelling > (n-1) * p / (n-p) * qf
    pvalor = 1-f.cdf(T2Hotelling/((n-1) * p / (n-p) ), p, n-p)
    print('Rejeitamos H0') if teste else print('Não rejeitamos H0')
    print('Valor da estatística', T2Hotelling)
    print('valor p', pvalor)

In [6]:
df_amostra = df.sample(100)

In [12]:
mu0 = [0.3, 30, 0.48, 0.43, 3000, 2100, 1240, 0.28, 2150]
n=len(df_amostra)
p=len(df_amostra.columns)

T2Hotelling(df_amostra, mu0, n, p)

  Xbarra=df.mean()
  S = df.cov()


ValueError: operands could not be broadcast together with shapes (4,) (9,) 

In [13]:
df_amostra.mean()

  df_amostra.mean()


Sexo                0.33
Idade              29.39
CartaodeCredito     0.48
ChequeEspecial      0.47
dtype: float64

### Região de confiança

In [14]:
Xbarra = df_amostra.mean()
mu0 = [0.3, 30, 0.48, 0.43, 3000, 2100, 1240, 0.28, 2150]
S_inv = np.linalg.inv(df_amostra.cov())
n = len(df_amostra)

Teste = n*np.array(Xbarra-mu0).T.dot(S_inv).dot(np.array(Xbarra-mu0)) < (n-1) * p / (n-p) * f.ppf(0.95, p , n-p, loc=0, scale=1)

print('Resultado: mu0 está na região de confiança de mu') if(Teste) else print('Resultado: mu0 não está na região de confiança de mu')

  Xbarra = df_amostra.mean()
  S_inv = np.linalg.inv(df_amostra.cov())


ValueError: operands could not be broadcast together with shapes (4,) (9,) 

# Testes de hipóteses para a comparação de médias em amostras independentes

### Medias amostrais

In [None]:
df_amostra[df_amostra['Sexo']==0].mean()

In [None]:
df_amostra[df_amostra['Sexo']==1].mean()

In [None]:
import seaborn as sns

corrmat =  df_amostra.corr()
corrmat

fig, ax = plt.subplots(figsize=(20,10))  
sns.heatmap(corrmat, vmax=1., square=False).xaxis.tick_top()

In [None]:
# Dados mulheres

S1 = df_amostra.iloc[:,1:8][df_amostra['Sexo']==1].cov()
n1 = len(df_amostra[df_amostra['Sexo']==1])
Xbarra1 = df_amostra[df_amostra['Sexo']==1].mean()
S1

In [None]:
n1

In [None]:
# Dados homens

S2 = df.iloc[:,1:8][df['Sexo']==0].cov()
n2 = len(df_amostra[df_amostra['Sexo']==0])
Xbarra2 = df_amostra[df_amostra['Sexo']==0].mean()
S2

In [None]:
n2

In [None]:
S_pooled = ((n1-1)*S1 + (n2-1)*S2)/(n1+n2-2)
S_pooled

In [None]:
def T2Hotelling_duas_amostras(df1, df2, delta0):
    n1 = len(df1)
    n2 = len(df2)
    p = len(df1.columns)
    Xbarra1=df1.mean()
    Xbarra2=df2.mean()
    S1 = df1.cov()
    S2 = df2.cov()
    S_pooled = ((n1-1)*S1 + (n2-1)*S2)/(n1+n2-2)
    S_pooled_inv = np.linalg.inv(S_pooled)
    
    T2Hotelling_duas_amostras = np.array(Xbarra1-Xbarra2-delta0).T.dot(S_pooled_inv).dot(np.array(Xbarra1-Xbarra2-delta0)) / (n1+n2-2)
    qf = f.ppf(0.95, p , (n1+n2-2), loc=0, scale=1)
    teste = T2Hotelling_duas_amostras > (n1+n2-2) * p / (n1+n2-p-1) * qf
    pvalor = 1-f.cdf(T2Hotelling_duas_amostras/((n1+n2-2) * p / (n1+n2-p-1) ), p, (n1+n2-2))
    print('Rejeitamos H0') if teste else print('Não rejeitamos H0')
    print('Valor da estatística', T2Hotelling_duas_amostras)
    print('valor p', pvalor)

In [None]:
df1 = df_amostra.iloc[:,1:8][df_amostra['Sexo']==1]
df2 = df_amostra.iloc[:,1:8][df_amostra['Sexo']==0]

delta0 = [0,0,0,0,0,0,0]

T2Hotelling_duas_amostras(df1,df2,delta0)

# Testes de hipóteses para a comparação de médias em amostras correlacionadas

### Amostras pareadas

In [None]:
mean = [0, 0, 0]
mean1 = [1, 0, 0]

cov1 = [[2,1,0],[1,3,1],[0,1,4]]  
cov2 = [[0.01,0,0],[0,0.01,0],[0,0,0.01]]

X1 = np.random.multivariate_normal(mean, cov1, 50)
X2 = X1 + np.random.multivariate_normal(mean1, cov2, 50)

In [None]:
X = np.concatenate((X1,X2), axis=1)
X.shape

In [None]:
df = pd.DataFrame(X)
df.head()

In [None]:
df.mean()

In [None]:
corrmat =  df.corr()
corrmat

sns.heatmap(corrmat, vmax=1., square=False).xaxis.tick_top()

In [None]:
df_diff =  pd.DataFrame((X1-X2))

In [None]:
df_diff.head()

In [None]:
mu0=[0,0,0]
n=len(df_diff)
p=len(df_diff.columns)
T2Hotelling(df_diff, mu0, n, p)