In [223]:

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler

data = fetch_california_housing(as_frame=True, return_X_y=True)
df, target = data[0], data[1]
df = StandardScaler().fit_transform(df)

cv = KFold(n_splits=4, shuffle=True, random_state=6)

losses1 = []
losses2 = []

r1 = Ridge(alpha=1)
r2 = Ridge(alpha=10)

y_pred1 = cross_val_predict(r1, df, target, cv=cv)
y_pred2 = cross_val_predict(r2, df, target, cv=cv)

arr1 = (target - y_pred1) ** 2
arr2 = (target - y_pred2) ** 2

In [224]:
from scipy.stats import binomtest

binomtest(k=(arr1 > arr2).sum(), n=len(arr1), p=0.5, alternative='two-sided').pvalue

5.015427635331782e-08

In [225]:
from scipy.stats import binom

tN = (arr1 > arr2).sum()

binom(n=len(arr1), p=0.5).cdf(tN) * 2

5.015427635331782e-08

In [226]:
import pandas as pd

pd.Series([7, 1, 5, 1, 3, 2, 5]).rank()

0    7.0
1    1.5
2    5.5
3    1.5
4    4.0
5    3.0
6    5.5
dtype: float64

In [283]:
from sklearn.datasets import fetch_california_housing

data = fetch_california_housing(as_frame=True, return_X_y=True)
data = pd.concat([data[0], data[1]], axis=1)
data['MedIncRank'] = data['MedInc'].rank()

data_u_30, data_o_30 = data[data['HouseAge'] <= 30], data[data['HouseAge'] > 30]

In [284]:
data_u_30

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal,MedIncRank
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585,20071.0
50,0.9218,21.0,2.045662,1.034247,735.0,1.678082,37.82,-122.27,1.719,103.0
59,2.5625,2.0,2.771930,0.754386,94.0,1.649123,37.82,-122.29,0.600,5150.0
70,1.7719,26.0,6.047244,1.196850,392.0,3.086614,37.81,-122.29,0.825,1648.0
74,2.4830,20.0,6.278195,1.210526,290.0,2.180451,37.81,-122.29,1.375,4717.5
...,...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781,942.0
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771,5116.5
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923,1381.0
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847,1932.0


In [285]:
R1 = data_u_30['MedIncRank'].sum()
R2 = data_o_30['MedIncRank'].sum()

N1 = data_u_30.shape[0]
N2 = data_o_30.shape[0]

from scipy.stats import norm

mu = N1 * (N1 + N2 + 1) / 2
sigma = (N1 * N2 * (N1 + N2 + 1) / 12) ** 0.5

norm(loc=mu, scale=sigma).cdf(R1) * 2

2.0

Тут выходит говно, тк данные не очень независимы

In [287]:
U1 = R1 - N1 * (N1 + 1) / 2
U2 = R2 - N2 * (N2 + 1) / 2

print(f'N1 - {N1}, N2 - {N2}\nU1 - {U1}, U2 - {U2}')

# По таблице не найдёшь:(

N1 - 11145, N2 - 9495
U1 - 59254034.5, U2 - 46567740.5


In [288]:
from scipy.stats import mannwhitneyu

mannwhitneyu(
    x=data_u_30['MedInc'],
    y=data_o_30['MedInc'],
    alternative="two-sided",
    method='auto'
).pvalue

5.345198689020034e-50

In [282]:
mannwhitneyu(
    x=data_u_30['MedInc'],
    y=data_o_30['MedInc'],
    alternative="two-sided",
    method='asymptotic'
).pvalue

5.345198689020034e-50

In [236]:
mannwhitneyu(
    x=data_u_30['MedInc'],
    y=data_o_30['MedInc'],
    alternative="two-sided",
    method='exact'
)  # Очень долго:(

KeyboardInterrupt: 

В данном случае формула и функция выдают похожие результаты так как данные независимы (почти)

Если group_1 и group_2 будут от оба от 0 до 100, то будет дерьмо:(

In [289]:
import numpy as np

group_1 = np.random.randint(0,100, size=100)
group_2 = np.random.randint(50,150, size=100)

test_data = pd.DataFrame({
    'Value': np.concatenate([group_1, group_2]),
    'Group': [1] * len(group_1) + [2] * len(group_2)
})
test_data['rankValues'] = test_data['Value'].sort_values().rank()

test_data_1, test_data_2 = (test_data[test_data['Group'] == 1]['rankValues'],
                            test_data[test_data['Group'] == 2]['rankValues'])

In [293]:
R1 = test_data_1.sum()
R2 = test_data_2.sum()

N1 = len(test_data_1)
N2 = len(test_data_2)

from scipy.stats import norm

mu = N1 * (N1 + N2 + 1) / 2
sigma = (N1 * N2 * (N1 + N2 + 1) / 12) ** 0.5

norm(loc=mu, scale=sigma).cdf(R1) * 2

1.75202851885393e-22

In [294]:
from scipy.stats import mannwhitneyu

mannwhitneyu(
    x=group_1,
    y=group_2,
    alternative="two-sided",
    method='asymptotic'
).pvalue

1.7602566763379255e-22

In [295]:
1-0.99**12

0.11361512828387077