# Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

## 1.Завантаження залежних та незалежних вибірок з текстових файлів. 

In [2]:
# Read the space-separated sample file; it has no header row, so the column
# names are supplied explicitly.
path = "../data_lab4/indep/5Prak.txt"
columns = ['x', 'y']
df = pd.read_csv(path, sep=" ", header=None, names=columns)
df.head()

Unnamed: 0,x,y
0,102.9,424.9
1,142.0,353.1
2,353.1,310.2
3,253.3,422.0
4,169.9,454.2


## Creating a class for calculating quantiles

In [3]:
# Coefficients of the rational approximation used for the standard normal quantile
C0 = 2.515_517
C1 = 0.802_853
C2 = 0.010_328
D1 = 1.432_788
# NOTE(review): Abramowitz & Stegun 26.2.23 tabulates d2 = 0.189269 — confirm
# this value against the course reference before changing it.
D2 = 0.189_265_9
D3 = 0.001_308


class Quantile:
    """Closed-form approximations of normal, Student and Fisher quantiles."""

    @staticmethod
    def _require_probability(p):
        """Raise ValueError unless ``p`` lies strictly between 0 and 1."""
        if not 0 < p < 1:
            raise ValueError(f"p must lie in the open interval (0, 1), got {p!r}")

    @staticmethod
    def _require_positive(**degrees):
        """Raise ValueError if any of the named degrees of freedom is <= 0."""
        for name, value in degrees.items():
            if value <= 0:
                raise ValueError(f"{name} must be positive, got {value!r}")

    @classmethod
    def calculate_quantile(cls, p):
        """Approximate the standard normal quantile of order ``p``.

        Args:
            p: scalar probability, strictly between 0 and 1.

        Returns:
            The approximate quantile: positive for ``p > 0.5``, negative otherwise.

        Raises:
            ValueError: if ``p`` is not inside (0, 1); the logarithm below is
                undefined there and would otherwise silently yield NaN.
        """
        cls._require_probability(p)

        # The approximation is written for a tail probability; for the upper
        # half the complementary probability is used and the sign is kept.
        upper_half = p > 0.5
        tail = 1 - p if upper_half else p

        t = np.sqrt(-2 * np.log(tail))
        magnitude = (t - ((C0 + C1 * t + C2 * t ** 2) / (1 + D1 * t + D2 * t ** 2 + D3 * t ** 3)))

        return magnitude if upper_half else -magnitude

    @staticmethod
    def student_quantile(p, v):
        """Approximate the Student t quantile of order ``p`` with ``v`` degrees of freedom.

        The normal quantile is corrected by a series in powers of ``1 / v``.

        Args:
            p: scalar probability, strictly between 0 and 1.
            v: degrees of freedom, must be positive (may be fractional).

        Raises:
            ValueError: if ``p`` is outside (0, 1) or ``v`` is not positive.
        """
        Quantile._require_positive(v=v)
        up = Quantile.calculate_quantile(p)
        tpv = up + (1 / v) * (1 / 4) * (up ** 3 + up) + (1 / v ** 2) * (1 / 96) * (
            5 * up ** 5 + 16 * up ** 3 + 3 * up) + (1 / v ** 3) * (1 / 384) * (
                  3 * up ** 7 + 19 * up ** 5 + 17 * up ** 3 - 15 * up) + (1 / v ** 4) * (1 / 92_160) * (
                  79 * up ** 9 + 779 * up ** 7 + 1_482 * up ** 5 - 1_920 * up ** 3 - 945 * up)
        return tpv

    @staticmethod
    def fisher_quantile(p, v1, v2):
        """Approximate the Fisher F quantile of order ``p`` with ``(v1, v2)`` degrees of freedom.

        A series in the normal quantile gives ``z``; the result is ``exp(2 * z)``.

        Args:
            p: scalar probability, strictly between 0 and 1.
            v1: numerator degrees of freedom, must be positive.
            v2: denominator degrees of freedom, must be positive.

        Raises:
            ValueError: if ``p`` is outside (0, 1) or a degree of freedom is not positive.
        """
        Quantile._require_positive(v1=v1, v2=v2)
        up = Quantile.calculate_quantile(p)

        sigma = 1 / v1 + 1 / v2
        delta = 1 / v1 - 1 / v2
        root = np.sqrt(sigma / 2)

        # NOTE(review): grouping and signs of the terms are kept exactly as
        # they were; verify them against the reference expansion.
        z = (up * root -
             (1 / 6) * delta * (up**2 + 2) +
             root * (sigma / 24 * (up**2 + 3 * up) + (1 / 72) * (delta**2 / sigma) * (up**3 + 11 * up)) -
             (sigma * delta / 120) * (up**4 + 9 * up**2 + 8) +
             (delta**3 / (3240 * sigma)) * (3 * up**4 + 7 * up**2 - 16) +
             root * (sigma**2 / 1920 * (up**5 + 20 * up**3 + 15 * up)) +
             (delta**4 / 2880) * (up**5 + 44 * up**3 + 183 * up) +
             (delta**5 / (155520 * sigma**2)) * (9 * up**5 + 284 * up**3 + 1513 * up)
             )

        return np.exp(2 * z)

## Критерії перевірки однорідності двох залежних вибірок: критерії перевірки рівності дисперсій та середніх + критерій знакових рангів Вілкоксона

In [4]:
class DependentSamples:
    """Homogeneity checks for two paired samples stored in columns 'x' and 'y'."""

    @classmethod
    def create_thrird_sample(cls, df, alpha=0.05):
        """Run the paired t-test on the differences ``z = x - y``.

        The differences are also stored in ``df['z']`` (the frame is modified
        in place, as before).

        Args:
            df: frame with numeric columns 'x' and 'y'.
            alpha: significance level of the two-sided test.

        Returns:
            True when the hypothesis of equal means is not rejected.
        """
        df['z'] = df['x'] - df['y']

        z_mean = df['z'].mean()
        z_std = df['z'].std(ddof=1)
        # Only complete pairs contribute to the mean and the deviation above,
        # so the sample size must be the number of non-missing differences
        # rather than a value derived from the number of rows.
        n_pairs = df['z'].count()

        return cls.calculate_t(n_pairs, z_mean, z_std, alpha)

    @classmethod
    def calculate_t(cls, length, z_mean, z_std, alpha=0.05):
        """Compute ``t = z_mean * sqrt(length) / z_std`` and test it with ``length - 1`` degrees of freedom."""
        t = z_mean * np.sqrt(length) / z_std
        return cls.is_mean_equal(t, length - 1, alpha)

    @classmethod
    def is_mean_equal(cls, t, n, alpha=0.05):
        """Print ``t`` and the Student critical value; return whether ``|t|`` does not exceed it.

        Args:
            t: value of the test statistic.
            n: degrees of freedom passed to the Student quantile.
            alpha: significance level of the two-sided test.
        """
        print('t equals: ' + str(t))
        critical = Quantile.student_quantile(1 - alpha / 2, n)
        print('Quantile: ' + str(critical))
        return np.abs(t) <= critical

    @classmethod
    def calculate_statistic(cls, df, alpha=0.05):
        """Compare the variances of 'x' and 'y' with an F-test.

        The larger sample variance goes into the numerator, and its degrees of
        freedom come first in the Fisher quantile of order ``1 - alpha``.

        Returns:
            True when the hypothesis of equal variances is not rejected.
        """
        x_var = np.std(df['x'], ddof=1) ** 2
        y_var = np.std(df['y'], ddof=1) ** 2
        x_dof = df['x'].count() - 1
        y_dof = df['y'].count() - 1

        if x_var >= y_var:
            f, v1, v2 = x_var / y_var, x_dof, y_dof
        else:
            f, v1, v2 = y_var / x_var, y_dof, x_dof

        f_fisher = Quantile.fisher_quantile(1 - alpha, v1, v2)
        print('F:' + str(f))
        print('F fisher:' + str(f_fisher))
        return f <= f_fisher

    @staticmethod
    def calculate_rank_with_mann_whitney(df):
        """Count, for every value of 'x', the 'y' values below it (inversions).

        A 'y' value equal to the 'x' value counts as half an inversion.

        Returns:
            dict mapping each 'x' value to its inversion count; repeated 'x'
            values share one key.
        """
        result = {}
        for x_val in df['x']:
            inversions = 0
            for y_val in df['y']:
                if y_val < x_val:
                    inversions += 1
                elif y_val == x_val:
                    inversions += 0.5
            result[x_val] = inversions
        return result

    @staticmethod
    def check_for_homogeneity(df, alpha=0.05):
        """Run the mean test and then the variance test, printing both verdicts."""
        mean_equal = DependentSamples.create_thrird_sample(df, alpha)
        varians_equal = DependentSamples.calculate_statistic(df, alpha)
        print('Is mean equals: ' + str(mean_equal))
        print('Is varians equals: ' + str(varians_equal))

#### Test

In [5]:
# Prints the statistics and verdicts of the mean and variance checks.
DependentSamples.check_for_homogeneity(df)
# Last expression of the cell, so its return value is shown as the cell output.
DependentSamples.calculate_rank_with_mann_whitney(df)

t equals: -3.7035508041843515
Quantile: 2.447544124742589
F:2.756852676299814
F fisher:4.135508578777062
Is mean equals: False
Is varians equals: True


{102.9: 0,
 142.0: 0,
 353.1: 1.5,
 253.3: 0,
 169.9: 0,
 234.4: 0,
 277.9: 0,
 175.8: 0}

## Критерії перевірки однорідності двох незалежних вибірок: критерії перевірки рівності дисперсій та середніх + ранговий критерій, заданий індивідуальним варіантом.

In [6]:
class IndependentSamples:
    """Homogeneity checks for two independent samples stored in columns 'x' and 'y'."""

    @staticmethod
    def _sample_moments(df):
        """Return (x_mean, y_mean, x_variance, y_variance, x_n, y_n) for columns 'x' and 'y'."""
        x_mean = np.mean(df['x'])
        y_mean = np.mean(df['y'])
        x_dispersion = np.std(df['x'], ddof=1) ** 2
        y_dispersion = np.std(df['y'], ddof=1) ** 2
        x_n = df['x'].count()
        y_n = df['y'].count()
        return x_mean, y_mean, x_dispersion, y_dispersion, x_n, y_n

    @classmethod
    def calculate_weighted_average_and_t(cls, df, alpha=0.05):
        """Two-sample t-test with a pooled variance.

        Prints the pooled variance and the statistic, then tests the statistic
        with ``x_n + y_n - 2`` degrees of freedom.

        Returns:
            True when the hypothesis of equal means is not rejected.
        """
        x_mean, y_mean, x_dispersion, y_dispersion, x_n, y_n = cls._sample_moments(df)

        weighted_average = ((x_n - 1) * x_dispersion + (y_n - 1) * y_dispersion) / (x_n + y_n - 2)
        t = (x_mean - y_mean) / (np.sqrt(weighted_average / x_n + weighted_average / y_n))
        print('weighted average: ' + str(weighted_average))
        print('t equals: ' + str(t))
        return cls.is_mean_equal(t, x_n + y_n - 2, alpha)

    @classmethod
    def is_mean_equal(cls, t, v, alpha=0.05):
        """Print the Student critical value for ``v`` degrees of freedom; return whether ``|t|`` does not exceed it."""
        critical = Quantile.student_quantile(1 - alpha / 2, v)
        print('Quantile: ' + str(critical))
        return np.abs(t) <= critical

    @classmethod
    def calculate_criterion_with_Welch_correction(cls, df, alpha=0.05):
        """Two-sample t-test with separate variances and Welch's degrees of freedom.

        Prints the (generally fractional) degrees of freedom ``v`` before testing.

        Returns:
            True when the hypothesis of equal means is not rejected.
        """
        x_mean, y_mean, x_dispersion, y_dispersion, x_n, y_n = cls._sample_moments(df)

        # Squared standard errors of the two sample means.
        x_term = x_dispersion / x_n
        y_term = y_dispersion / y_n

        t = (x_mean - y_mean) / (np.sqrt(x_term + y_term))
        v = (x_term + y_term) ** 2 * ((1 / (x_n - 1) * x_term ** 2) + (1 / (y_n - 1) * y_term ** 2)) ** -1
        print('v: ' + str(v))
        return cls.is_mean_equal(t, v, alpha)

    @staticmethod
    def check_for_homogeneity(df, variance, alpha=0.05):
        """Test equality of means; ``variance`` selects the pooled test (truthy) or the Welch test (falsy)."""
        if variance:
            return IndependentSamples.calculate_weighted_average_and_t(df, alpha)
        return IndependentSamples.calculate_criterion_with_Welch_correction(df, alpha)

    @staticmethod
    def calculate_rank_with_mann_whitney(df):
        """Count, for every value of 'x', the 'y' values below it (inversions).

        A 'y' value equal to the 'x' value counts as half an inversion.

        Returns:
            dict mapping each 'x' value to its inversion count; repeated 'x'
            values share one key.
        """
        result = {}
        for x_val in df['x']:
            inversions = 0
            for y_val in df['y']:
                if y_val < x_val:
                    inversions += 1
                elif y_val == x_val:
                    inversions += 0.5  # equal values count as half an inversion
            result[x_val] = inversions
        return result

#### Test

In [7]:
# The F-test verdict decides between the pooled-variance and the Welch t-test.
variances_equal = DependentSamples.calculate_statistic(df)
IndependentSamples.check_for_homogeneity(df, variances_equal)

F:2.756852676299814
F fisher:4.135508578777062
weighted average: 4671.190233516483
t equals: -4.974661619949333
Quantile: 2.1609142123446516


False

### Check whether the distributions are normal

In [8]:
alpha = 0.05

# Shapiro-Wilk test for 'x'. Missing values are dropped first: a column may
# contain NaN (e.g. when the samples have different lengths), and NaN must not
# reach the test statistic.
statistic_x, p_value_x = stats.shapiro(df['x'].dropna())
normal_x = p_value_x > alpha

# Shapiro-Wilk test for 'y'
statistic_y, p_value_y = stats.shapiro(df['y'].dropna())
normal_y = p_value_y > alpha

# Both samples must pass for the data to be treated as normal.
normal = bool(normal_x and normal_y)

print(normal)

True


### Checking the equality of variances

In [9]:
x_mean = np.mean(df['x'])
y_mean = np.mean(df['y'])
# Squared sample standard deviations, i.e. the unbiased sample variances.
squared_x_std = np.std(df['x'], ddof=1)**2
squared_y_std = np.std(df['y'], ddof=1)**2
print("Mean for 1 column: "+ str(x_mean))
print("Mean for 2 column: "+ str(y_mean))
# The printed values are variances, so they are labelled as such.
print("Variance for 1 column: "+ str(squared_x_std))
print("Variance for 2 column: "+ str(squared_y_std))

Mean for 1 column: 213.66250000000002
Mean for 2 column: 389.6285714285714
STD for 1 column: 6617.574107142857
STD for 2 column: 2400.409047619047


#### Calculate P value with CDF(using student quantile)

### Calculate paired t-test

In [10]:
# Degrees of freedom of each sample: number of non-missing values minus one.
v1, v2 = (df[name].count() - 1 for name in ('x', 'y'))

# Displayed as the cell output.
Quantile.fisher_quantile(0.95, v1, v2)


4.135508578777062

In [11]:
# Ratio of the two sample variances computed in the previous cells.
F = (squared_x_std/squared_y_std)
if normal:
    # TODO: branch for normally distributed samples is not implemented yet.
    pass
elif F > 1:
    # The ratio is stored in `F`; the lowercase `f` used here before was never
    # defined and raised NameError whenever this branch was reached.
    print("F equalse: " + str(F))