In [1]:
import os
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, kstest, norm
import chardet
import re
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from statsmodels.graphics.gofplots import qqplot

INPUT_FOLDER = "filtered_data"
RESULTS_FOLDER = "check_normality"
GRAPHS_FOLDER = os.path.join(RESULTS_FOLDER, "graphs_of_distributions")
for folder in [RESULTS_FOLDER, GRAPHS_FOLDER]:
    os.makedirs(folder, exist_ok=True)

sns.set_style("whitegrid")
plt.rcParams['font.family'] = 'Arial'
plt.rcParams['font.size'] = 12
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['axes.labelsize'] = 12

def detect_encoding(file_path):
    """–í–∏–∑–Ω–∞—á–∞—î –∫–æ–¥—É–≤–∞–Ω–Ω—è —Ñ–∞–π–ª—É."""
    try:
        with open(file_path, 'rb') as f:
            return chardet.detect(f.read(10000))['encoding'] or 'utf-8'
    except Exception as e:
        print(f"–ü–æ–º–∏–ª–∫–∞ –≤–∏–∑–Ω–∞—á–µ–Ω–Ω—è –∫–æ–¥—É–≤–∞–Ω–Ω—è –¥–ª—è {file_path}: {e}")
        return 'utf-8'

def detect_separator(file_path, encoding):
    """–í–∏–∑–Ω–∞—á–∞—î —Ä–æ–∑–¥—ñ–ª—å–Ω–∏–∫ —É CSV-—Ñ–∞–π–ª—ñ."""
    try:
        with open(file_path, encoding=encoding) as f:
            sample = f.read(2048)
            return csv.Sniffer().sniff(sample).delimiter
    except Exception as e:
        print(f"–ü–æ–º–∏–ª–∫–∞ –≤–∏–∑–Ω–∞—á–µ–Ω–Ω—è —Ä–æ–∑–¥—ñ–ª—å–Ω–∏–∫–∞ –¥–ª—è {file_path}: {e}")
        return ','

def load_csv(file_path):
    """–ó–∞–≤–∞–Ω—Ç–∞–∂—É—î CSV-—Ñ–∞–π–ª."""
    encoding = detect_encoding(file_path)
    sep = detect_separator(file_path, encoding)
    print(f"   –ö–æ–¥—É–≤–∞–Ω–Ω—è: {encoding}, —Ä–æ–∑–¥—ñ–ª—å–Ω–∏–∫: '{sep}'")
    
    try:
        df = pd.read_csv(file_path, encoding=encoding, sep=sep, quotechar='"', on_bad_lines='skip')
        print(f"   –ü–µ—Ä—à—ñ 5 –∫–æ–ª–æ–Ω–æ–∫: {list(df.columns[:5])}")
        return df
    except Exception as e:
        print(f"–ü–æ–º–∏–ª–∫–∞ —á–∏—Ç–∞–Ω–Ω—è {file_path}: {e}")
        return None

def detect_dataset_type(filename):
    """–í–∏–∑–Ω–∞—á–∞—î —Ç–∏–ø –¥–∞—Ç–∞—Å–µ—Ç—É –∑–∞ –Ω–∞–∑–≤–æ—é —Ñ–∞–π–ª—É."""
    match = re.search(r'(\d{4})', filename)
    if match:
        year = int(match.group(1))
        return 'NMT' if year >= 2022 else 'ZNO'
    return None

def check_normality(df, dataset_type, filename, normality_results):
    """–ü–µ—Ä–µ–≤—ñ—Ä—è—î –Ω–æ—Ä–º–∞–ª—å–Ω—ñ—Å—Ç—å —Ä–æ–∑–ø–æ–¥—ñ–ª—É —Å–µ—Ä–µ–¥–Ω—å–æ—ó –æ—Ü—ñ–Ω–∫–∏."""
    score_columns = [col for col in df.columns if col.endswith('_score_100')]
    if not score_columns:
        print(f"‚ö†Ô∏è –î–ª—è {filename} –Ω–µ –∑–Ω–∞–π–¥–µ–Ω–æ –∫–æ–ª–æ–Ω–æ–∫ –∑ –æ—Ü—ñ–Ω–∫–∞–º–∏ (*_score_100)")
        return
    
    df['average_score'] = df[score_columns].mean(axis=1, skipna=True)
    scores = df['average_score'].dropna()
    
    if len(scores) < 3:
        print(f"‚ö†Ô∏è –ù–µ–¥–æ—Å—Ç–∞—Ç–Ω—å–æ –¥–∞–Ω–∏—Ö –¥–ª—è –ø–µ—Ä–µ–≤—ñ—Ä–∫–∏ –Ω–æ—Ä–º–∞–ª—å–Ω–æ—Å—Ç—ñ –≤ {filename}")
        return
    
    ks_stat, ks_p = kstest(scores, 'norm', args=(scores.mean(), scores.std()))
  
    normality_results.append({
        'file': filename,
        'dataset_type': dataset_type,
        'sample_size': len(scores),
        'ks_stat': ks_stat,
        'ks_p_value': ks_p,
    })
    
    plt.figure(figsize=(10, 6))
    sns.histplot(scores, kde=False, stat='density', bins=30, color='#36A2EB', edgecolor='black')
    x = np.linspace(scores.min(), scores.max(), 100)
    plt.plot(x, norm.pdf(x, scores.mean(), scores.std()), 'r-', lw=2, label='–ù–æ—Ä–º–∞–ª—å–Ω–∏–π —Ä–æ–∑–ø–æ–¥—ñ–ª')
    plt.title(f'–†–æ–∑–ø–æ–¥—ñ–ª —Å–µ—Ä–µ–¥–Ω—å–æ—ó –æ—Ü—ñ–Ω–∫–∏ ({dataset_type}, {filename})', pad=20)
    plt.xlabel('–°–µ—Ä–µ–¥–Ω—è –æ—Ü—ñ–Ω–∫–∞')
    plt.ylabel('–ì—É—Å—Ç–æ—Ç–∞')
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(GRAPHS_FOLDER, f'normality_hist_{filename}.png'), dpi=300, bbox_inches='tight')
    plt.close()
    
    plt.figure(figsize=(10, 6))
    qqplot(scores, line='s', marker='o', markerfacecolor='#FF6384', markeredgecolor='black', alpha=0.5)
    plt.title(f'Q-Q –ø–ª–æ—Ç —Å–µ—Ä–µ–¥–Ω—å–æ—ó –æ—Ü—ñ–Ω–∫–∏ ({dataset_type}, {filename})', pad=20)
    plt.xlabel('–¢–µ–æ—Ä–µ—Ç–∏—á–Ω—ñ –∫–≤–∞–Ω—Ç–∏–ª—ñ')
    plt.ylabel('–ï–º–ø—ñ—Ä–∏—á–Ω—ñ –∫–≤–∞–Ω—Ç–∏–ª—ñ')
    plt.tight_layout()
    plt.savefig(os.path.join(GRAPHS_FOLDER, f'qq_plot_{filename}.png'), dpi=300, bbox_inches='tight')
    plt.close()

normality_results = []


for filename in os.listdir(INPUT_FOLDER):
    if not filename.endswith(".csv"):
        continue
    
    path = os.path.join(INPUT_FOLDER, filename)
    print(f"üîÑ –û–±—Ä–æ–±–∫–∞ {filename}...")
    
    try:
        df = load_csv(path)
        if df is None or df.empty:
            print(f"‚ö†Ô∏è –§–∞–π–ª {filename} –ø–æ—Ä–æ–∂–Ω—ñ–π –∞–±–æ –Ω–µ –≤–¥–∞–ª–æ—Å—è –∑–∞–≤–∞–Ω—Ç–∞–∂–∏—Ç–∏")
            continue
        
        dataset_type = detect_dataset_type(filename)
        if not dataset_type:
            print(f"‚ö†Ô∏è –ù–µ–≤—ñ–¥–æ–º–∏–π —Ç–∏–ø –¥–∞–Ω–∏—Ö: {filename}")
            continue
        
        print(f"   –ó–∞–≤–∞–Ω—Ç–∞–∂–µ–Ω–æ {len(df)} —Ä—è–¥–∫—ñ–≤, {len(df.columns)} —Å—Ç–æ–≤–ø—Ü—ñ–≤")
        
        check_normality(df, dataset_type, filename, normality_results)
        
    except Exception as e:
        print(f"‚ùå –ü–æ–º–∏–ª–∫–∞ —É —Ñ–∞–π–ª—ñ {filename}: {e}")
        import traceback
        traceback.print_exc()

pd.DataFrame(normality_results).to_csv(os.path.join(RESULTS_FOLDER, 'normality_tests.csv'), index=False)
print(f"\nüéâ –ê–Ω–∞–ª—ñ–∑ –∑–∞–≤–µ—Ä—à–µ–Ω–æ! –†–µ–∑—É–ª—å—Ç–∞—Ç–∏ –∑–±–µ—Ä–µ–∂–µ–Ω–æ –≤ '{RESULTS_FOLDER}'")

üîÑ –û–±—Ä–æ–±–∫–∞ 2020.csv...
   –ö–æ–¥—É–≤–∞–Ω–Ω—è: utf-8, —Ä–æ–∑–¥—ñ–ª—å–Ω–∏–∫: ','


  df = pd.read_csv(file_path, encoding=encoding, sep=sep, quotechar='"', on_bad_lines='skip')


   –ü–µ—Ä—à—ñ 5 –∫–æ–ª–æ–Ω–æ–∫: ['id', 'birth_year', 'gender', 'region_name', 'area_name']
   –ó–∞–≤–∞–Ω—Ç–∞–∂–µ–Ω–æ 201212 —Ä—è–¥–∫—ñ–≤, 131 —Å—Ç–æ–≤–ø—Ü—ñ–≤
üîÑ –û–±—Ä–æ–±–∫–∞ 2021.csv...
   –ö–æ–¥—É–≤–∞–Ω–Ω—è: utf-8, —Ä–æ–∑–¥—ñ–ª—å–Ω–∏–∫: ','


  df = pd.read_csv(file_path, encoding=encoding, sep=sep, quotechar='"', on_bad_lines='skip')


   –ü–µ—Ä—à—ñ 5 –∫–æ–ª–æ–Ω–æ–∫: ['id', 'birth_year', 'gender', 'region_name', 'area_name']
   –ó–∞–≤–∞–Ω—Ç–∞–∂–µ–Ω–æ 188609 —Ä—è–¥–∫—ñ–≤, 152 —Å—Ç–æ–≤–ø—Ü—ñ–≤
üîÑ –û–±—Ä–æ–±–∫–∞ 2023.csv...
   –ö–æ–¥—É–≤–∞–Ω–Ω—è: utf-8, —Ä–æ–∑–¥—ñ–ª—å–Ω–∏–∫: ','
   –ü–µ—Ä—à—ñ 5 –∫–æ–ª–æ–Ω–æ–∫: ['id', 'birth_year', 'gender', 'region_name', 'area_name']
   –ó–∞–≤–∞–Ω—Ç–∞–∂–µ–Ω–æ 256313 —Ä—è–¥–∫—ñ–≤, 64 —Å—Ç–æ–≤–ø—Ü—ñ–≤
üîÑ –û–±—Ä–æ–±–∫–∞ 2022.csv...
   –ö–æ–¥—É–≤–∞–Ω–Ω—è: utf-8, —Ä–æ–∑–¥—ñ–ª—å–Ω–∏–∫: ','


  df = pd.read_csv(file_path, encoding=encoding, sep=sep, quotechar='"', on_bad_lines='skip')


   –ü–µ—Ä—à—ñ 5 –∫–æ–ª–æ–Ω–æ–∫: ['id', 'birth_year', 'gender', 'region_name', 'area_name']
   –ó–∞–≤–∞–Ω—Ç–∞–∂–µ–Ω–æ 213647 —Ä—è–¥–∫—ñ–≤, 36 —Å—Ç–æ–≤–ø—Ü—ñ–≤
üîÑ –û–±—Ä–æ–±–∫–∞ 2019.csv...
   –ö–æ–¥—É–≤–∞–Ω–Ω—è: utf-8, —Ä–æ–∑–¥—ñ–ª—å–Ω–∏–∫: ','


  df = pd.read_csv(file_path, encoding=encoding, sep=sep, quotechar='"', on_bad_lines='skip')


   –ü–µ—Ä—à—ñ 5 –∫–æ–ª–æ–Ω–æ–∫: ['id', 'birth_year', 'gender', 'region_name', 'area_name']
   –ó–∞–≤–∞–Ω—Ç–∞–∂–µ–Ω–æ 172734 —Ä—è–¥–∫—ñ–≤, 131 —Å—Ç–æ–≤–ø—Ü—ñ–≤
üîÑ –û–±—Ä–æ–±–∫–∞ 2024.csv...
   –ö–æ–¥—É–≤–∞–Ω–Ω—è: utf-8, —Ä–æ–∑–¥—ñ–ª—å–Ω–∏–∫: ','
   –ü–µ—Ä—à—ñ 5 –∫–æ–ª–æ–Ω–æ–∫: ['id', 'birth_year', 'gender', 'region_name', 'area_name']
   –ó–∞–≤–∞–Ω—Ç–∞–∂–µ–Ω–æ 264164 —Ä—è–¥–∫—ñ–≤, 78 —Å—Ç–æ–≤–ø—Ü—ñ–≤

üéâ –ê–Ω–∞–ª—ñ–∑ –∑–∞–≤–µ—Ä—à–µ–Ω–æ! –†–µ–∑—É–ª—å—Ç–∞—Ç–∏ –∑–±–µ—Ä–µ–∂–µ–Ω–æ –≤ 'check_normality'


<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>