In [None]:
import pandas as pd
import os


# Yearly input CSVs are read from INPUT_FOLDER; generated tables are written
# to RESULTS_FOLDER, which is created here if it does not exist yet.
INPUT_FOLDER = "filtered_data"
RESULTS_FOLDER = "results"
os.makedirs(RESULTS_FOLDER, exist_ok=True)


# Values of the education_org_type column that are kept for the analysis.
selected_types = [
    "–ª—ñ—Ü–µ–π",
    "–≥—ñ–º–Ω–∞–∑—ñ—è",
    "–Ω–∞–≤—á–∞–ª—å–Ω–æ-–≤–∏—Ö–æ–≤–Ω–∏–π –∫–æ–º–ø–ª–µ–∫—Å",
    "—Å–ø–µ—Ü—ñ–∞–ª—ñ–∑–æ–≤–∞–Ω–∞ —à–∫–æ–ª–∞",
    "–∑–∞–∫–ª–∞–¥ —Ñ–∞—Ö–æ–≤–æ—ó –ø–µ—Ä–µ–¥–≤–∏—â–æ—ó –æ—Å–≤—ñ—Ç–∏",
    "—Å–µ—Ä–µ–¥–Ω—è –∑–∞–≥–∞–ª—å–Ω–æ–æ—Å–≤—ñ—Ç–Ω—è —à–∫–æ–ª–∞",
]

def count_education_types_by_location(df, year):
    """Cross-tabulate the selected institution types against settlement types.

    Rows whose ``education_org_type`` is not listed in the module-level
    ``selected_types`` are ignored; the remaining rows are counted per
    (``territory_type``, ``education_org_type``) pair.

    Args:
        df: DataFrame expected to contain the ``education_org_type`` and
            ``territory_type`` columns.
        year: Label used only inside the warning messages.

    Returns:
        A DataFrame indexed by ``territory_type`` (ordered by first
        appearance among the kept rows) with one column per institution type
        present and 0 where a combination does not occur, or ``None`` when a
        required column is missing or no kept row has a non-null
        ``territory_type``.
    """
    if any(name not in df.columns for name in ('education_org_type', 'territory_type')):
        print(f"‚ö†Ô∏è –í—ñ–¥—Å—É—Ç–Ω—ñ –∫–æ–ª–æ–Ω–∫–∏: education_org_type –∞–±–æ territory_type —É —Ñ–∞–π–ª—ñ –¥–ª—è {year}")
        return None

    # Keep only rows that belong to one of the institution types of interest.
    type_mask = df['education_org_type'].isin(selected_types)
    kept_rows = df.loc[type_mask].copy()

    # Settlement types actually observed, in order of first appearance.
    observed_locations = kept_rows['territory_type'].dropna().unique().tolist()
    if len(observed_locations) == 0:
        print(f"‚ö†Ô∏è –ñ–æ–¥–Ω–∏—Ö –¥–∞–Ω–∏—Ö –ø—Ä–æ territory_type –¥–ª—è {year}")
        return None

    pair_counts = kept_rows.groupby(['territory_type', 'education_org_type']).size()
    table = pair_counts.unstack(fill_value=0)

    # Re-order the rows from groupby's sorted order to first-appearance order.
    return table.loc[observed_locations]


# Count tables keyed by the year label taken from each file name.
all_results = {}
for filename in os.listdir(INPUT_FOLDER):
    if filename.endswith(".csv"):
        # Everything before the first dot is used as the year label.
        year = filename.partition('.')[0]
        file_path = os.path.join(INPUT_FOLDER, filename)
        print(f"üîÑ –û–±—Ä–æ–±–∫–∞ {filename} ({year})...")

        try:
            df = pd.read_csv(
                file_path,
                encoding='utf-8',
                on_bad_lines='skip',
                low_memory=False,
            )
            print(f"   –ó–∞–≤–∞–Ω—Ç–∞–∂–µ–Ω–æ {len(df)} —Ä—è–¥–∫—ñ–≤, {len(df.columns)} —Å—Ç–æ–≤–ø—Ü—ñ–≤")

            result = count_education_types_by_location(df, year)
            if result is not None:
                all_results[year] = result
                print(f"‚úÖ –î–∞–Ω—ñ –¥–ª—è {filename} ({year}) –æ–±—Ä–æ–±–ª–µ–Ω–æ")

        except Exception as e:
            # A failure in one file is reported and the loop moves on.
            print(f"‚ùå –ü–æ–º–∏–ª–∫–∞ —É —Ñ–∞–π–ª—ñ {filename}: {e}")
            import traceback
            traceback.print_exc()


# Write every collected table to its own CSV, or report that none was built.
if not all_results:
    print("‚ö†Ô∏è –ñ–æ–¥–Ω–∏—Ö –¥–∞–Ω–∏—Ö –Ω–µ –æ–±—Ä–æ–±–ª–µ–Ω–æ")
else:
    for year, result in all_results.items():
        output_path = os.path.join(RESULTS_FOLDER, f"education_types_by_location_{year}.csv")
        result.to_csv(output_path)
        print(f"‚úÖ –†–µ–∑—É–ª—å—Ç–∞—Ç–∏ –¥–ª—è {year} –∑–±–µ—Ä–µ–∂–µ–Ω–æ —É: {output_path}")

print(f"\nüéâ –ê–Ω–∞–ª—ñ–∑ –∑–∞–≤–µ—Ä—à–µ–Ω–æ! –†–µ–∑—É–ª—å—Ç–∞—Ç–∏ –∑–±–µ—Ä–µ–∂–µ–Ω–æ –≤ '{RESULTS_FOLDER}'")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os


# Input CSVs come from INPUT_FOLDER; charts are written to a "graphs"
# sub-folder of RESULTS_FOLDER. Both output folders are created up front.
INPUT_FOLDER = "filtered_data"
RESULTS_FOLDER = "results"
GRAPHS_FOLDER = os.path.join(RESULTS_FOLDER, "graphs")
for folder in (RESULTS_FOLDER, GRAPHS_FOLDER):
    os.makedirs(folder, exist_ok=True)

# Values of the education_org_type column that are kept for the analysis.
selected_types = [
    "–ª—ñ—Ü–µ–π",
    "–≥—ñ–º–Ω–∞–∑—ñ—è",
    "–Ω–∞–≤—á–∞–ª—å–Ω–æ-–≤–∏—Ö–æ–≤–Ω–∏–π –∫–æ–º–ø–ª–µ–∫—Å",
    "—Å–ø–µ—Ü—ñ–∞–ª—ñ–∑–æ–≤–∞–Ω–∞ —à–∫–æ–ª–∞",
    "–∑–∞–∫–ª–∞–¥ —Ñ–∞—Ö–æ–≤–æ—ó –ø–µ—Ä–µ–¥–≤–∏—â–æ—ó –æ—Å–≤—ñ—Ç–∏",
    "—Å–µ—Ä–µ–¥–Ω—è –∑–∞–≥–∞–ª—å–Ω–æ–æ—Å–≤—ñ—Ç–Ω—è —à–∫–æ–ª–∞",
]

def visualize_education_types_by_location_2022():
    """Plot the 2022 share of each institution type within each settlement type.

    Reads ``2022.csv`` from ``INPUT_FOLDER``, keeps the rows whose
    ``education_org_type`` is in ``selected_types`` and whose
    ``territory_type`` is one of three hard-coded settlement categories,
    counts rows per (settlement type, institution type) pair, divides every
    count by the number of kept rows of that settlement type, and saves the
    result as a grouped bar chart (PNG, 300 dpi) under ``GRAPHS_FOLDER``.

    Returns:
        None. A warning is printed and the function returns early when a
        required column is missing or no row survives the filters. Any
        exception is caught, printed together with its traceback, and not
        re-raised.
    """
    filename = "2022.csv"
    file_path = os.path.join(INPUT_FOLDER, filename)

    print(f"üîÑ –û–±—Ä–æ–±–∫–∞ {filename}...")

    try:
        df = pd.read_csv(file_path, encoding='utf-8', on_bad_lines='skip', low_memory=False)
        print(f"   –ó–∞–≤–∞–Ω—Ç–∞–∂–µ–Ω–æ {len(df)} —Ä—è–¥–∫—ñ–≤, {len(df.columns)} —Å—Ç–æ–≤–ø—Ü—ñ–≤")

        required_cols = ['education_org_type', 'territory_type']
        if not all(col in df.columns for col in required_cols):
            print("‚ö†Ô∏è –í—ñ–¥—Å—É—Ç–Ω—ñ –∫–æ–ª–æ–Ω–∫–∏: education_org_type –∞–±–æ territory_type")
            return

        df_filtered = df[df['education_org_type'].isin(selected_types)].copy()

        # Settlement categories shown on the x axis, in display order.
        locations = ["—Å–µ–ª–∏—â–µ, —Å–µ–ª–æ", "—Å–µ–ª–∏—â–µ –º—ñ—Å—å–∫–æ–≥–æ —Ç–∏–ø—É", "–º—ñ—Å—Ç–æ"]
        df_filtered = df_filtered[df_filtered['territory_type'].isin(locations)]

        if df_filtered.empty:
            print("‚ö†Ô∏è –ñ–æ–¥–Ω–∏—Ö –¥–∞–Ω–∏—Ö –ø—Ä–æ territory_type —Å–µ—Ä–µ–¥ –∑–∞–¥–∞–Ω–∏—Ö –∫–∞—Ç–µ–≥–æ—Ä—ñ–π")
            return

        # Kept rows per settlement type: the denominator of the shares below.
        total_locations = df_filtered['territory_type'].value_counts().reindex(locations, fill_value=0)

        result = df_filtered.groupby(['territory_type', 'education_org_type']).size().unstack(fill_value=0)

        # Institution types with no rows still get a (zero) column so that
        # every selected type appears in the chart legend.
        for edu_type in selected_types:
            if edu_type not in result.columns:
                result[edu_type] = 0

        result = result.reindex(index=locations)

        print(f"   –ù–∞—è–≤–Ω—ñ —Ç–∏–ø–∏ –∑–∞–∫–ª–∞–¥—ñ–≤: {list(result.columns)}")

        # Row-wise division: each count becomes a share of its settlement type.
        relative_result = result.div(total_locations, axis=0)

        # Long format (one row per settlement/type pair) as expected by catplot.
        relative_result_long = relative_result.reset_index().melt(
            id_vars='territory_type',
            var_name='education_org_type',
            value_name='relative_count'
        )

        sns.set_style("whitegrid")

        # catplot is a figure-level function that creates its own figure, so
        # no plt.figure() call precedes it: a figure opened here would stay
        # empty and would never be closed.
        g = sns.catplot(
            x='territory_type',
            y='relative_count',
            hue='education_org_type',
            data=relative_result_long,
            kind='bar',
            height=6,
            aspect=1.5,
            palette='Set3',
            alpha=0.8
        )

        graph_path = os.path.join(GRAPHS_FOLDER, "education_types_by_location_2022_relative.png")
        try:
            g.figure.suptitle("Relative distribution of types of establishments by type of settlement (2022)", y=1.05)
            g.set_axis_labels("Type of settlement", "Relative number of establishments per settlement")
            g.set_xticklabels(rotation=45, ha='right')

            # NOTE(review): relies on the private FacetGrid._legend attribute,
            # and Legend.set_loc presumably needs a recent Matplotlib - confirm.
            g._legend.set_bbox_to_anchor((1.05, 1.1))
            g._legend.set_loc('upper right')

            g.figure.tight_layout()
            g.figure.savefig(graph_path, dpi=300, bbox_inches="tight")
        finally:
            # Close exactly the figure catplot created, even if styling or
            # saving failed, so no figure is left open.
            plt.close(g.figure)
        print(f"‚úÖ –ì—Ä–∞—Ñ—ñ–∫ –∑–±–µ—Ä–µ–∂–µ–Ω–æ —É: {graph_path}")

    except Exception as e:
        print(f"‚ùå –ü–æ–º–∏–ª–∫–∞ —É —Ñ–∞–π–ª—ñ {filename}: {e}")
        import traceback
        traceback.print_exc()

# Build and save the 2022 chart when this cell is executed.
visualize_education_types_by_location_2022()

# Closing status line. It is printed unconditionally, i.e. also when the
# function above returned early or reported an error.
print(f"\nüéâ –ê–Ω–∞–ª—ñ–∑ –∑–∞–≤–µ—Ä—à–µ–Ω–æ! –†–µ–∑—É–ª—å—Ç–∞—Ç–∏ –∑–±–µ—Ä–µ–∂–µ–Ω–æ –≤ '{RESULTS_FOLDER}'")

In [31]:
import pandas as pd
import os
import re

# Yearly input CSVs are read from INPUT_FOLDER; the per-year tables are
# written to RESULTS_FOLDER, which is created here if it does not exist yet.
INPUT_FOLDER = "filtered_data"
RESULTS_FOLDER = "results"
for folder in (RESULTS_FOLDER,):
    os.makedirs(folder, exist_ok=True)

# Values of the education_org_type column that are kept for the analysis.
selected_types = [
    "–ª—ñ—Ü–µ–π",
    "–≥—ñ–º–Ω–∞–∑—ñ—è",
    "–Ω–∞–≤—á–∞–ª—å–Ω–æ-–≤–∏—Ö–æ–≤–Ω–∏–π –∫–æ–º–ø–ª–µ–∫—Å",
    "—Å–ø–µ—Ü—ñ–∞–ª—ñ–∑–æ–≤–∞–Ω–∞ —à–∫–æ–ª–∞",
    "–∑–∞–∫–ª–∞–¥ —Ñ–∞—Ö–æ–≤–æ—ó –ø–µ—Ä–µ–¥–≤–∏—â–æ—ó –æ—Å–≤—ñ—Ç–∏",
    "—Å–µ—Ä–µ–¥–Ω—è –∑–∞–≥–∞–ª—å–Ω–æ–æ—Å–≤—ñ—Ç–Ω—è —à–∫–æ–ª–∞",
]

def analyze_averages_by_year(input_folder=None, results_folder=None,
                             types=None, locations=None):
    """Write one table of group sizes and mean scores per yearly CSV file.

    For every ``*.csv`` file in the input folder whose name contains a
    four-digit year, the rows are restricted to the given institution types
    and settlement types, grouped by (``territory_type``,
    ``education_org_type``), and saved as ``average_scores_<year>.csv`` with
    the columns ``territory_type``, ``education_org_type``, ``count``,
    ``average_score`` (group mean) and ``year``.

    Files without a year in their name, without the required columns, or
    without any matching row are skipped with a warning. An exception raised
    while handling one file is printed with its traceback and does not stop
    the remaining files.

    Args:
        input_folder: Folder scanned for CSV files. Defaults to the
            module-level ``INPUT_FOLDER``.
        results_folder: Folder the tables are written to (created when
            missing). Defaults to the module-level ``RESULTS_FOLDER``.
        types: Accepted ``education_org_type`` values. Defaults to the
            module-level ``selected_types``.
        locations: Accepted ``territory_type`` values. Defaults to the five
            settlement categories listed below.

    Returns:
        None.
    """
    # Defaults are resolved at call time so that the zero-argument call keeps
    # using the module-level configuration.
    if input_folder is None:
        input_folder = INPUT_FOLDER
    if results_folder is None:
        results_folder = RESULTS_FOLDER
    if types is None:
        types = selected_types
    if locations is None:
        locations = ["—Å–µ–ª–∏—â–µ, —Å–µ–ª–æ", "—Å–µ–ª–∏—â–µ –º—ñ—Å—å–∫–æ–≥–æ —Ç–∏–ø—É", "–º—ñ—Å—Ç–æ", "—ñ–Ω—à–∞ –∫—Ä–∞—ó–Ω–∞", "—Å–µ–ª–æ"]

    os.makedirs(results_folder, exist_ok=True)

    required_cols = ['education_org_type', 'territory_type', 'average_score']
    group_cols = ['territory_type', 'education_org_type']

    # os.listdir() returns names in arbitrary order; sorting makes the
    # processing order (and the log) reproducible.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(".csv"):
            continue
        year_match = re.search(r'(\d{4})', filename)
        if not year_match:
            print(f"‚ö†Ô∏è –ù–µ–º–æ–∂–ª–∏–≤–æ –≤–∏–∑–Ω–∞—á–∏—Ç–∏ —Ä—ñ–∫ –¥–ª—è {filename}")
            continue
        year = year_match.group(1)

        file_path = os.path.join(input_folder, filename)
        print(f"üîÑ –û–±—Ä–æ–±–∫–∞ {filename} ({year})...")

        try:
            df = pd.read_csv(file_path, encoding='utf-8', on_bad_lines='skip', low_memory=False)
            print(f"   –ó–∞–≤–∞–Ω—Ç–∞–∂–µ–Ω–æ {len(df)} —Ä—è–¥–∫—ñ–≤, {len(df.columns)} —Å—Ç–æ–≤–ø—Ü—ñ–≤")

            missing_cols = [col for col in required_cols if col not in df.columns]
            if missing_cols:
                print(f"‚ö†Ô∏è –í—ñ–¥—Å—É—Ç–Ω—ñ –∫–æ–ª–æ–Ω–∫–∏: {', '.join(missing_cols)} —É {filename}")
                continue

            df_filtered = df[df['education_org_type'].isin(types)]
            df_filtered = df_filtered[df_filtered['territory_type'].isin(locations)]

            if df_filtered.empty:
                print(f"‚ö†Ô∏è –ñ–æ–¥–Ω–∏—Ö –¥–∞–Ω–∏—Ö –¥–ª—è –≤–∏–±—Ä–∞–Ω–∏—Ö —Ç–∏–ø—ñ–≤ –∑–∞–∫–ª–∞–¥—ñ–≤ –∞–±–æ –ø–æ—Å–µ–ª–µ–Ω—å —É {filename}")
                continue

            # Named aggregation computes both statistics in one pass and keeps
            # them label-aligned. A plain dict cannot hold two entries for the
            # same column: the second key silently replaces the first.
            avg_result = (
                df_filtered.groupby(group_cols)
                .agg(
                    count=('average_score', 'size'),
                    average_score=('average_score', 'mean'),
                )
                .reset_index()
            )

            avg_result['year'] = year

            output_path = os.path.join(results_folder, f"average_scores_{year}.csv")
            avg_result.to_csv(output_path, index=False)
            print(f"‚úÖ –¢–∞–±–ª–∏—á–∫–∞ –¥–ª—è {year} –∑–±–µ—Ä–µ–∂–µ–Ω–æ —É: {output_path}")

        except Exception as e:
            print(f"‚ùå –ü–æ–º–∏–ª–∫–∞ —É —Ñ–∞–π–ª—ñ {filename}: {e}")
            import traceback
            traceback.print_exc()

    print(f"\nüéâ –ê–Ω–∞–ª—ñ–∑ –∑–∞–≤–µ—Ä—à–µ–Ω–æ! –†–µ–∑—É–ª—å—Ç–∞—Ç–∏ –∑–±–µ—Ä–µ–∂–µ–Ω–æ –≤ '{results_folder}'")

# Generate the per-year average-score tables when this cell is executed.
analyze_averages_by_year()

🔄 Обробка 2020.csv (2020)...
   Завантажено 201212 рядків, 131 стовпців
✅ Табличка для 2020 збережено у: results/average_scores_2020.csv
🔄 Обробка 2021.csv (2021)...
   Завантажено 188609 рядків, 152 стовпців
✅ Табличка для 2021 збережено у: results/average_scores_2021.csv
🔄 Обробка 2023.csv (2023)...
   Завантажено 256313 рядків, 64 стовпців
✅ Табличка для 2023 збережено у: results/average_scores_2023.csv
🔄 Обробка 2022.csv (2022)...
   Завантажено 213647 рядків, 36 стовпців
✅ Табличка для 2022 збережено у: results/average_scores_2022.csv
🔄 Обробка 2019.csv (2019)...
   Завантажено 172734 рядків, 131 стовпців
✅ Табличка для 2019 збережено у: results/average_scores_2019.csv
🔄 Обробка 2024.cs