In [24]:
import pandas as pd
import os
import chardet
import re
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

# Set up directories
graphs_dir = os.path.join('graphs')
os.makedirs(graphs_dir, exist_ok=True)

# Detect encoding of a file
def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        return chardet.detect(f.read(10000))['encoding']

# Extract year from filename
def extract_year_from_filename(filename):
    match = re.search(r'(\d{4})\.csv', filename)
    return int(match.group(1)) if match else None

# Function to generate and save histogram with gradient colors
def generate_and_save_histogram(data, year):
    # Define bins for average_score (up to 200 with step 20)
    bins = range(100, 201, 20)  # From 100 to 200 with step 20
    filtered_data = data[data['average_score'] <= 200]  # Limit to 200
    
    plt.figure(figsize=(10, 6))
    n, bins, patches = plt.hist(filtered_data['average_score'], bins=bins, edgecolor='white', linewidth=1.5)
    
    # Create a soft gradient colormap
    colors = ['#A3BFFA', '#96CEB4', '#D4A5A5', '#FFD3B6', '#FFAAA5', '#CDB4DB']  # Soft pastel colors
    cmap = LinearSegmentedColormap.from_list('custom_gradient', colors, N=len(patches))
    for i, patch in enumerate(patches):
        patch.set_facecolor(cmap(i / len(patches)))
        patch.set_edgecolor('white')
        patch.set_linewidth(1.5)
    
    # Enhance design with simple year title
    plt.title(f'{year}', fontsize=16, pad=15, fontweight='bold', color='#333333')
    plt.xlabel('Average Score Range', fontsize=12, color='#555555')
    plt.ylabel('Count', fontsize=12, color='#555555')
    plt.grid(True, alpha=0.1, linestyle='--', linewidth=0.5, color='#CCCCCC')
    plt.xticks(bins, fontsize=10, color='#666666')
    plt.yticks(fontsize=10, color='#666666')
    plt.gca().set_facecolor('#F5F6F5')
    
    # Save as SVG
    output_path = os.path.join(graphs_dir, f'average_score_histogram_{year}.svg')
    plt.savefig(output_path, format='svg', bbox_inches='tight', transparent=True)
    plt.close()

# Function to generate and save line plot for multiple years
def generate_and_save_line_plot(data_by_year):
    years = sorted(data_by_year.keys())
    if len(years) < 6:
        print("‚ö†Ô∏è –ù–µ–¥–æ—Å—Ç–∞—Ç–Ω—å–æ —Ä–æ–∫—ñ–≤ –¥–ª—è 6 –ª—ñ–Ω—ñ–π. –ó–Ω–∞–π–¥–µ–Ω–æ —Ç—ñ–ª—å–∫–∏:", years)
        return
    
    plt.figure(figsize=(12, 6))
    
    # Soft colors for lines
    line_colors = ['#A3BFFA', '#96CEB4', '#D4A5A5', '#FFD3B6', '#FFAAA5', '#CDB4DB']
    
    for i, year in enumerate(years[:6]):  # Limit to 6 years
        data = data_by_year[year][data_by_year[year]['average_score'] <= 200]
        bins = range(100, 201, 20)  # Ensure range up to 200
        hist, bin_edges = np.histogram(data['average_score'], bins=bins, density=True)
        # Use full bin_edges including the last point
        plt.plot(bin_edges, np.append(hist, 0), marker='o', label=f'{year}', linewidth=2, color=line_colors[i % len(line_colors)], markeredgecolor='white', markersize=6)
    
    # Enhance design with simple title
    plt.title('Average Scores Distribution', fontsize=16, pad=15, fontweight='bold', color='#333333')
    plt.xlabel('Average Score', fontsize=12, color='#555555')
    plt.ylabel('Density', fontsize=12, color='#555555')
    plt.grid(True, alpha=0.1, linestyle='--', linewidth=0.5, color='#CCCCCC')
    plt.xticks(range(100, 201, 20), fontsize=10, color='#666666')
    plt.yticks(fontsize=10, color='#666666')
    plt.legend(title='', fontsize=10, frameon=True, facecolor='#F5F6F5', edgecolor='white', loc='best')
    plt.gca().set_facecolor('#F5F6F5')
    
    # Save as SVG
    output_path = os.path.join(graphs_dir, 'average_score_line_plot.svg')
    plt.savefig(output_path, format='svg', bbox_inches='tight', transparent=True)
    plt.close()

# Process data and generate histograms and line plot
def process_and_generate_histograms(input_folder="filtered_data", output_folder="aggregated_data"):
    os.makedirs(output_folder, exist_ok=True)
    data_by_year = {}

    available_years = set()
    for filename in os.listdir(input_folder):
        if filename.endswith(".csv"):
            year = extract_year_from_filename(filename)
            if year:
                available_years.add(year)

    if not available_years:
        print("‚ö†Ô∏è –ù–µ –∑–Ω–∞–π–¥–µ–Ω–æ —Ñ–∞–π–ª—ñ–≤ —ñ–∑ —Ä–æ–∫–∞–º–∏ –≤ –Ω–∞–∑–≤–∞—Ö")
        return

    for filename in os.listdir(input_folder):
        if filename.endswith(".csv"):
            input_path = os.path.join(input_folder, filename)
            year = extract_year_from_filename(filename)
            if year not in available_years:
                continue
            print(f"üîÑ –û–±—Ä–æ–±–∫–∞ {filename} (—Ä—ñ–∫: {year})...")
            
            try:
                encoding = detect_encoding(input_path)
                df = pd.read_csv(input_path, encoding=encoding, low_memory=False)
                
                initial_count = len(df)
                filtered_df = df[
                    (df['subjects_count'] >= 3) & 
                    (df['total_score'] > 0) & 
                    (df['average_score'] > 100)
                ]
                new_count = len(filtered_df)
                
                if new_count == 0:
                    print(f"‚ö†Ô∏è –£ —Ñ–∞–π–ª—ñ {filename} –Ω–µ–º–∞—î –¥–∞–Ω–∏—Ö –ø—ñ—Å–ª—è —Ñ—ñ–ª—å—Ç—Ä–∞—Ü—ñ—ó")
                    continue

                filtered_df['year'] = year
                data_by_year[year] = filtered_df

                # Generate and save histogram
                generate_and_save_histogram(filtered_df, year)
                print(f"‚úÖ –ó–±–µ—Ä–µ–∂–µ–Ω–æ –≥—ñ—Å—Ç–æ–≥—Ä–∞–º—É –¥–ª—è —Ä–æ–∫—É {year} —É {graphs_dir}/average_score_histogram_{year}.svg")
                print(f"   –ó–∞–ª–∏—à–µ–Ω–æ —Ä—è–¥–∫—ñ–≤: {new_count}/{initial_count} ({new_count/initial_count:.1%})")

            except Exception as e:
                print(f"‚ùå –ö—Ä–∏—Ç–∏—á–Ω–∞ –ø–æ–º–∏–ª–∫–∞ —É —Ñ–∞–π–ª—ñ {filename}: {str(e)}")

    # Generate and save line plot
    if data_by_year:
        generate_and_save_line_plot(data_by_year)
        print(f"‚úÖ –ó–±–µ—Ä–µ–∂–µ–Ω–æ –ª—ñ–Ω—ñ–π–Ω–∏–π –≥—Ä–∞—Ñ—ñ–∫ —É {graphs_dir}/average_score_line_plot.svg")

    print("\nüéâ –û–±—Ä–æ–±–∫—É —Ç–∞ –≥–µ–Ω–µ—Ä–∞—Ü—ñ—é –≥—ñ—Å—Ç–æ–≥—Ä–∞–º/–≥—Ä–∞—Ñ—ñ–∫–∞ –∑–∞–≤–µ—Ä—à–µ–Ω–æ! –†–µ–∑—É–ª—å—Ç–∞—Ç–∏ –∑–±–µ—Ä–µ–∂–µ–Ω–æ —É –ø–∞–ø—Ü—ñ 'graphs'")

# Run the process
if __name__ == "__main__":
    process_and_generate_histograms()

üîÑ –û–±—Ä–æ–±–∫–∞ 2020.csv (—Ä—ñ–∫: 2020)...
‚úÖ –ó–±–µ—Ä–µ–∂–µ–Ω–æ –≥—ñ—Å—Ç–æ–≥—Ä–∞–º—É –¥–ª—è —Ä–æ–∫—É 2020 —É graphs/average_score_histogram_2020.svg
   –ó–∞–ª–∏—à–µ–Ω–æ —Ä—è–¥–∫—ñ–≤: 201212/201212 (100.0%)
üîÑ –û–±—Ä–æ–±–∫–∞ 2021.csv (—Ä—ñ–∫: 2021)...
‚úÖ –ó–±–µ—Ä–µ–∂–µ–Ω–æ –≥—ñ—Å—Ç–æ–≥—Ä–∞–º—É –¥–ª—è —Ä–æ–∫—É 2021 —É graphs/average_score_histogram_2021.svg
   –ó–∞–ª–∏—à–µ–Ω–æ —Ä—è–¥–∫—ñ–≤: 188609/188609 (100.0%)
üîÑ –û–±—Ä–æ–±–∫–∞ 2023.csv (—Ä—ñ–∫: 2023)...
‚úÖ –ó–±–µ—Ä–µ–∂–µ–Ω–æ –≥—ñ—Å—Ç–æ–≥—Ä–∞–º—É –¥–ª—è —Ä–æ–∫—É 2023 —É graphs/average_score_histogram_2023.svg
   –ó–∞–ª–∏—à–µ–Ω–æ —Ä—è–¥–∫—ñ–≤: 256313/256313 (100.0%)
üîÑ –û–±—Ä–æ–±–∫–∞ 2022.csv (—Ä—ñ–∫: 2022)...
‚úÖ –ó–±–µ—Ä–µ–∂–µ–Ω–æ –≥—ñ—Å—Ç–æ–≥—Ä–∞–º—É –¥–ª—è —Ä–æ–∫—É 2022 —É graphs/average_score_histogram_2022.svg
   –ó–∞–ª–∏—à–µ–Ω–æ —Ä—è–¥–∫—ñ–≤: 213647/213647 (100.0%)
üîÑ –û–±—Ä–æ–±–∫–∞ 2019.csv (—Ä—ñ–∫: 2019)...
‚úÖ –ó–±–µ—Ä–µ–∂–µ–Ω–æ –≥—ñ—Å—Ç–æ–≥—Ä–∞–º—É –¥–ª—è —Ä–æ–∫—É 2019 —É graphs/average_score_histogram_2019.s

## Box Plots

In [26]:
import pandas as pd
import os
import chardet
import re
import matplotlib.pyplot as plt

graphs_dir = os.path.join('graphs')
os.makedirs(graphs_dir, exist_ok=True)

def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        return chardet.detect(f.read(10000))['encoding']

def extract_year_from_filename(filename):
    match = re.search(r'(\d{4})\.csv', filename)
    return int(match.group(1)) if match else None

def generate_and_save_box_plot(data_by_year):
    years = sorted(data_by_year.keys())
    if len(years) < 6:
        print("‚ö†Ô∏è –ù–µ–¥–æ—Å—Ç–∞—Ç–Ω—å–æ —Ä–æ–∫—ñ–≤ –¥–ª—è 6 –±–æ–∫—Å-–ø–ª–æ—Ç—ñ–≤. –ó–Ω–∞–π–¥–µ–Ω–æ —Ç—ñ–ª—å–∫–∏:", years)
        return
    
    plt.figure(figsize=(12, 6))
    
    data_for_box = [data_by_year[year]['average_score'].dropna() for year in years[:6]]
    
    blue_shades = ['#A3BFFA', '#87CEEB', '#ADD8E6', '#B0E0E6', '#AFEEEE', '#B0C4DE']
    
    plt.boxplot(data_for_box, labels=[str(year) for year in years[:6]], patch_artist=True)
    for patch, color in zip(plt.gca().artists, blue_shades):
        patch.set_facecolor(color)
        patch.set_edgecolor('white')
        patch.set_linewidth(1.5)
    
    plt.title('Box Plots of Average Scores', fontsize=16, pad=15, fontweight='bold', color='#333333')
    plt.xlabel('Year', fontsize=12, color='#555555')
    plt.ylabel('Average Score', fontsize=12, color='#555555')
    plt.grid(True, alpha=0.1, linestyle='--', linewidth=0.5, color='#CCCCCC')
    plt.xticks(fontsize=10, color='#666666')
    plt.yticks(range(100, 201, 20), fontsize=10, color='#666666')
    plt.gca().set_facecolor('#F5F6F5')
    
    output_path = os.path.join(graphs_dir, 'average_score_box_plot.svg')
    plt.savefig(output_path, format='svg', bbox_inches='tight', transparent=True)
    plt.close()

def process_and_generate_histograms(input_folder="filtered_data", output_folder="aggregated_data"):
    os.makedirs(output_folder, exist_ok=True)
    data_by_year = {}

    available_years = set()
    for filename in os.listdir(input_folder):
        if filename.endswith(".csv"):
            year = extract_year_from_filename(filename)
            if year:
                available_years.add(year)

    if not available_years:
        print("‚ö†Ô∏è –ù–µ –∑–Ω–∞–π–¥–µ–Ω–æ —Ñ–∞–π–ª—ñ–≤ —ñ–∑ —Ä–æ–∫–∞–º–∏ –≤ –Ω–∞–∑–≤–∞—Ö")
        return

    for filename in os.listdir(input_folder):
        if filename.endswith(".csv"):
            input_path = os.path.join(input_folder, filename)
            year = extract_year_from_filename(filename)
            if year not in available_years:
                continue
            print(f"üîÑ –û–±—Ä–æ–±–∫–∞ {filename} (—Ä—ñ–∫: {year})...")
            
            try:
                encoding = detect_encoding(input_path)
                df = pd.read_csv(input_path, encoding=encoding, low_memory=False)
                
                initial_count = len(df)
                filtered_df = df[
                    (df['subjects_count'] >= 3) & 
                    (df['total_score'] > 0) & 
                    (df['average_score'] > 100)
                ]
                new_count = len(filtered_df)
                
                if new_count == 0:
                    print(f"‚ö†Ô∏è –£ —Ñ–∞–π–ª—ñ {filename} –Ω–µ–º–∞—î –¥–∞–Ω–∏—Ö –ø—ñ—Å–ª—è —Ñ—ñ–ª—å—Ç—Ä–∞—Ü—ñ—ó")
                    continue

                filtered_df['year'] = year
                data_by_year[year] = filtered_df

                print(f"‚úÖ –û–±—Ä–æ–±–ª–µ–Ω–æ {filename} –¥–ª—è —Ä–æ–∫—É {year}")
                print(f"   –ó–∞–ª–∏—à–µ–Ω–æ —Ä—è–¥–∫—ñ–≤: {new_count}/{initial_count} ({new_count/initial_count:.1%})")

            except Exception as e:
                print(f"‚ùå –ö—Ä–∏—Ç–∏—á–Ω–∞ –ø–æ–º–∏–ª–∫–∞ —É —Ñ–∞–π–ª—ñ {filename}: {str(e)}")

    if data_by_year:
        generate_and_save_box_plot(data_by_year)
        print(f"‚úÖ –ó–±–µ—Ä–µ–∂–µ–Ω–æ –±–æ–∫—Å-–ø–ª–æ—Ç —É {graphs_dir}/average_score_box_plot.svg")

    print("\nüéâ –û–±—Ä–æ–±–∫—É —Ç–∞ –≥–µ–Ω–µ—Ä–∞—Ü—ñ—é –±–æ–∫—Å-–ø–ª–æ—Ç—ñ–≤ –∑–∞–≤–µ—Ä—à–µ–Ω–æ! –†–µ–∑—É–ª—å—Ç–∞—Ç–∏ –∑–±–µ—Ä–µ–∂–µ–Ω–æ —É –ø–∞–ø—Ü—ñ 'graphs'")

if __name__ == "__main__":
    process_and_generate_histograms()

üîÑ –û–±—Ä–æ–±–∫–∞ 2020.csv (—Ä—ñ–∫: 2020)...
‚úÖ –û–±—Ä–æ–±–ª–µ–Ω–æ 2020.csv –¥–ª—è —Ä–æ–∫—É 2020
   –ó–∞–ª–∏—à–µ–Ω–æ —Ä—è–¥–∫—ñ–≤: 201212/201212 (100.0%)
üîÑ –û–±—Ä–æ–±–∫–∞ 2021.csv (—Ä—ñ–∫: 2021)...
‚úÖ –û–±—Ä–æ–±–ª–µ–Ω–æ 2021.csv –¥–ª—è —Ä–æ–∫—É 2021
   –ó–∞–ª–∏—à–µ–Ω–æ —Ä—è–¥–∫—ñ–≤: 188609/188609 (100.0%)
üîÑ –û–±—Ä–æ–±–∫–∞ 2023.csv (—Ä—ñ–∫: 2023)...
‚úÖ –û–±—Ä–æ–±–ª–µ–Ω–æ 2023.csv –¥–ª—è —Ä–æ–∫—É 2023
   –ó–∞–ª–∏—à–µ–Ω–æ —Ä—è–¥–∫—ñ–≤: 256313/256313 (100.0%)
üîÑ –û–±—Ä–æ–±–∫–∞ 2022.csv (—Ä—ñ–∫: 2022)...
‚úÖ –û–±—Ä–æ–±–ª–µ–Ω–æ 2022.csv –¥–ª—è —Ä–æ–∫—É 2022
   –ó–∞–ª–∏—à–µ–Ω–æ —Ä—è–¥–∫—ñ–≤: 213647/213647 (100.0%)
üîÑ –û–±—Ä–æ–±–∫–∞ 2019.csv (—Ä—ñ–∫: 2019)...
‚úÖ –û–±—Ä–æ–±–ª–µ–Ω–æ 2019.csv –¥–ª—è —Ä–æ–∫—É 2019
   –ó–∞–ª–∏—à–µ–Ω–æ —Ä—è–¥–∫—ñ–≤: 172734/172734 (100.0%)
üîÑ –û–±—Ä–æ–±–∫–∞ 2024.csv (—Ä—ñ–∫: 2024)...
‚úÖ –û–±—Ä–æ–±–ª–µ–Ω–æ 2024.csv –¥–ª—è —Ä–æ–∫—É 2024
   –ó–∞–ª–∏—à–µ–Ω–æ —Ä—è–¥–∫—ñ–≤: 264164/264164 (100.0%)


  plt.boxplot(data_for_box, labels=[str(year) for year in years[:6]], patch_artist=True)


‚úÖ –ó–±–µ—Ä–µ–∂–µ–Ω–æ –±–æ–∫—Å-–ø–ª–æ—Ç —É graphs/average_score_box_plot.svg

üéâ –û–±—Ä–æ–±–∫—É —Ç–∞ –≥–µ–Ω–µ—Ä–∞—Ü—ñ—é –±–æ–∫—Å-–ø–ª–æ—Ç—ñ–≤ –∑–∞–≤–µ—Ä—à–µ–Ω–æ! –†–µ–∑—É–ª—å—Ç–∞—Ç–∏ –∑–±–µ—Ä–µ–∂–µ–Ω–æ —É –ø–∞–ø—Ü—ñ 'graphs'
