In [1]:
import pandas as pd
import numpy as np

In [3]:
def select_aa(df, q, q_a):
    """Pick q candidates by ability while reserving up to q_a seats for women.

    The baseline is the plain top-q by ``ability``. If that baseline already
    contains at least ``q_a`` women it is returned unchanged. Otherwise the
    next-best women outside the baseline are added, and the same number of
    lowest-ranked men is removed from the baseline to make room.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'sex'`` (values ``'women'`` / ``'men'``)
        and ``'ability'``. Already-selected women are excluded by index
        label, so the index labels are expected to be unique.
    q : int
        Total number of seats.
    q_a : int
        Number of seats reserved for women.

    Returns
    -------
    tuple
        ``(top_q_df, selected_df, n_top_women)`` where ``top_q_df`` is the
        plain top-q, ``selected_df`` is the selection after applying the
        reserve, and ``n_top_women`` is the number of women in ``top_q_df``.

    Notes
    -----
    In the deficit branch ``selected_df`` is built only from rows whose sex
    is ``'women'`` or ``'men'``; rows with any other value are dropped.
    """
    # Rank everyone from the highest to the lowest ability.
    sorted_df = df.sort_values(by='ability', ascending=False)

    # Baseline selection: the top q regardless of sex, split by sex.
    top_q_df = sorted_df.head(q)
    top_women_df = top_q_df[top_q_df['sex'] == 'women']
    top_men_df = top_q_df[top_q_df['sex'] == 'men']

    # Number of reserved seats the baseline fails to fill with women.
    deficit = q_a - len(top_women_df)
    if deficit <= 0:
        # The reserve is already satisfied; the baseline is the selection.
        return top_q_df, top_q_df, len(top_women_df)

    # Best-ranked women who are not in the baseline yet (may be fewer than
    # `deficit` when the pool runs out of women).
    is_woman = sorted_df['sex'] == 'women'
    already_selected = sorted_df.index.isin(top_women_df.index)
    additional_women_df = sorted_df[is_woman & ~already_selected].head(deficit)

    # Displace only as many men as women were actually added. Removing
    # `deficit` men unconditionally would leave seats empty whenever the pool
    # of remaining women is smaller than the deficit. The clamp at 0 also
    # avoids a negative head() argument, which would keep rows instead of
    # dropping all of them.
    n_added = len(additional_women_df)
    n_men_kept = max(len(top_men_df) - n_added, 0)
    kept_men_df = top_men_df.head(n_men_kept)

    # Final selection: baseline women, added women, surviving baseline men.
    selected_df = pd.concat([top_women_df, additional_women_df, kept_men_df])

    return top_q_df, selected_df, len(top_women_df)

In [4]:
def calc_utility_single(df):
    """Return the utility of a selected group.

    The utility is ``women_ratio * men_ratio * sum(ability)``: the total
    ability of the group weighted by the product of the two sex shares.

    Parameters
    ----------
    df : pandas.DataFrame
        Selected candidates with the columns ``'sex'`` and ``'ability'``.

    Returns
    -------
    float
        The utility; ``0.0`` for an empty selection (no ability to sum, and
        the ratios would otherwise require a division by zero).
    """
    total_count = len(df)
    if total_count == 0:
        # Guard against ZeroDivisionError for an empty selection.
        return 0.0

    # Count members of each sex without materialising filtered copies.
    women_count = int((df['sex'] == 'women').sum())
    men_count = int((df['sex'] == 'men').sum())

    women_ratio = women_count / total_count
    men_ratio = men_count / total_count

    # The product is 0 for a single-sex group, so such a group has utility 0.
    ratio_product = women_ratio * men_ratio

    return ratio_product * df['ability'].sum()

In [5]:
def calc_utility(women_a, men_a, q, q_a):
    """Score both selection rules on five seeded random applicant pools.

    For each seed 0..4 a pool of ``women_a`` women and ``men_a`` men is
    generated with abilities drawn from ``np.random.rand``. The pool is passed
    to ``select_aa`` and the utility of both returned selections is computed
    with ``calc_utility_single``.

    Parameters
    ----------
    women_a, men_a : int
        Number of women / men in the applicant pool.
    q : int
        Total number of seats.
    q_a : int
        Number of seats reserved for women.

    Returns
    -------
    list of tuple
        One ``(seed, u_aa, u_top)`` tuple per seed, where ``u_aa`` is the
        utility of the reserve-adjusted selection and ``u_top`` the utility of
        the plain top-q selection.
    """
    def _run_trial(seed):
        # Re-seed NumPy's global generator so every trial is reproducible.
        np.random.seed(seed)
        applicants = pd.DataFrame({
            'sex': ['women'] * women_a + ['men'] * men_a,
            'ability': np.random.rand(women_a + men_a),
        })

        # The third return value (women already in the top q) is not needed.
        top_q_df, selected_df, _ = select_aa(applicants, q, q_a)

        return (
            seed,
            calc_utility_single(selected_df),
            calc_utility_single(top_q_df),
        )

    return [_run_trial(seed) for seed in range(5)]

In [6]:
# Single configuration: 1959 women and 7524 men compete for q=3000 seats,
# of which q_a=1500 are reserved for women.
results = calc_utility(women_a=1959, men_a=7524, q=3000, q_a=1500)
# One row per seed: utility with the reserve (u_aa) vs. plain top-q (u_top).
results_df = pd.DataFrame(results, columns=['seed', 'u_aa', 'u_top'])
results_df

Unnamed: 0,seed,u_aa,u_top
0,0,568.933484,429.989496
1,1,570.97415,426.034007
2,2,563.405349,395.755139
3,3,569.052976,410.748064
4,4,570.418271,416.713156


In [7]:
# Sweep q_a over the given range and compute the mean u_aa for each value.
def run_experiments(women_a, men_a, q, q_a_range):
    """Average ``u_aa`` over the seeds for every ``q_a`` in ``q_a_range``.

    Parameters
    ----------
    women_a, men_a : int
        Number of women / men in the applicant pool.
    q : int
        Total number of seats.
    q_a_range : iterable of int
        Reserve sizes to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per ``q_a`` with the columns ``'q_a'`` and ``'avg_u_aa'``.

    Notes
    -----
    A later cell of this notebook defines ``run_experiments`` again with an
    extra ``'women_a'`` column; once that cell has run, it replaces this one.
    """
    def _summarise(q_a):
        # Per-seed utilities for this reserve size.
        per_seed_df = pd.DataFrame(
            calc_utility(women_a=women_a, men_a=men_a, q=q, q_a=q_a),
            columns=['seed', 'u_aa', 'u_top'],
        )
        # Mean utility of the reserve-adjusted selection across the seeds.
        return {'q_a': q_a, 'avg_u_aa': per_seed_df['u_aa'].mean()}

    return pd.DataFrame([_summarise(q_a) for q_a in q_a_range])

# Parameter settings
women_a = 1959
men_a = 7524
q = 3000
q_a_range = range(0, 1600, 100)  # try q_a from 0 to 1500 in steps of 100

# Run the experiment
final_results_df = run_experiments(women_a, men_a, q, q_a_range)

final_results_df

Unnamed: 0,q_a,avg_u_aa
0,0,415.847972
1,100,415.847972
2,200,415.847972
3,300,415.847972
4,400,415.847972
5,500,415.847972
6,600,417.475402
7,700,450.58821
8,800,490.995067
9,900,524.215269


In [8]:
import plotly.express as px
# Line chart of the mean reserve-adjusted utility as a function of q_a.
fig = px.line(final_results_df, x='q_a', y='avg_u_aa', title='Average u_aa vs q_a', labels={'q_a': 'q_a', 'avg_u_aa': 'Average u_aa'})
fig.show()

In [9]:

# Sweep q_a over the given range and compute the mean u_aa for each value.
def run_experiments(women_a, men_a, q, q_a_range):
    """Average ``u_aa`` over the seeds for every ``q_a`` in ``q_a_range``.

    This redefines the ``run_experiments`` from an earlier cell; the only
    difference is that each row also records the ``women_a`` it was computed
    for, so results for several pool sizes can be concatenated and told apart.

    Parameters
    ----------
    women_a, men_a : int
        Number of women / men in the applicant pool.
    q : int
        Total number of seats.
    q_a_range : iterable of int
        Reserve sizes to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per ``q_a`` with the columns ``'q_a'``, ``'avg_u_aa'`` and
        ``'women_a'``.
    """
    def _summarise(q_a):
        # Per-seed utilities for this reserve size.
        per_seed_df = pd.DataFrame(
            calc_utility(women_a=women_a, men_a=men_a, q=q, q_a=q_a),
            columns=['seed', 'u_aa', 'u_top'],
        )
        # Mean utility of the reserve-adjusted selection across the seeds.
        return {
            'q_a': q_a,
            'avg_u_aa': per_seed_df['u_aa'].mean(),
            'women_a': women_a,
        }

    return pd.DataFrame([_summarise(q_a) for q_a in q_a_range])

# Run the q_a sweep once for every pool size of women.
def run_all_experiments(men_a, q, q_a_range, women_a_range):
    """Run ``run_experiments`` for each ``women_a`` and stack the results.

    Parameters
    ----------
    men_a : int
        Number of men in the applicant pool.
    q : int
        Total number of seats.
    q_a_range : iterable of int
        Reserve sizes to evaluate for every pool size.
    women_a_range : iterable of int
        Numbers of women in the applicant pool to evaluate.

    Returns
    -------
    pandas.DataFrame
        The per-``women_a`` result frames concatenated with ``pd.concat``;
        each frame keeps its own row index, so index labels repeat.
    """
    # One result frame per number of women.
    per_pool_frames = [
        run_experiments(women_a=women_a, men_a=men_a, q=q, q_a_range=q_a_range)
        for women_a in women_a_range
    ]

    # Combine all results into a single DataFrame.
    return pd.concat(per_pool_frames)

# Parameter settings
men_a = 7524
q = 3000
q_a_range = range(0, 1600, 100)  # try q_a from 0 to 1500 in steps of 100
women_a_range = range(500, 8500, 500)  # number of women from 500 to 8000 in steps of 500

# Run all experiments
final_results_df = run_all_experiments(men_a=men_a, q=q, q_a_range=q_a_range, women_a_range=women_a_range)

# Plot with Plotly; one coloured line per women_a (number of women)
fig = px.line(final_results_df, x='q_a', y='avg_u_aa', color='women_a', 
              title='Average u_aa vs q_a for Different Women_a',
              labels={'q_a': 'q_a', 'avg_u_aa': 'Average u_aa', 'women_a': 'Women Count (women_a)'})
fig.show()