In [31]:
# Load famaly name
import pandas as pd
PAGE = 80
fn_list = list()
for p in range(PAGE):
    fn_list.append(pd.read_json(f'family_name/family_name_page_{p}.json')[["rank", "family_name"]])
fn_df = pd.concat(fn_list).reset_index(drop=True).rename(columns={"family_name": "name"})[["name", "rank"]]
print(fn_df)

      name   rank
0       佐藤      1
1       鈴木      2
2       高橋      3
3       田中      4
4       伊藤      5
...    ...    ...
39992    陰  39996
39993  日名川  39997
39994   梶梅  39998
39995   西途  39999
39996   鈴記  40000

[39997 rows x 2 columns]


In [32]:
def read_given_names(csv_path: str) -> pd.DataFrame:
    df = pd.read_json(csv_path)[["name", "rank", "sex"]].set_index("name")
    # The rank is for male and female respectively
    # Here we take the averave rank as the rank of names
    df_m = df[df['sex'] == "m"][["rank"]]
    df_f = df[df['sex'] == "f"][["rank"]]
    df = df_m.join(df_f, how="outer", lsuffix="_m", rsuffix="_f")
    df["rank"] = df.mean(1)
    df = df.reset_index()
    return df[["name", "rank"]]

file = "given_name/name_ranking_data_2004.json"
df = read_given_names(file)
print(df)


    name  rank
0    くるみ  43.0
1    こころ  25.0
2    さくら   1.0
3    ひなた  10.0
4    ほのか  61.0
..   ...   ...
197   颯人  35.0
198   颯太   2.0
199   颯斗  45.0
200   颯汰  18.0
201    駿  59.0

[202 rows x 2 columns]


In [33]:
# import given name
max_rank = 100
rank_titles = list()

file = "given_name/name_ranking_data_2004.json"
df = read_given_names(file).set_index("name")

for year in range(2004, 2005):
    file = f"given_name/name_ranking_data_{year}.json"
    df_ = read_given_names(file).set_index("name")

    # Incase there is any rank value greater than 100 in some file
    max_rank = max(max_rank, df_["rank"].max())

    # set rsuffix to the value of year, thus the replicated column "rank" will be updated to f"rank{year}"
    df = df.join(df_, rsuffix=str(year), how="outer")

    rank_titles.append(f"rank{year}")
df = df.reset_index().fillna(max_rank)
print(df.loc[50:])
df['rank'] = df[rank_titles].mean(axis=1)
gn_df = df[["name", "rank"]]
gn_df

    name  rank  rank2004
50    大地  18.0      18.0
51    大智  76.0      76.0
52    大河  76.0      76.0
53    大空  24.0      24.0
54    大翔   5.0       5.0
..   ...   ...       ...
197   颯人  35.0      35.0
198   颯太   2.0       2.0
199   颯斗  45.0      45.0
200   颯汰  18.0      18.0
201    駿  59.0      59.0

[152 rows x 3 columns]


Unnamed: 0,name,rank
0,くるみ,43.0
1,こころ,25.0
2,さくら,1.0
3,ひなた,10.0
4,ほのか,61.0
...,...,...
197,颯人,35.0
198,颯太,2.0
199,颯斗,45.0
200,颯汰,18.0


In [34]:
# get the top/middle/bottom 100 names (300 in total)
def get_head_mid_tail_n(df: pd.DataFrame, num: int) -> pd.DataFrame:
    if len(df) <= num:
        return df
    head = df.head(num)
    tail = df.tail(num)
    mid_loc = len(df)//2
    mid = df.loc[mid_loc - num // 2: mid_loc + num // 2 - 1]
    return pd.concat((head, mid, tail)).reset_index(drop=True)



In [35]:
# get 300 family names and 300 given names
fn_samples = get_head_mid_tail_n(fn_df, 100)
print(fn_samples)
gn_samples = get_head_mid_tail_n(gn_df, 100)
print(gn_samples)

    name   rank
0     佐藤      1
1     鈴木      2
2     高橋      3
3     田中      4
4     伊藤      5
..   ...    ...
295    陰  39996
296  日名川  39997
297   梶梅  39998
298   西途  39999
299   鈴記  40000

[300 rows x 2 columns]
    name  rank
0    くるみ  43.0
1    こころ  25.0
2    さくら   1.0
3    ひなた  10.0
4    ほのか  61.0
..   ...   ...
295   颯人  35.0
296   颯太   2.0
297   颯斗  45.0
298   颯汰  18.0
299    駿  59.0

[300 rows x 2 columns]


In [36]:
# Extend names to katakana/hiragana/roma-ji
import pykakasi
import regex
kakasi = pykakasi.Kakasi()

def extend_names(name_df: pd.DataFrame) -> pd.DataFrame:
    '''
    Add columns of katakana/hiragana/roma-ji of the names
    :param name_df: pd.DataFrame with columns of "name" and "rank"
    '''
    kanji, katakana, hiragana, roma = list(), list(), list(), list()
    r_katakana = regex.compile(r'\p{Script=Katakana}+')
    r_hiragana = regex.compile(r'\p{Script=Hiragana}+')
    for n in name_df.name:
        if r_katakana.match(n) or r_hiragana.match(n):
            kanji.append("")
        else:
            kanji.append(n)
        cvt = kakasi.convert(n)
        katakana.append(''.join([c['kana'] for c in cvt]))
        hiragana.append(''.join([c['hira'] for c in cvt]))
        roma.append(''.join([c['passport'] for c in cvt]))
    name_df['kanji'] = kanji
    name_df['katakana'] = katakana
    name_df['hiragana'] = hiragana
    name_df['roma'] = roma

In [37]:
extend_names(fn_samples)
extend_names(gn_samples)
print(fn_samples)
print(gn_samples)

    name   rank kanji katakana hiragana         roma
0     佐藤      1    佐藤      サトウ      さとう         sato
1     鈴木      2    鈴木      スズキ      すずき       suzuki
2     高橋      3    高橋     タカハシ     たかはし    takahashi
3     田中      4    田中      タナカ      たなか       tanaka
4     伊藤      5    伊藤      イトウ      いとう          ito
..   ...    ...   ...      ...      ...          ...
295    陰  39996     陰       イン       いん           in
296  日名川  39997   日名川    ニチナガワ    にちながわ  nichinagawa
297   梶梅  39998    梶梅     カジウメ     かじうめ      kajiume
298   西途  39999    西途      ニシト      にしと      nishito
299   鈴記  40000    鈴記      スズキ      すずき       suzuki

[300 rows x 6 columns]
    name  rank kanji katakana hiragana     roma
0    くるみ  43.0            クルミ      くるみ   kurumi
1    こころ  25.0            ココロ      こころ   kokoro
2    さくら   1.0            サクラ      さくら   sakura
3    ひなた  10.0            ヒナタ      ひなた   hinata
4    ほのか  61.0            ホノカ      ほのか   honoka
..   ...   ...   ...      ...      ...      ...
295 

In [38]:
# generate names
def generate_names(family_name: pd.DataFrame, given_name: pd.DataFrame) -> pd.DataFrame:
    '''
    Generate names by concatenating family_names and given_names.
    :param family_name: pandas.DataFrame with columns of "name" and "rank"
    :param given_name: pandas.DataFrame with columns of "name" and "rank"
    :return: pandas.DataFrame with columsn of "name" and "rank"
    '''
    names, rank, kanji, katakana, hiragana, roma = list(), list(), list(), list(), list(), list()
    for _, fn in family_name.iterrows():
        for _, gn in given_name.iterrows():
            names.append(fn["name"] + gn["name"])
            if fn["kanji"] and gn["kanji"]:
                kanji.append(fn["kanji"] + gn["kanji"])
            else:
                kanji.append("")
            katakana.append(fn["katakana"] + gn["katakana"])
            hiragana.append(fn["hiragana"] + gn["hiragana"])
            roma.append(fn["roma"] + gn["roma"])
            rank.append(float(fn['rank']) * float(gn['rank']))

    name_df = pd.DataFrame({"name": names, "rank": rank, "kanji": kanji, "katakana": katakana, "hiragana": hiragana, "roma": roma})
    return name_df

In [39]:
name_df = generate_names(fn_samples, gn_samples)
name_df = name_df.sort_values(by="rank").reset_index(drop=True)
print(name_df)

        name       rank kanji katakana hiragana          roma
0        佐藤蓮        1.0   佐藤蓮    サトウハス    さとうはす      satohasu
1      佐藤さくら        1.0         サトウサクラ   さとうさくら    satosakura
2       佐藤美咲        1.0  佐藤美咲   サトウミサキ   さとうみさき    satomisaki
3       佐藤美咲        1.0  佐藤美咲   サトウミサキ   さとうみさき    satomisaki
4       佐藤颯太        2.0  佐藤颯太   サトウフウタ   さとうふうた     satofuuta
...      ...        ...   ...      ...      ...           ...
89995    鈴記渚  3280000.0   鈴記渚   スズキナギサ   すずきなぎさ  suzukinagisa
89996   鈴記梨乃  3280000.0  鈴記梨乃    スズキリノ    すずきりの    suzukirino
89997   鈴記里菜  3280000.0  鈴記里菜    スズキリナ    すずきりな    suzukirina
89998   鈴記優芽  3280000.0  鈴記優芽    スズキユメ    すずきゆめ    suzukiyume
89999   鈴記菜央  3280000.0  鈴記菜央    スズキナオ    すずきなお     suzukinao

[90000 rows x 6 columns]


In [40]:
fn_samples.to_csv("family_names.csv", index=False, float_format="%d")
gn_samples.to_csv("given_names.csv", index=False, float_format="%d")
name_df.to_csv("full_names.csv", index=False, float_format="%d")