In [41]:
# Load famaly name
import pandas as pd
PAGE = 80
fn_list = list()
for p in range(PAGE):
    fn_list.append(pd.read_json(f'family_name/family_name_page_{p}.json')[["rank", "family_name"]])
fn_df = pd.concat(fn_list).rename(columns={"family_name": "name"})[["name", "rank"]]
fn_df = fn_df.sort_values(by="rank").reset_index(drop=True)
print(fn_df)

      name   rank
0       佐藤      1
1       鈴木      2
2       高橋      3
3       田中      4
4       伊藤      5
...    ...    ...
39992    陰  39996
39993  日名川  39997
39994   梶梅  39998
39995   西途  39999
39996   鈴記  40000

[39997 rows x 2 columns]


In [42]:
def read_given_names(csv_path: str) -> pd.DataFrame:
    df = pd.read_json(csv_path)[["name", "rank", "sex"]].set_index("name")
    # The rank is for male and female respectively
    # Here we take the averave rank as the rank of names
    df_m = df[df['sex'] == "m"][["rank"]]
    df_f = df[df['sex'] == "f"][["rank"]]
    df = df_m.join(df_f, how="outer", lsuffix="_m", rsuffix="_f")
    df["rank"] = df.mean(1)
    df = df.sort_values(by="rank").reset_index()
    return df[["name", "rank"]]

file = "given_name/name_ranking_data_2004.json"
df = read_given_names(file)
print(df)


    name  rank
0    くるみ  43.0
1    こころ  25.0
2    さくら   1.0
3    ひなた  10.0
4    ほのか  61.0
..   ...   ...
197   颯人  35.0
198   颯太   2.0
199   颯斗  45.0
200   颯汰  18.0
201    駿  59.0

[202 rows x 2 columns]


In [43]:
# import given name
max_rank = 100
rank_titles = list()

file = "given_name/name_ranking_data_2004.json"
df = read_given_names(file).set_index("name")

for year in range(2004, 2011):
    file = f"given_name/name_ranking_data_{year}.json"
    df_ = read_given_names(file).set_index("name")

    # Incase there is any rank value greater than 100 in some file
    max_rank = max(max_rank, df_["rank"].max())

    # set rsuffix to the value of year, thus the replicated column "rank" will be updated to f"rank{year}"
    df = df.join(df_, rsuffix=str(year), how="outer")

    rank_titles.append(f"rank{year}")
df = df.reset_index().fillna(max_rank)
print(df.loc[50:])
df['rank'] = df[rank_titles].mean(axis=1)
gn_df = df[["name", "rank"]]
gn_df

    name   rank  rank2004  rank2005  rank2006  rank2007  rank2008  rank2009  \
50    優愛  100.0     100.0     100.0     100.0      88.0     100.0      81.0   
51    優斗    7.0       7.0      27.0      41.0       7.0      26.0      17.0   
52    優月  100.0     100.0      45.0      33.0      66.0     100.0      26.0   
53    優樹  100.0     100.0      59.0     100.0     100.0     100.0     100.0   
54    優汰  100.0     100.0     100.0     100.0     100.0     100.0     100.0   
..   ...    ...       ...       ...       ...       ...       ...       ...   
553    麗  100.0     100.0      72.0     100.0     100.0     100.0     100.0   
554   麻央  100.0     100.0     100.0     100.0     100.0     100.0     100.0   
555   麻衣  100.0     100.0      89.0     100.0     100.0     100.0     100.0   
556  鼓太郎  100.0     100.0     100.0     100.0     100.0      72.0     100.0   
557  龍之介  100.0     100.0      59.0      31.0      62.0      21.0      41.0   

     rank2010  
50       53.0  
51        6.0  
52 

Unnamed: 0,name,rank
0,あおい,96.285714
1,あかり,82.142857
2,くるみ,81.714286
3,こころ,30.714286
4,さくら,2.857143
...,...,...
553,麗,96.000000
554,麻央,93.285714
555,麻衣,98.428571
556,鼓太郎,96.000000


In [44]:
# get the top/middle/bottom 100 names (300 in total)
def get_head_mid_tail_n(df: pd.DataFrame, num: int) -> pd.DataFrame:
    if len(df) <= num:
        return df
    head = df.head(num)
    tail = df.tail(num)
    mid_loc = len(df)//2
    mid = df.loc[mid_loc - num // 2: mid_loc + num // 2 - 1]
    return pd.concat((head, mid, tail)).reset_index(drop=True)



In [45]:
# get 300 family names and 300 given names
fn_samples = get_head_mid_tail_n(fn_df, 100)
print(fn_samples)
gn_samples = get_head_mid_tail_n(gn_df, 100)
print(gn_samples)

    name   rank
0     佐藤      1
1     鈴木      2
2     高橋      3
3     田中      4
4     伊藤      5
..   ...    ...
295    陰  39996
296  日名川  39997
297   梶梅  39998
298   西途  39999
299   鈴記  40000

[300 rows x 2 columns]
    name       rank
0    あおい  96.285714
1    あかり  82.142857
2    くるみ  81.714286
3    こころ  30.714286
4    さくら   2.857143
..   ...        ...
295    麗  96.000000
296   麻央  93.285714
297   麻衣  98.428571
298  鼓太郎  96.000000
299  龍之介  50.000000

[300 rows x 2 columns]


In [46]:
# Extend names to katakana/hiragana/roma-ji
import pykakasi
import regex
kakasi = pykakasi.Kakasi()

def extend_names(name_df: pd.DataFrame) -> pd.DataFrame:
    '''
    Add columns of katakana/hiragana/roma-ji of the names
    :param name_df: pd.DataFrame with columns of "name" and "rank"
    '''
    kanji, katakana, hiragana, roma = list(), list(), list(), list()
    r_katakana = regex.compile(r'\p{Script=Katakana}+')
    r_hiragana = regex.compile(r'\p{Script=Hiragana}+')
    for n in name_df.name:
        if r_katakana.match(n) or r_hiragana.match(n):
            kanji.append("")
        else:
            kanji.append(n)
        cvt = kakasi.convert(n)
        katakana.append(''.join([c['kana'] for c in cvt]))
        hiragana.append(''.join([c['hira'] for c in cvt]))
        roma.append(''.join([c['passport'] for c in cvt]))
    name_df['kanji'] = kanji
    name_df['katakana'] = katakana
    name_df['hiragana'] = hiragana
    name_df['roma'] = roma

In [47]:
extend_names(fn_samples)
extend_names(gn_samples)
print(fn_samples)
print(gn_samples)

    name   rank kanji katakana hiragana         roma
0     佐藤      1    佐藤      サトウ      さとう         sato
1     鈴木      2    鈴木      スズキ      すずき       suzuki
2     高橋      3    高橋     タカハシ     たかはし    takahashi
3     田中      4    田中      タナカ      たなか       tanaka
4     伊藤      5    伊藤      イトウ      いとう          ito
..   ...    ...   ...      ...      ...          ...
295    陰  39996     陰       イン       いん           in
296  日名川  39997   日名川    ニチナガワ    にちながわ  nichinagawa
297   梶梅  39998    梶梅     カジウメ     かじうめ      kajiume
298   西途  39999    西途      ニシト      にしと      nishito
299   鈴記  40000    鈴記      スズキ      すずき       suzuki

[300 rows x 6 columns]
    name       rank kanji katakana hiragana          roma
0    あおい  96.285714            アオイ      あおい           aoi
1    あかり  82.142857            アカリ      あかり         akari
2    くるみ  81.714286            クルミ      くるみ        kurumi
3    こころ  30.714286            ココロ      こころ        kokoro
4    さくら   2.857143            サクラ      さくら       

In [48]:
# generate names
def generate_names(family_name: pd.DataFrame, given_name: pd.DataFrame) -> pd.DataFrame:
    '''
    Generate names by concatenating family_names and given_names.
    :param family_name: pandas.DataFrame with columns of "name" and "rank"
    :param given_name: pandas.DataFrame with columns of "name" and "rank"
    :return: pandas.DataFrame with columsn of "name" and "rank"
    '''
    names, rank, kanji, katakana, hiragana, roma = list(), list(), list(), list(), list(), list()
    for _, fn in family_name.iterrows():
        for _, gn in given_name.iterrows():
            names.append(fn["name"] + gn["name"])
            if fn["kanji"] and gn["kanji"]:
                kanji.append(fn["kanji"] + gn["kanji"])
            else:
                kanji.append("")
            katakana.append(fn["katakana"] + gn["katakana"])
            hiragana.append(fn["hiragana"] + gn["hiragana"])
            roma.append(fn["roma"] + gn["roma"])
            rank.append(float(fn['rank']) * float(gn['rank']))

    name_df = pd.DataFrame({"name": names, "rank": rank, "kanji": kanji, "katakana": katakana, "hiragana": hiragana, "roma": roma})
    return name_df

In [49]:
name_df = generate_names(fn_samples, gn_samples)
name_df = name_df.sort_values(by="rank").reset_index(drop=True)
print(name_df)

        name          rank kanji   katakana   hiragana                 roma
0       佐藤陽菜  2.000000e+00  佐藤陽菜     サトウハルナ     さとうはるな           satoharuna
1      佐藤さくら  2.857143e+00           サトウサクラ     さとうさくら           satosakura
2       鈴木陽菜  4.000000e+00  鈴木陽菜     スズキハルナ     すずきはるな         suzukiharuna
3      鈴木さくら  5.714286e+00           スズキサクラ     すずきさくら         suzukisakura
4       高橋陽菜  6.000000e+00  高橋陽菜    タカハシハルナ    たかはしはるな      takahashiharuna
...      ...           ...   ...        ...        ...                  ...
89995   鈴記智貴  4.000000e+06  鈴記智貴  スズキサトルタカシ  すずきさとるたかし  suzukisatorutakashi
89996   鈴記侑真  4.000000e+06  鈴記侑真   スズキユウマコト   すずきゆうまこと      suzukiyuumakoto
89997   鈴記柑太  4.000000e+06  鈴記柑太     スズキカンタ     すずきかんた          suzukikanta
89998   鈴記正宗  4.000000e+06  鈴記正宗    スズキマサムネ    すずきまさむね       suzukimasamune
89999   鈴記颯介  4.000000e+06  鈴記颯介    スズキサツスケ    すずきさつすけ      suzukisatsusuke

[90000 rows x 6 columns]


In [50]:
fn_samples.to_csv("family_names.csv", index=False, float_format="%d")
gn_samples.to_csv("given_names.csv", index=False, float_format="%d")
name_df.to_csv("full_names.csv", index=False, float_format="%d")