### 中文歌词押韵计算：
本来是想计算，一个词语后面最常跟着的词语是什么……  
但好像有点太难了……为了更新，所以先缩小结果范围：  
算出来最常出现的韵脚，及这些韵脚下出现最常的词儿。

In [1]:
import json
import pandas as pd
import numpy as np
from xpinyin import Pinyin
import re

### 读取歌词数据

In [2]:
f = open("data/songs_cut.json")
songs_file = json.load(f)

### 把每条歌词放进表格里
挑出每句歌词的最后一个字  
再选出每个字的韵脚  

In [3]:
p = Pinyin()

song_df = (
    pd.DataFrame(songs_file.items(), columns=['song', 'lines'])
    .assign(
        last_words = lambda df:df['lines'].apply(lambda x: [sublist[-1] for sublist in x]).astype(str).str.replace("[","").str.replace("]","").str.replace("'",""), 
        pinyin = lambda df:df['last_words'].apply(lambda x: p.get_pinyin(x)),
        rhymns = lambda df:df['pinyin'].apply(lambda x: re.sub(' -','', x)).apply(lambda x: x.split(',')).apply(lambda x: [item.split("-") for item in x]).apply(lambda x: [re.findall(".*[b,c,d,f,h,j,k,l,m,p,q,r,s,t,w,x,y,z](.*).*",item[-2]) if item[-1]=='' else re.findall(".*[b,c,d,f,h,j,k,l,m,p,q,r,s,t,w,x,y,z](.*).*",item[-1]) for item in x]).apply(lambda x: [item for sublist in x for item in sublist])
    )
)

song_df.head()

Unnamed: 0,song,lines,last_words,pinyin,rhymns
0,Tank_是谁,"[[是, 谁, 把, 这, 世界, 弄, 的, 乱糟糟], [是, 谁, 做, 了, 蠢事,...","乱糟糟, 笑, 吵, 不到, 意义, 存在, 说不出口, 嘲笑, 好, 飈, 走掉, 自己,...","luan-zao-zao-, -xiao-, -chao-, -bu-dao-, -yi-y...","[ao, iao, ao, ao, i, ai, ou, iao, ao, iao, iao..."
1,赵鹏_如果这都不算爱,"[[是否, 爱, 就是, 忍耐, 不问, 该不该], [都, 怪, 我, 没, 能耐, 转身...","该不该, 走开, 实在, 爱, 悲哀, 悲哀, 独白, 走开, 实在, 爱, 割爱, 删改,...","gai-bu-gai-, -zou-kai-, -shi-zai-, -ai-, -bei-...","[ai, ai, ai, ai, ai, uai]"
2,陶晶莹_你本来就美,"[[牛仔裤, 紧, 了, 一点], [何必, 感觉, 象是, 犯, 了, 死罪], [跑步机...","一点, 死罪, 水, 泪, 谁, 惭愧, 险, 对, 一点, 类, 对, 美, 分类, 世界...","yi-dian-, -si-zui-, -shui-, -lei-, -shui-, -ca...","[ian, ui, ui, ei, ui, ui, ian, ui, ian, ei, ui..."
3,陈绮贞_鱼,"[[我, 坐, 在, 椅子, 上, 看, 日出, 复活], [我, 坐, 在, 夕阳, 里,...","复活, 衰弱, 我, 变化, 说话, 腐化, 的, 吧, 边界, 挣扎, 勇敢, 代价, 豢...","fu-huo-, -shuai-ruo-, -wo-, -bian-hua-, -shuo-...","[uo, uo, o, ua, ua, ua, e, a, ie, a, ia, ang, ..."
4,张信哲_难道,"[[为何, 不肯, 让, 我, 吻, 你], [为何, 把, 脸, 埋, 在, 你, 双手,...","你, 里, 犹豫, 美丽, 你, 哭泣, 秘密, 距离, 你, 真心, 着迷, 无情, 你,...","ni-, -li-, -you-yu-, -mei-li-, -ni-, -ku-qi-, ...","[i, u, i, i, i, i, in, i, ing, ing, i, i]"


In [4]:
# 计算每首歌出现最多的韵脚是什么

import nltk
from collections import Counter

counter = Counter()

song_df =(
    song_df
    .assign(
        rhymns_count = lambda df:df['rhymns'].apply(lambda x: nltk.FreqDist(x).most_common()),
        rhymns_final = lambda df:df['rhymns_count'].apply(lambda x: x[0][0] if len(x)>0 else ''),
        rhymns_final_count = lambda df:df['rhymns_count'].apply(lambda x: x[0][1] if len(x)>0 else ''),
    )
    .loc[
        lambda x: x['rhymns_final']!='' #删掉15首没有韵脚的歌曲
    ]
    [['song','lines','last_words','pinyin','rhymns','rhymns_count','rhymns_final','rhymns_final_count']]
)

song_df.head()

Unnamed: 0,song,lines,last_words,pinyin,rhymns,rhymns_count,rhymns_final,rhymns_final_count
0,Tank_是谁,"[[是, 谁, 把, 这, 世界, 弄, 的, 乱糟糟], [是, 谁, 做, 了, 蠢事,...","乱糟糟, 笑, 吵, 不到, 意义, 存在, 说不出口, 嘲笑, 好, 飈, 走掉, 自己,...","luan-zao-zao-, -xiao-, -chao-, -bu-dao-, -yi-y...","[ao, iao, ao, ao, i, ai, ou, iao, ao, iao, iao...","[(ao, 4), (iao, 4), (i, 2), (ou, 2), (a, 2), (...",ao,4
1,赵鹏_如果这都不算爱,"[[是否, 爱, 就是, 忍耐, 不问, 该不该], [都, 怪, 我, 没, 能耐, 转身...","该不该, 走开, 实在, 爱, 悲哀, 悲哀, 独白, 走开, 实在, 爱, 割爱, 删改,...","gai-bu-gai-, -zou-kai-, -shi-zai-, -ai-, -bei-...","[ai, ai, ai, ai, ai, uai]","[(ai, 5), (uai, 1)]",ai,5
2,陶晶莹_你本来就美,"[[牛仔裤, 紧, 了, 一点], [何必, 感觉, 象是, 犯, 了, 死罪], [跑步机...","一点, 死罪, 水, 泪, 谁, 惭愧, 险, 对, 一点, 类, 对, 美, 分类, 世界...","yi-dian-, -si-zui-, -shui-, -lei-, -shui-, -ca...","[ian, ui, ui, ei, ui, ui, ian, ui, ian, ei, ui...","[(ui, 12), (ei, 12), (ian, 8), (ie, 1), (an, 1)]",ui,12
3,陈绮贞_鱼,"[[我, 坐, 在, 椅子, 上, 看, 日出, 复活], [我, 坐, 在, 夕阳, 里,...","复活, 衰弱, 我, 变化, 说话, 腐化, 的, 吧, 边界, 挣扎, 勇敢, 代价, 豢...","fu-huo-, -shuai-ruo-, -wo-, -bian-hua-, -shuo-...","[uo, uo, o, ua, ua, ua, e, a, ie, a, ia, ang, ...","[(ua, 8), (a, 8), (ang, 7), (uo, 6), (e, 4), (...",ua,8
4,张信哲_难道,"[[为何, 不肯, 让, 我, 吻, 你], [为何, 把, 脸, 埋, 在, 你, 双手,...","你, 里, 犹豫, 美丽, 你, 哭泣, 秘密, 距离, 你, 真心, 着迷, 无情, 你,...","ni-, -li-, -you-yu-, -mei-li-, -ni-, -ku-qi-, ...","[i, u, i, i, i, i, in, i, ing, ing, i, i]","[(i, 8), (ing, 2), (u, 1), (in, 1)]",i,8


### 计算华语歌曲采用的最多韵脚

In [5]:
most_rhymns = song_df['rhymns_final'].value_counts().reset_index().rename(columns={'index':'rhymns_final','rhymns_final':'counts'})

most_rhymns

Unnamed: 0,rhymns_final,counts
0,i,11294
1,ang,5390
2,ian,5205
3,ou,4283
4,ai,3797
5,u,3369
6,a,2520
7,ing,2514
8,en,2478
9,ao,2449


In [6]:
# 计算华语歌曲采用最多韵脚的最多歌曲

def get_the_song(rhymns):
    new_df = song_df[song_df['rhymns_final']==rhymns]
    the_most_song_list = new_df.sort_values(by='rhymns_final_count', ascending=False)[:30]['song'].tolist()
#     the_most_rhymns_list = new_df.sort_values(by='rhymns_final_count', ascending=False)[:30]['last_words'].tolist()
    
    return the_most_song_list

In [7]:
# get_the_song('ao')

### 计算每个韵脚里最常用的词语

In [8]:
last_words_all = song_df['last_words'].tolist()

last_words_list = []
for item in last_words_all:
    last_words_list.append(item.split(', '))
    
last_words_list = [item for sublist in last_words_list for item in sublist]

In [9]:
last_rhymns_list = []

for item in last_words_list:
    rs = p.get_pinyin(item).split('-')
    if rs[-1]=='':
        r = re.findall(".*[b,c,d,f,h,j,k,l,m,p,q,r,s,t,w,x,y,z](.*).*",rs[-2])
    else:
        r = re.findall(".*[b,c,d,f,h,j,k,l,m,p,q,r,s,t,w,x,y,z](.*).*",rs[-1])
        
    if r==[]:
        last_rhymns_list.append("")
    elif r !=[]:
        last_rhymns_list.append(r[0])
            
# 歌词最后一个词和韵脚的dataframe
last_rhymns_df = pd.DataFrame({'word':last_words_list, 'rhymns':last_rhymns_list})

In [10]:
# 计算该韵脚下，出现频率最高的词语

def count_most_words(rhymns):
    song_list_all = []
    new_df = last_rhymns_df[last_rhymns_df['rhymns']==rhymns]
    count_df = new_df['word'].value_counts().rename('count').reset_index()
    count_df = count_df[count_df['count']>1000]
    
    count_df_word_list = count_df['index'].tolist()
    for word in count_df_word_list:
        song_list = song_df[song_df['last_words'].str.contains(word)]['song'].tolist()
        song_list_all.append(song_list)
    
    count_df['song']=song_list_all
    
    count_df = (
        count_df
        .assign(
            song_num = lambda df:df['song'].apply(lambda x: len(x)),
            rhymns_this = rhymns,
        )
    )
        
    return count_df

### 最后组装df

In [16]:
# 把计算出最高的韵脚、相关的歌曲等等，全部放进一个df!

frequent_rhymns_df_list = []

for ry in most_rhymns['rhymns_final'].tolist():
    frequent_rhymns_df_list.append(count_most_words(ry))
    
frequent_rhymns_df = (
    pd.concat(frequent_rhymns_df_list)
    .assign(
        song = lambda df:df['song'].apply(lambda x:x[:5]),
    )
)


In [18]:
frequent_rhymns_df.to_csv("result/frequent_rhymns_df2.csv",encoding="utf-8",index=False)
frequent_rhymns_df

Unnamed: 0,index,count,song,song_num,rhymns_this
0,里,7798,"[张信哲_难道, 满文军_最近最远的人, 阿悄_小背叛, 费玉清_奈何, 高凌风_老爷车]",7603.0,i
1,自己,7696,"[Tank_是谁, 蔡琴_出人头地, 林忆莲_苦难中的少年, 郭子_亲爱的妈咪, 许茹芸_给...",3461.0,i
2,美丽,3008,"[张信哲_难道, 容祖儿_新贵, 张清芳_呼喊快乐, 范文芳_爱上了你, 尚雯婕_有了爱]",1620.0,i
3,回忆,2982,"[胡夏_黑框眼镜, 苏有朋_逃兵, 胡彦斌_不是不想, 姜育恒_我找到自己, 庾澄庆_下个雨季]",1621.0,i
4,忘记,2960,"[陈瑞_鱼水情歌, 樊桐舟_雪花, 庾澄庆_下个雨季, 张震岳_乾妹妹, 黄妃_黄昏的故乡]",1477.0,i
...,...,...,...,...,...
0,家,2205,"[龙飘飘_雪里红, 卓依婷_浪迹天涯, 徐小凤_恭喜发财, 童丽_远山含笑, 卓依婷_谁料皇...",2848.0,ia
1,下,2002,"[Tank_是谁, 张智成_迷魂阵, 蔡卓妍_薄雾, 谭晶_春韵, 古巨基_幸福号列车]",5023.0,ia
2,回家,1135,"[米线_天脉传奇, 樊桐舟_雪花, 林宥嘉_推动摇篮的手, 馨予_火柴天堂, 香香_躲猫猫]",519.0,ia
0,感觉,2773,"[郑国锋_停不了, 王筝_爱情一直在经过, 林淑容_我怎麽哭了, 孙悦_香水, 李维_女人的选择]",1530.0,ue


### 直接读取处理完的数据，节省时间~

In [12]:
df = (
    pd.read_csv("result/frequent_rhymns_df.csv",encoding="utf-8")
    .assign(
        song = lambda df:df['song'].str.replace(",","、")
                        .str.replace("'","")
                        .str.replace("[","")
                        .str.replace("]","")
                        .str.replace("_","-")
                        .str.split("、").apply(lambda x: x[:1])
                        .astype(str)
                        .str.replace(",","、")
                        .str.replace("'","")
                        .str.replace("[","")
                        .str.replace("]","")
                        .str.replace("  ","")
    )
    .rename(columns={
        'index':'rhymes_word',
        'rhymns_this':'rhymes_this',
    })
    
)

df.head()

Unnamed: 0,rhymes_word,count,song,song_num,rhymes_this
0,里,7798,张信哲-难道,7606.0,i
1,自己,7696,Tank-是谁,3463.0,i
2,美丽,3008,张信哲-难道,1621.0,i
3,回忆,2982,胡夏-黑框眼镜,1622.0,i
4,忘记,2960,陈瑞-鱼水情歌,1477.0,i


In [13]:
len(df['rhymes_this'].unique())

28

In [14]:
# 做成d3 treemap需要的格式
treemap_data = (
    df
    .assign(
        id = 'rhymes.' + df['rhymes_this'] + "." + df['rhymes_word'],
        id_order = 'rhymes.' + df['rhymes_this'],
    )
    [['id_order','id','rhymes_word','count','song']]
    .rename(columns={'count':'value', 'rhymes_word':'name'})
)

treemap_data.head()

Unnamed: 0,id_order,id,name,value,song
0,rhymes.i,rhymes.i.里,里,7798,张信哲-难道
1,rhymes.i,rhymes.i.自己,自己,7696,Tank-是谁
2,rhymes.i,rhymes.i.美丽,美丽,3008,张信哲-难道
3,rhymes.i,rhymes.i.回忆,回忆,2982,胡夏-黑框眼镜
4,rhymes.i,rhymes.i.忘记,忘记,2960,陈瑞-鱼水情歌


In [15]:
df_new = (
    pd.DataFrame({'id': 'rhymes.' + df['rhymes_this'], 'id_order': 'rhymes.' + df['rhymes_this']})
    .drop_duplicates(subset='id', keep='first')
    .reset_index(drop=True)
)

rhymes_order = dict(zip(df_new['id'],df_new.index))

In [16]:
treemap_data2 = (
    treemap_data.append(df_new)
    .assign(
        order = lambda df:df['id_order'].map(rhymes_order),
        value = lambda df:df['value'].fillna(10000) # 为了把parent那一列放在前面，给了个假数值
    )
#     .fillna(10000) 
    .sort_values(by=["order","value"], ascending=[True,False])
    .rename(columns={'id_order':'group'})
    .assign(
        value = lambda df:df['value'].apply(lambda x: np.nan if x == 10000 else x)
    )
    [['id','name','value','group','song']]
)

treemap_data2

Unnamed: 0,id,name,value,group,song
0,rhymes.i,,,rhymes.i,
0,rhymes.i.里,里,7798.0,rhymes.i,张信哲-难道
1,rhymes.i.自己,自己,7696.0,rhymes.i,Tank-是谁
2,rhymes.i.美丽,美丽,3008.0,rhymes.i,张信哲-难道
3,rhymes.i.回忆,回忆,2982.0,rhymes.i,胡夏-黑框眼镜
...,...,...,...,...,...
225,rhymes.ia.回家,回家,1135.0,rhymes.ia,米线-天脉传奇
26,rhymes.ue,,,rhymes.ue,
226,rhymes.ue.感觉,感觉,2773.0,rhymes.ue,郑国锋-停不了
27,rhymes.uai,,,rhymes.uai,


In [17]:
treemap_data2.to_csv("result/frequent_rhymns_treemap.csv",encoding="utf-8",index=False)

### 参考： 

基于tensorflow, seq2seq, bahdanau注意力机制的中文歌词生成研究  
https://github.com/dengxiuqi/Lyricist-tensorflow  
python查询双押词  
https://zhuanlan.zhihu.com/p/45198100  
中国有嘻哈————押韵机器人  
https://blog.csdn.net/aab438346/article/details/102440418?utm_medium=distribute.pc_relevant_t0.none-task-blog-BlogCommendFromMachineLearnPai2-1.channel_param&depth_1-utm_source=distribute.pc_relevant_t0.none-task-blog-BlogCommendFromMachineLearnPai2-1.channel_param  
python-pinyin  
https://github.com/mozillazg/python-pinyin
xpinyin  
https://github.com/lxneng/xpinyin