In [134]:
import pandas as pd
# Show every row when a DataFrame is displayed instead of truncating long output.
pd.options.display.max_rows = None

# Load the character table from disk and preview its first rows.
chars = pd.read_csv('characters_freq.csv')
chars.head()

Unnamed: 0,index,level,character,pinyin
0,7.79,一级,的,de
1,7.79,四级,的,dí
2,7.79,二级,的,dì
3,7.17,一级,是,shì
4,7.16,一级,在,zài


In [135]:
# For each distinct value of the `level` column, count the non-null entries
# in every other column of `chars`.
chars.groupby(by='level').count()

Unnamed: 0_level_0,index,character,pinyin
level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
一级,307,307,307
三级,310,310,310
二级,321,321,321
五级,316,316,316
六级,316,316,316
四级,323,323,323
高等,1256,1256,1256


In [136]:
import pandas as pd
import unicodedata
from collections import Counter

# Read the text to analyse. The encoding is passed explicitly because the
# platform default is not UTF-8 everywhere and would mis-decode Chinese text.
# NOTE(review): assumes text_to_analyse.txt is UTF-8 encoded — confirm.
with open('text_to_analyse.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Tally every symbol in the text; whitespace and punctuation are still
# included at this point.
char_counter = Counter(text)

# One row per distinct symbol. Building the frame from (symbol, count) pairs
# yields a 0..n-1 index in first-occurrence order and also works when the
# text is empty (the result is then an empty two-column frame).
counts = pd.DataFrame(list(char_counter.items()), columns=['character', 'count'])

# Most frequent symbols first; the original row index is kept.
counts = counts.sort_values(by='count', ascending=False)

counts.head()

Unnamed: 0,character,count
95,,153778
11,\n,77194
32,的,74910
76,。,56182
126,他,42563


In [137]:
import unicodedata


def is_han(char):
    """Return True when every code point of `char` is a CJK unified ideograph.

    The check is based on the Unicode character name: a code point qualifies
    when its name contains 'CJK UNIFIED'.

    Parameters
    ----------
    char : str
        A single character or a longer string.

    Returns
    -------
    bool
        False for non-string values, for the empty string, and for any string
        containing a code point whose name lacks 'CJK UNIFIED' (including code
        points that have no Unicode name at all, such as '\\n').
    """
    # Non-strings (e.g. NaN) and the empty string are never Han; without this
    # guard all() over zero code points would report True for ''.
    if not isinstance(char, str) or not char:
        return False
    # Supplying a default to unicodedata.name avoids the ValueError raised for
    # unnamed code points, so no blanket exception handler is needed.
    return all('CJK UNIFIED' in unicodedata.name(c, '') for c in char)


# Flag each counted symbol with the result of is_han, then keep only the
# rows whose flag is True.
counts['is_chinese'] = counts['character'].apply(is_han)
counts = counts.loc[counts['is_chinese'].eq(True)]

In [143]:

def filter_by_level(level):
    """Return the rows of `counts` whose character is listed in `chars` under `level`.

    Membership is decided from the `chars` table alone; any `level` column
    already present on `counts` is not consulted. The result is ordered by
    `count`, highest first.
    """
    level_characters = chars.loc[chars['level'] == level, 'character']
    in_level = counts['character'].isin(level_characters)
    return counts[in_level].sort_values('count', ascending=False)


# Level labels that can be passed to filter_by_level:
# '一级', '二级', '三级', '四级', '五级', '六级', '高等'
# Preview the ten highest-count rows for the '高等' level.
filter_by_level('高等').head(10)

Unnamed: 0,character,count,is_chinese,percentage,level
308,着,16641,True,0.979324,高等
258,和,7761,True,0.456735,一级
108,斯,7730,True,0.454911,高等
9,罗,6545,True,0.385174,高等
1600,魔,5398,True,0.317673,高等
1980,赫,3416,True,0.201032,高等
3121,邓,2411,True,0.141887,高等
1300,杖,1912,True,0.112521,高等
761,伯,1503,True,0.088452,高等
234,别,1493,True,0.087863,高等


In [144]:
# Characters counted in the text that have no entry at all in the `chars`
# table, ordered from most to least frequent.
non_in_hsk = (
    counts
    .loc[lambda frame: ~frame['character'].isin(chars['character'])]
    .sort_values(by='count', ascending=False)
)
non_in_hsk.head(20)

Unnamed: 0,character,count,is_chinese,percentage,level
958,莱,1637,True,0.096338,
847,弗,1620,True,0.095337,
682,妮,1260,True,0.074151,
1645,咒,1180,True,0.069443,
1651,韦,890,True,0.052377,
786,娜,825,True,0.048551,
3277,卢,723,True,0.042549,
862,莉,446,True,0.026247,
1767,帚,416,True,0.024482,
679,嗯,381,True,0.022422,


In [147]:
# Express each character's count as a percentage of all counted characters.
total_count = counts['count'].sum()

# `counts` was produced by boolean-mask filtering; work on an explicit copy so
# the column assignments do not target a slice of the earlier frame.
counts = counts.copy()
counts['percentage'] = counts['count'] / total_count * 100
# The `level` column is assigned further down, after get_level has been
# defined; calling get_level at this point raises NameError on a fresh kernel.

def get_level(character, level_table=None):
    """Look up the level recorded for `character`.

    Parameters
    ----------
    character : str
        The character to look up.
    level_table : pandas.DataFrame, optional
        Table with `character` and `level` columns. Defaults to the
        notebook-level `chars` frame.

    Returns
    -------
    The `level` value of the first row whose `character` matches, or the
    string 'N/A' when no row matches.

    Raises
    ------
    KeyError
        If the table lacks a `character` or `level` column. Only the
        "no matching row" case is mapped to 'N/A'; schema problems are not
        hidden.
    """
    if level_table is None:
        level_table = chars
    matches = level_table.loc[level_table['character'] == character, 'level']
    if matches.empty:
        return 'N/A'
    # A character can appear on several rows; the first one wins.
    return matches.iloc[0]

# Label every counted character with its level, then total the percentage
# share per level. Selecting `percentage` before summing avoids aggregating
# the other columns (e.g. the `character` strings) only to throw them away.
counts['level'] = counts['character'].apply(get_level)
counts.groupby('level')['percentage'].sum().sort_values(ascending=False)

level
一级     50.670184
二级     17.260424
三级     11.426207
高等      6.683129
四级      5.978635
五级      3.309374
六级      3.240342
N/A     1.431704
Name: percentage, dtype: float64

In [157]:
# Save the per-level percentage totals, largest share first. Only the
# `percentage` column is aggregated.
(
    counts.groupby('level')['percentage']
    .sum()
    .sort_values(ascending=False)
    .to_csv('level_percentage.csv')
)

# Save the per-character table, most frequent first, restricted to the
# character, count, is_chinese, percentage and level columns.
counts[['character', 'count', 'is_chinese', 'percentage', 'level']].sort_values('count', ascending=False).to_csv('chars.csv')