## Python读取Word统计词频输出到Excel

### 1. 安装依赖的包

In [1]:
# 读取docx
!pip install -i https://pypi.tuna.tsinghua.edu.cn/simple python-docx
# 中英文分词
!pip install -i https://pypi.tuna.tsinghua.edu.cn/simple jieba
# 输出到excel
!pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pandas

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting jieba
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/c6/cb/18eeb235f833b726522d7ebed54f2278ce28ba9438e3135ab0278d9792a2/jieba-0.42.1.tar.gz (19.2 MB)
[K     |████████████████████████████████| 19.2 MB 228 kB/s eta 0:00:011
[?25hBuilding wheels for collected packages: jieba
  Building wheel for jieba (setup.py) ... [?25ldone
[?25h  Created wheel for jieba: filename=jieba-0.42.1-py3-none-any.whl size=19314478 sha256=2c41c8cb763ab518c77144ec4a5df069c889770ad0ff7bba8c84d96580e2c3b4
  Stored in directory: /Users/peishuaishuai/Library/Caches/pip/wheels/95/1a/6d/75355e7a5c76ed48e2d6cde3b95c4828e83274b93f5392ac96
Successfully built jieba
Installing collected packages: jieba
Successfully installed jieba-0.42.1
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


### 2. 读取docx文件到一个大字符串

In [2]:
import docx

In [3]:
# Open the Word file that will be analysed (path is relative to the notebook).
docx_path = "Python（计算机程序设计语言）.docx"
document = docx.Document(docx_path)

In [4]:
# Collect the text of every paragraph, then merge into one space-separated string.
paragraph_texts = [para.text for para in document.paragraphs]
content = " ".join(paragraph_texts)

In [5]:
# Total number of characters in the joined document text.
len(content)

24039

In [6]:
# Peek at the first 10 characters to sanity-check the extracted text.
content[:10]

' Python（计算'

### 3. 中文分词

In [7]:
import jieba

In [8]:
# Segment the text with jieba; cut_all=False disables full mode.
# The printed type shows jieba.cut returns a generator, so tokens are
# produced lazily when it is iterated.
seg_list = jieba.cut(content, cut_all=False)
print(type(seg_list))

<class 'generator'>


In [9]:
# Keep only meaningful tokens: drop single-character tokens and tokens made up
# entirely of punctuation/whitespace (e.g. "...", "--"), which a bare length
# check would let through. str.isalnum() is True for CJK characters, letters
# and digits, so ordinary words and numbers are kept.
# When seg_list is still the lazy generator from jieba.cut, iterating it here
# is what actually runs the segmentation.
seg_list = [
    word
    for word in seg_list
    if len(word) > 1 and any(ch.isalnum() for ch in word)
]

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/b8/2sztxg454bd4pwqhpktxhqsm0000gn/T/jieba.cache
Loading model cost 0.728 seconds.
Prefix dict has been built successfully.


In [10]:
# Show the first 30 tokens of seg_list.
seg_list[:30]

['Python',
 '计算机',
 '程序设计',
 '语言',
 '编辑',
 'Python',
 '一种',
 '跨平台',
 '计算机',
 '程序设计',
 '语言',
 '一个',
 '高层次',
 '结合',
 '解释性',
 '编译',
 '互动性',
 '面向对象',
 '脚本语言',
 '最初',
 '设计',
 '用于',
 '编写',
 '自动化',
 '脚本',
 'shell',
 '随着',
 '版本',
 '不断更新',
 '语言']

### 4. 统计词频

In [11]:
from collections import Counter

In [12]:
# Tally how many times each token occurs.
counter = Counter()
counter.update(seg_list)

In [13]:
# Print the first ten (word, frequency) pairs in the counter's iteration order.
first_ten = list(counter.items())[:10]
for word, freq in first_ten:
    print(word, freq)

Python 221
计算机 9
程序设计 7
语言 60
编辑 24
一种 17
跨平台 3
一个 52
高层次 1
结合 2


### 5. 构造pandas DataFrame并按词频排序

In [14]:
import pandas as pd

In [15]:
# Turn the (word, count) pairs into a two-column table.
word_count_pairs = list(counter.items())
df = pd.DataFrame(word_count_pairs, columns=["word", "count"])

In [16]:
# Preview the first rows of the frequency table as built from the counter.
df.head()

Unnamed: 0,word,count
0,Python,221
1,计算机,9
2,程序设计,7
3,语言,60
4,编辑,24


In [17]:
# Sort by frequency, highest first. Rebinding df (rather than inplace=True)
# avoids mutating the frame object in place; mergesort is a stable algorithm,
# so words with equal counts keep their existing relative order and the
# result is deterministic across runs.
df = df.sort_values(by="count", ascending=False, kind="mergesort")

In [18]:
# Preview the most frequent words after sorting by count.
df.head()

Unnamed: 0,word,count
0,Python,221
179,使用,67
3,语言,60
131,可以,55
7,一个,52


### 6. 输出到Excel文件

In [19]:
# Write the frequency table to an Excel workbook without the index column.
# Writing .xlsx requires an Excel writer engine (e.g. openpyxl) to be installed.
output_path = "分析结果-词频数据.xlsx"
df.to_excel(output_path, index=False)