# 爬取 LeetCode 习题

In [1]:
import json
import requests
import threadpool

## 第一部分：获取题目内容

### 获取所有题目描述信息的响应

In [None]:
def get_all_description_response():
    """Request the listing of all problems from the leetcode-cn API.

    No timeout is passed to ``requests.get``, so the call blocks for as long
    as the server takes to answer.

    Returns:
        The ``requests`` response object for the GET request. The status
        code and body are not inspected here.
    """
    return requests.get('https://leetcode-cn.com/api/problems/all/')

### 从题目描述信息的响应中解析出描述信息

In [None]:
def parse_all_description_list(response):
    """Extract the per-problem entries from the problem-listing response.

    Args:
        response: Object whose ``text`` attribute holds a JSON document
            with a top-level ``stat_status_pairs`` key.

    Returns:
        The value stored under ``stat_status_pairs``.

    Raises:
        KeyError: If the decoded document has no ``stat_status_pairs`` key.
    """
    payload = json.loads(response.text)
    return payload['stat_status_pairs']

### 提取题目标题、难度等级、题目序号

In [None]:
def get_title_and_level(all_description_list):
    """Reduce each problem description to its slug, frontend id and level.

    Args:
        all_description_list: Iterable of dicts, each with a ``stat`` dict
            (providing ``question__title_slug`` and ``frontend_question_id``)
            and a ``difficulty`` dict (providing ``level``).

    Returns:
        A new list with one dict per input entry, holding the keys
        ``question_title_slug``, ``frontend_question_id`` and ``level``.
    """
    return [
        {
            'question_title_slug': entry['stat']['question__title_slug'],
            'frontend_question_id': entry['stat']['frontend_question_id'],
            'level': entry['difficulty']['level'],
        }
        for entry in all_description_list
    ]

### 获取单个问题的响应

In [None]:
def get_question_response(question_title_slug):
    """POST the ``questionData`` GraphQL query for a single problem.

    Args:
        question_title_slug: Value sent as the ``titleSlug`` query variable.

    Returns:
        The ``requests`` response object for the POST request. The status
        code and body are not inspected here.
    """
    # GraphQL document sent verbatim; it selects (among other fields) the
    # `translatedContent` of the question identified by $titleSlug.
    graphql_query = "query questionData($titleSlug: String!) {\n  question(titleSlug: $titleSlug) {\n    questionId\n    questionFrontendId\n    boundTopicId\n    title\n    titleSlug\n    content\n    translatedTitle\n    translatedContent\n    isPaidOnly\n    difficulty\n    likes\n    dislikes\n    isLiked\n    similarQuestions\n    contributors {\n      username\n      profileUrl\n      avatarUrl\n      __typename\n    }\n    langToValidPlayground\n    topicTags {\n      name\n      slug\n      translatedName\n      __typename\n    }\n    companyTagStats\n    codeSnippets {\n      lang\n      langSlug\n      code\n      __typename\n    }\n    stats\n    hints\n    solution {\n      id\n      canSeeDetail\n      __typename\n    }\n    status\n    sampleTestCase\n    metaData\n    judgerAvailable\n    judgeType\n    mysqlSchemas\n    enableRunCode\n    enableTestMode\n    envInfo\n    __typename\n  }\n}\n"
    body = {
        "operationName": "questionData",
        "variables": {"titleSlug": question_title_slug},
        "query": graphql_query,
    }
    return requests.post('https://leetcode-cn.com/graphql', json=body)

### 解析问题的内容文本

In [None]:
def parse_question_content(response):
    """Pull the translated problem statement out of a question response.

    Args:
        response: Object whose ``text`` attribute holds a JSON document of
            the form ``{"data": {"question": {"translatedContent": ...}}}``.

    Returns:
        The value stored under ``data.question.translatedContent``.

    Raises:
        KeyError: If one of the expected keys is absent.
        TypeError: If ``data`` or ``question`` decodes to ``None``.
    """
    payload = json.loads(response.text)
    question = payload['data']['question']
    return question['translatedContent']

In [None]:
def get_question_content(pair):
    """Fetch one problem's statement and store it on ``pair`` in place.

    Prints a progress line, requests the question identified by
    ``pair['question_title_slug']`` and writes the parsed statement to
    ``pair['question_content']``.

    Args:
        pair: Dict with a ``question_title_slug`` key; mutated in place.

    Returns:
        None.
    """
    slug = pair['question_title_slug']
    print('正在获取：', slug)
    pair['question_content'] = parse_question_content(get_question_response(slug))

### 主程序

In [None]:
def main():
    """Scrape the problem listing, then fetch every problem's statement.

    The statements are fetched through a 10-worker ``threadpool`` pool;
    each job runs ``get_question_content`` on one entry and mutates it in
    place.

    Returns:
        The list of per-problem dicts produced by ``get_title_and_level``,
        after the pool has finished processing them.
    """
    # Fetch the raw listing of all problems.
    listing_response = get_all_description_response()
    # Decode the listing into the list of per-problem description entries.
    descriptions = parse_all_description_list(listing_response)
    # Keep only the title slug, difficulty level and frontend id of each.
    pairs = get_title_and_level(descriptions)

    workers = threadpool.ThreadPool(10)
    # One work request per problem; named `jobs` so the local does not hide
    # the `requests` HTTP module.
    jobs = threadpool.makeRequests(get_question_content, pairs)
    for job in jobs:
        workers.putRequest(job)
    # Block until every queued job has been processed.
    workers.wait()
    return pairs

In [None]:
# Run the full scrape; `result` is the list of per-problem dicts returned by main().
result = main()

### 将结果保存到文件中

In [None]:
# Serialise the scraped records as JSON into 'leecode.json', relative to the
# current working directory.
with open('leecode.json', 'w') as json_file:
    json.dump(result, json_file)

## 第二部分：生成html文件

In [None]:
import pandas as pd

### 读取文件

In [None]:
# Load the scraped records and order the rows by 'frontend_question_id'.
# NOTE(review): the scraping part writes 'leecode.json' into the working
# directory, while this reads it from '../Downloads/' — confirm the file is
# moved there before running this cell.
data = pd.read_json('../Downloads/leecode.json').sort_values('frontend_question_id')
# Notebook preview of the first rows.
data.head()

### 读取html模板

In [None]:
# Read the HTML template whose '%title%' and '%content%' placeholders are
# substituted further down.
# The encoding is given explicitly instead of relying on the platform's
# locale default, so the template is decoded the same way on every machine.
# NOTE(review): assumes template.html is saved as UTF-8 — confirm.
with open('template.html', 'r', encoding='utf-8') as f:
    html = f.read()

### 拼接内容字符串

In [None]:
# Build the HTML body: a linked table of contents followed by one section per
# problem. Rows are taken in the order `data` is already in (sorted by
# 'frontend_question_id'), and each problem is numbered by its 1-based
# position in that order; the same number is used as the anchor id.
#
# Columns are read by NAME rather than by position, because the positional
# layout of the frame depends on how pandas orders the JSON keys. Rows are
# walked positionally rather than by feeding index labels to `iloc`, so the
# output order no longer depends on how the source file happened to be ordered.
slugs = data['question_title_slug'].tolist()
levels = data['level'].tolist()
bodies = data['question_content'].tolist()

# Fragments are collected in a list and joined once at the end.
parts = ['<h1 id=0>leetcode题库</h1>', '<div>']
for number, slug in enumerate(slugs, start=1):
    parts.append('<div>%d.<a href=#%d>%s</a></div>' % (number, number, slug))
parts.append('</div>')

parts.append('<div>')
for number, (slug, level, body) in enumerate(zip(slugs, levels, bodies), start=1):
    parts.append('<div>')
    parts.append('<h2 id=%d>%d.%s</h2> difficulty-level:%s\n' % (number, number, slug, level))
    parts.append('<div align="right"><a href=#0>返回</a></div>')
    # A problem whose statement was null or never fetched does not load as a
    # string; emit an empty statement for it instead of failing the whole page.
    parts.append(body if isinstance(body, str) else '')
    parts.append('</div>')
parts.append('</div>')

content = ''.join(parts)

### 将拼接好的字符串替换到html模板中保存

In [None]:
# Substitute the page title and the generated body into the template.
html = html.replace('%title%', 'leetcode题库')
html = html.replace('%content%', content)
# The page contains non-ASCII text (the title above, at minimum), so the
# output encoding is fixed to UTF-8 instead of the platform's locale default,
# which may be unable to encode it.
# NOTE(review): assumes the template does not declare a different charset — confirm.
with open('/home/python/Desktop/leetcode.html', 'w', encoding='utf-8') as f:
    f.write(html)