Skip to content

Commit

Permalink
[clangd] Add include-mapping for C symbols.
Browse files Browse the repository at this point in the history
Summary:
This resolves the issue of introducing c++-style includes for C files.

- Refactor gen_std.py to make it reusable for parsing C symbols.
- Add a language mode to the mapping method so that C and C++ files use
  different mappings.

Reviewers: kadircet

Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, jfb, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D63270

llvm-svn: 364044
  • Loading branch information
hokein committed Jun 21, 2019
1 parent c07cfce commit 34f5188
Show file tree
Hide file tree
Showing 11 changed files with 1,198 additions and 192 deletions.
944 changes: 944 additions & 0 deletions clang-tools-extra/clangd/CSymbolMap.inc

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions clang-tools-extra/clangd/ClangdUnit.cpp
Expand Up @@ -130,7 +130,6 @@ class CppFilePreambleCallbacks : public PreambleCallbacks {
public:
CppFilePreambleCallbacks(PathRef File, PreambleParsedCallback ParsedCallback)
: File(File), ParsedCallback(ParsedCallback) {
addSystemHeadersMapping(&CanonIncludes);
}

IncludeStructure takeIncludes() { return std::move(Includes); }
Expand All @@ -149,6 +148,7 @@ class CppFilePreambleCallbacks : public PreambleCallbacks {
}

void BeforeExecute(CompilerInstance &CI) override {
addSystemHeadersMapping(&CanonIncludes, CI.getLangOpts());
SourceMgr = &CI.getSourceManager();
}

Expand Down Expand Up @@ -414,7 +414,7 @@ ParsedAST::build(std::unique_ptr<CompilerInvocation> CI,
if (Preamble)
CanonIncludes = Preamble->CanonIncludes;
else
addSystemHeadersMapping(&CanonIncludes);
addSystemHeadersMapping(&CanonIncludes, Clang->getLangOpts());
std::unique_ptr<CommentHandler> IWYUHandler =
collectIWYUHeaderMaps(&CanonIncludes);
Clang->getPreprocessor().addCommentHandler(IWYUHandler.get());
Expand Down
2 changes: 1 addition & 1 deletion clang-tools-extra/clangd/StdSymbolMap.inc
@@ -1,6 +1,6 @@
//===-- gen_std.py generated file -------------------------------*- C++ -*-===//
//
// Used to build a lookup table (qualified names => include headers) for C++
// Used to build a lookup table (qualified names => include headers) for CPP
// Standard Library symbols.
//
// Automatically generated file, DO NOT EDIT!
Expand Down
172 changes: 172 additions & 0 deletions clang-tools-extra/clangd/include-mapping/cppreference_parser.py
@@ -0,0 +1,172 @@
#!/usr/bin/env python
#===- cppreference_parser.py - ------------------------------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
#===------------------------------------------------------------------------===#

from bs4 import BeautifulSoup, NavigableString

import collections
import multiprocessing
import os
import re
import signal
import sys


class Symbol:
  """A symbol name together with its namespace and the headers providing it."""

  def __init__(self, name, namespace, headers):
    # List of the headers that correspond to this symbol.
    self.headers = headers
    # Namespace the symbol lives in, including the trailing "::", e.g. "std::".
    # The empty string "" denotes the global scope; None is used for C symbols.
    self.namespace = namespace
    # Unqualified name of the symbol, e.g. "move".
    self.name = name


def _HasClass(tag, *classes):
for c in tag.get('class', []):
if c in classes:
return True
return False


def _ParseSymbolPage(symbol_page_html, symbol_name):
  """Parse a symbol page and retrieve the include headers defined in it.

  The symbol page provides the header for the symbol, specifically in the
  "Defined in header <header>" section. An example:

  <tr class="t-dsc-header">
    <td colspan="2"> <div>Defined in header <code>&lt;ratio&gt;</code> </div>
  </td></tr>

  Args:
    symbol_page_html: HTML text of the symbol page.
    symbol_name: unqualified name of the symbol to look for in declarations.

  Returns a set of header names: the headers listed before the declarations
  that name the symbol, or every header named on the page if no declaration
  names the symbol.
  """
  headers = set()
  all_headers = set()

  soup = BeautifulSoup(symbol_page_html, "html.parser")
  # Rows in table are like:
  #   Defined in header <foo>      .t-dsc-header
  #   Defined in header <bar>      .t-dsc-header
  #   decl1                        .t-dcl
  #   Defined in header <baz>      .t-dsc-header
  #   decl2                        .t-dcl
  for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'):
    current_headers = []
    was_decl = False
    for row in table.select('tr'):
      if _HasClass(row, 't-dcl', 't-dsc'):
        was_decl = True
        # Symbols are in the first cell. A row without any cell names no
        # symbol, so skip it instead of failing on the missing cell.
        first_cell = row.find('td')
        if first_cell is None:
          continue
        if symbol_name not in first_cell.stripped_strings:
          continue
        headers.update(current_headers)
      elif _HasClass(row, 't-dsc-header'):
        # If we saw a decl since the last header, this is a new block of headers
        # for a new block of decls.
        if was_decl:
          current_headers = []
        was_decl = False
        # There are also .t-dsc-header for "defined in namespace".
        if "Defined in header " not in row.text:
          continue
        # The interesting header content (e.g. <cstdlib>) is wrapped in <code>.
        for header_code in row.find_all("code"):
          current_headers.append(header_code.text)
          all_headers.add(header_code.text)
  # If the symbol was never named, consider all named headers.
  return headers or all_headers


def _ParseIndexPage(index_page_html):
  """Extract the symbol entries from an index page.

  The index page lists all std symbols and hrefs to their detailed pages
  (which contain the defined header). An example:

  <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
  <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>

  Returns a list of tuples
  (symbol_name, relative_path_to_symbol_page, variant), where variant is True
  when the text directly following the link contains "(".
  """
  soup = BeautifulSoup(index_page_html, "html.parser")
  entries = []
  for link in soup.select("a[title]"):
    name_tag = link.find("tt")
    if not name_tag:
      continue
    # Flag annotated symbols like "acos<>() (std::complex)" as variants.
    # These tend to be overloads, and the primary is more useful.
    # This accidentally accepts begin/end despite the (iterator) caption: the
    # (since C++11) note is first. They are good symbols, so the bug is unfixed.
    trailing_text = link.next_sibling
    is_variant = (isinstance(trailing_text, NavigableString)
                  and "(" in trailing_text)
    # Strip any trailing <>() from the displayed name.
    name = name_tag.text.rstrip("<>()")
    entries.append((name, link["href"], is_variant))
  return entries


def _ReadSymbolPage(path, name):
  """Reads the symbol page file at path and parses it for the symbol name.

  Returns whatever _ParseSymbolPage returns for the file's contents.
  """
  with open(path) as page:
    page_html = page.read()
  return _ParseSymbolPage(page_html, name)


def _GetSymbols(pool, root_dir, index_page_name, namespace):
  """Get all symbols listed in the index page. All symbols should be in the
  given namespace.

  Args:
    pool: worker pool providing apply_async, used to read the symbol pages in
      parallel.
    root_dir: directory that the index page and symbol page paths are
      relative to.
    index_page_name: path of the index page, relative to root_dir.
    namespace: namespace recorded on every returned Symbol.

  Returns a list of Symbols, sorted by name; each Symbol's headers are sorted
  as well so that the result does not depend on set iteration order.
  """

  # Workflow steps:
  #   1. Parse index page which lists all symbols to get symbol
  #      name (unqualified name) and its href link to the symbol page which
  #      contains the defined header.
  #   2. Parse the symbol page to get the defined header.
  index_page_path = os.path.join(root_dir, index_page_name)
  with open(index_page_path, "r") as f:
    index_entries = _ParseIndexPage(f.read())

  # Read each symbol page in parallel.
  results = [] # (symbol_name, promise of {header...})
  for symbol_name, symbol_page_path, variant in index_entries:
    # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
    # FIXME: use these as a fallback rather than ignoring entirely.
    if variant:
      continue
    path = os.path.join(root_dir, symbol_page_path)
    results.append((symbol_name,
                    pool.apply_async(_ReadSymbolPage, (path, symbol_name))))

  # Build map from symbol name to a set of headers. The same name may be
  # listed more than once; its headers are merged.
  symbol_headers = collections.defaultdict(set)
  for symbol_name, lazy_headers in results:
    symbol_headers[symbol_name].update(lazy_headers.get())

  symbols = []
  for name in sorted(symbol_headers):
    # Sort the headers: iteration order of a set of strings can differ from
    # run to run, which would make the output non-reproducible.
    symbols.append(Symbol(name, namespace, sorted(symbol_headers[name])))
  return symbols


def _IgnoreSigInt():
  """Pool worker initializer: makes the worker process ignore SIGINT (Ctrl-C).

  This is a module-level function rather than a lambda so that it can be
  pickled, which multiprocessing needs when worker processes are started with
  the "spawn" start method.
  """
  signal.signal(signal.SIGINT, signal.SIG_IGN)


def GetSymbols(parse_pages):
  """Get all symbols by parsing the given pages.

  Args:
    parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)

  Returns a list with the symbols of every page, in the order of parse_pages.
  """
  symbols = []
  # Run many workers to process individual symbol pages under the symbol index.
  # Don't allow workers to capture Ctrl-C.
  pool = multiprocessing.Pool(initializer=_IgnoreSigInt)
  try:
    for root_dir, page_name, namespace in parse_pages:
      symbols.extend(_GetSymbols(pool, root_dir, page_name, namespace))
  finally:
    # Always shut the workers down, even if parsing a page failed.
    pool.terminate()
    pool.join()
  return symbols

0 comments on commit 34f5188

Please sign in to comment.