diff --git a/.circleci/config.yml b/.circleci/config.yml index 5858d0ccbcec18..37c0c3e9e0c4e9 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -56,8 +56,8 @@ do_steps: &do_steps command: | if [[ $CROSS_COMPILE == *android* ]]; then emulator -avd Nexus5_API24 -no-window -no-audio -no-boot-anim; - elif [[ $CROSS_COMPILE == *freebsd* ]]; then - cd /home/ubuntu && $QEMU + elif [[ $CROSS_COMPILE == *freebsd* ]] || [[ -n "$LKL_QEMU_TEST" ]]; then + cd /home/ubuntu && eval $QEMU fi background: true - run: cd tools/lkl && make -j8 ${MKARG} @@ -71,7 +71,7 @@ do_steps: &do_steps command: | if [[ $CROSS_COMPILE == *android* ]]; then /home/ubuntu/circle-android.sh wait-for-boot; - elif [[ $CROSS_COMPILE == *freebsd* ]]; then + elif [[ $CROSS_COMPILE == *freebsd* ]] || [[ -n "$LKL_QEMU_TEST" ]]; then while ! $MYSSH -o ConnectTimeout=1 exit 2> /dev/null do sleep 5 @@ -147,6 +147,15 @@ jobs: VALGRIND: 1 <<: *do_steps + x86_64_qemu: + docker: + - image: lkldocker/circleci-qemu-x86_64:v1.1 + environment: + CROSS_COMPILE: "" + MKARG: "dpdk=no" + LKL_QEMU_TEST: 1 + <<: *do_steps + checkpatch: docker: - image: lkldocker/circleci:0.5 @@ -167,6 +176,7 @@ workflows: build: jobs: - x86_64 + - x86_64_qemu - mingw32 - android-arm32 - android-aarch64 diff --git a/arch/lkl/Kconfig b/arch/lkl/Kconfig index f6fcfc1486929a..2e96420411d901 100644 --- a/arch/lkl/Kconfig +++ b/arch/lkl/Kconfig @@ -36,6 +36,7 @@ config LKL select ARCH_NO_COHERENT_DMA_MMAP select HAVE_MEMBLOCK select NO_BOOTMEM + select BLK_DEV_NVME config OUTPUT_FORMAT string "Output format" @@ -93,4 +94,8 @@ config CONSOLE_LOGLEVEL_QUIET will be used as the loglevel. 
IOW passing "quiet" will be the equivalent of passing "loglevel=" - +config PCI + bool "PCI support" + select NO_GENERIC_PCI_IOPORT_MAP + select GENERIC_PCI_IOMAP + default y diff --git a/arch/lkl/Makefile b/arch/lkl/Makefile index 62b4370195ca43..f9ff1f78f7d49b 100644 --- a/arch/lkl/Makefile +++ b/arch/lkl/Makefile @@ -36,6 +36,7 @@ endif core-y += arch/lkl/kernel/ core-y += arch/lkl/mm/ +core-y += arch/lkl/drivers/ all: lkl.o diff --git a/arch/lkl/drivers/Makefile b/arch/lkl/drivers/Makefile new file mode 100644 index 00000000000000..56353c816b7f11 --- /dev/null +++ b/arch/lkl/drivers/Makefile @@ -0,0 +1,2 @@ + +obj-y = pci.o diff --git a/arch/lkl/drivers/pci.c b/arch/lkl/drivers/pci.c new file mode 100644 index 00000000000000..fa600805b3d7c6 --- /dev/null +++ b/arch/lkl/drivers/pci.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include
+
+/*
+ * Read from the PCI configuration space through the host pci_ops.
+ * Only function 0 is forwarded to the host; any other devfn, or a host
+ * read that does not return exactly @size, is reported as unsupported.
+ */
+static int lkl_pci_generic_read(struct pci_bus *bus, unsigned int devfn,
+				int where, int size, u32 *val)
+{
+	if (devfn == 0 &&
+	    lkl_ops->pci_ops->read(bus->sysdata, where, size, val) == size)
+		return PCIBIOS_SUCCESSFUL;
+	else
+		return PCIBIOS_FUNC_NOT_SUPPORTED;
+}
+
+/*
+ * Write to the PCI configuration space through the host pci_ops.
+ * Mirrors lkl_pci_generic_read(): only devfn 0 is supported and the host
+ * must report that exactly @size bytes were written.
+ */
+static int lkl_pci_generic_write(struct pci_bus *bus, unsigned int devfn,
+				 int where, int size, u32 val)
+{
+	if (devfn == 0 &&
+	    lkl_ops->pci_ops->write(bus->sysdata, where, size, &val) == size)
+		return PCIBIOS_SUCCESSFUL;
+	else
+		return PCIBIOS_FUNC_NOT_SUPPORTED;
+}
+
+/* I/O port BARs are not supported: any attempt to map one panics. */
+void __iomem *__pci_ioport_map(struct pci_dev *dev, unsigned long port,
+			       unsigned int nr)
+{
+	panic("%s is not supported\n", __func__);
+	return NULL;
+}
+
+/*
+ * pci_walk_bus() callback: claim every populated resource of function 0,
+ * ask the host to map memory BARs and rewrite the resource range with the
+ * address returned by the host. Resources the host cannot map are marked
+ * IORESOURCE_DISABLED. Finally an IRQ is allocated and handed to the host.
+ *
+ * Returns 0 on success (or for functions other than 0) and -ENOMEM when
+ * the host fails to set up the interrupt.
+ */
+static int lkl_pci_override_resource(struct pci_dev *dev, void *data)
+{
+	int i;
+	struct resource *r;
+	resource_size_t start, size;
+	void *remapped_start = NULL;
+
+	if (dev->devfn != 0)
+		return 0;
+
+	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+		r = &dev->resource[i];
+
+		if (!r->parent && r->start && r->flags) {
+			dev_info(&dev->dev, "claiming resource %s/%d\n",
+				 pci_name(dev), i);
+			if (pci_claim_resource(dev, i)) {
+				dev_err(&dev->dev,
+					"Could not claim resource %s/%d!",
+					pci_name(dev), i);
+			}
+
+			size = pci_resource_len(dev, i);
+
+			/*
+			 * Start from "not mapped" for every resource.
+			 * Without this reset a non-memory resource would
+			 * reuse the mapping returned for an earlier BAR.
+			 */
+			remapped_start = NULL;
+			if (pci_resource_flags(dev, i) & IORESOURCE_MEM) {
+				remapped_start =
+					lkl_ops->pci_ops->resource_alloc(
+						dev->sysdata, size, i);
+			}
+
+			if (remapped_start) {
+				/* override values */
+				start = (resource_size_t)remapped_start;
+				pci_resource_start(dev, i) = start;
+				pci_resource_end(dev, i) = start + size - 1;
+			} else {
+				/*
+				 * A host library or the application could
+				 * not handle the resource. Disable it
+				 * not to be touched by drivers.
+				 */
+				pci_resource_flags(dev, i) |=
+					IORESOURCE_DISABLED;
+			}
+		}
+	}
+
+	dev->irq = lkl_get_free_irq("pci");
+
+	if (lkl_ops->pci_ops->irq_init(dev->sysdata, dev->irq) < 0)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* pci_walk_bus() callback: let the host release the device. */
+static int lkl_pci_remove_devices(struct pci_dev *dev, void *data)
+{
+	lkl_ops->pci_ops->remove(dev->sysdata);
+	return 0;
+}
+
+static struct pci_ops lkl_pci_root_ops = {
+	.read = lkl_pci_generic_read,
+	.write = lkl_pci_generic_write,
+};
+
+/*
+ * Allocate a coherent DMA buffer from the page allocator and ask the host
+ * for its DMA address. Returns the kernel virtual address, or NULL when
+ * the pages cannot be allocated (in which case *dma_handle is untouched).
+ */
+static void *lkl_dma_alloc(struct device *dev, size_t size,
+			   dma_addr_t *dma_handle, gfp_t gfp,
+			   unsigned long attrs)
+{
+	struct page *page = alloc_pages(gfp, get_order(size));
+	void *vaddr;
+
+	/* alloc_pages() may fail; never feed NULL to page_to_virt() */
+	if (!page)
+		return NULL;
+
+	vaddr = page_to_virt(page);
+	*dma_handle = (dma_addr_t)lkl_ops->pci_ops->map_page(
+		to_pci_dev(dev)->sysdata, vaddr, size);
+	return vaddr;
+}
+
+/*
+ * Release a buffer obtained from lkl_dma_alloc(). @cpu_addr is a kernel
+ * virtual address, so it must go through free_pages(); __free_pages()
+ * expects a struct page pointer instead.
+ */
+static void lkl_dma_free(struct device *dev, size_t size, void *cpu_addr,
+			 dma_addr_t dma_addr, unsigned long attrs)
+{
+	lkl_ops->pci_ops->unmap_page(to_pci_dev(dev)->sysdata, dma_addr, size);
+	free_pages((unsigned long)cpu_addr, get_order(size));
+}
+
+/*
+ * Map one page (plus offset) for DMA through the host. A zero handle from
+ * the host is treated as a mapping failure.
+ */
+static dma_addr_t lkl_dma_map_page(struct device *dev, struct page *page,
+				   unsigned long offset, size_t size,
+				   enum dma_data_direction dir,
+				   unsigned long attrs)
+{
+	dma_addr_t dma_handle = (dma_addr_t)lkl_ops->pci_ops->map_page(
+		to_pci_dev(dev)->sysdata, page_to_virt(page) + offset,
size);
+	if (dma_handle == 0)
+		return DMA_MAPPING_ERROR;
+
+	return dma_handle;
+}
+
+/* Undo a mapping created by lkl_dma_map_page(). */
+static void lkl_dma_unmap_page(struct device *dev, dma_addr_t dma_addr,
+			       size_t size, enum dma_data_direction dir,
+			       unsigned long attrs)
+{
+	lkl_ops->pci_ops->unmap_page(to_pci_dev(dev)->sysdata, dma_addr, size);
+}
+
+/*
+ * Map every entry of a scatterlist with lkl_dma_map_page().
+ *
+ * Returns @nents when all entries were mapped. If any entry fails to map,
+ * the entries mapped so far are unmapped again and 0 is returned, so the
+ * caller never sees a list containing DMA_MAPPING_ERROR addresses.
+ */
+static int lkl_dma_map_sg(struct device *dev, struct scatterlist *sgl,
+			  int nents, enum dma_data_direction dir,
+			  unsigned long attrs)
+{
+	int i, j;
+	struct scatterlist *sg, *mapped;
+
+	for_each_sg(sgl, sg, nents, i) {
+		dma_addr_t addr;
+
+		WARN_ON(!sg_page(sg));
+		addr = lkl_dma_map_page(dev, sg_page(sg), sg->offset,
+					sg->length, dir, attrs);
+		if (addr == DMA_MAPPING_ERROR) {
+			/* roll back the i entries that were already mapped */
+			for_each_sg(sgl, mapped, i, j)
+				lkl_dma_unmap_page(dev, sg_dma_address(mapped),
+						   sg_dma_len(mapped), dir,
+						   attrs);
+			return 0;
+		}
+		sg_dma_address(sg) = addr;
+		sg_dma_len(sg) = sg->length;
+	}
+	return nents;
+}
+
+/* Unmap every entry of a scatterlist mapped by lkl_dma_map_sg(). */
+static void lkl_dma_unmap_sg(struct device *dev, struct scatterlist *sgl,
+			     int nents, enum dma_data_direction dir,
+			     unsigned long attrs)
+{
+	int i;
+	struct scatterlist *sg;
+
+	for_each_sg(sgl, sg, nents, i)
+		lkl_dma_unmap_page(dev, sg_dma_address(sg), sg_dma_len(sg), dir,
+				   attrs);
+}
+
+/* Every DMA mask is accepted. */
+static int lkl_dma_supported(struct device *dev, u64 mask)
+{
+	return 1;
+}
+
+/* Name given with the "lkl_pci=" boot parameter; NULL when absent. */
+static char *pcidev_name;
+
+/*
+ * Parse "lkl_pci=<name>". Only the first occurrence is kept; later ones
+ * are rejected with -1.
+ */
+static int __init setup_pci_device(char *str)
+{
+	if (pcidev_name) {
+		pr_info("The PCI driver supports only one PCI device.");
+		pr_info("'%s' will be discarded.", str);
+		return -1;
+	}
+	pcidev_name = str;
+	return 0;
+}
+
+early_param("lkl_pci", setup_pci_device);
+
+const struct dma_map_ops lkl_dma_ops = {
+	.alloc = lkl_dma_alloc,
+	.free = lkl_dma_free,
+	.map_sg = lkl_dma_map_sg,
+	.unmap_sg = lkl_dma_unmap_sg,
+	.map_page = lkl_dma_map_page,
+	.unmap_page = lkl_dma_unmap_page,
+	.dma_supported = lkl_dma_supported,
+};
+
+/*
+ * Platform probe: hand the device named on the command line to the host,
+ * scan bus 0 through the host-backed config accessors, remap the BARs and
+ * publish the devices. The resulting bus is stored as driver data so that
+ * shutdown can find it again.
+ */
+static int lkl_pci_probe(struct platform_device *pdev)
+{
+	struct lkl_pci_dev *dev;
+	struct pci_bus *bus;
+
+	if (!lkl_ops->pci_ops || !pcidev_name)
+		return -1;
+
+	dev = lkl_ops->pci_ops->add(pcidev_name, (void *)memory_start,
+				    memory_end - memory_start);
+	if (!dev)
+		return -1;
+
+	bus = pci_scan_bus(0, &lkl_pci_root_ops, (void *)dev);
+	if (!bus) {
+		lkl_ops->pci_ops->remove(dev);
+		return -1;
+	}
+	pci_walk_bus(bus, lkl_pci_override_resource, NULL);
+	pci_bus_add_devices(bus);
+	dev_set_drvdata(&pdev->dev, bus);
+
+	return 0;
+}
+
+/* Platform shutdown: ask the host to release every device on the bus. */
+static void lkl_pci_shutdown(struct platform_device *pdev)
+{
+	struct pci_bus *bus = (struct pci_bus *)dev_get_drvdata(&pdev->dev);
+
+	if (bus)
+		pci_walk_bus(bus, lkl_pci_remove_devices, NULL);
+}
+
+static struct platform_driver lkl_pci_driver = {
+	.driver = {
+		.name = "lkl_pci",
+	},
+	.probe = lkl_pci_probe,
+	.shutdown = lkl_pci_shutdown,
+};
+
+/*
+ * Register the "lkl_pci" platform driver and the matching platform
+ * device. On any failure everything registered so far is undone, so no
+ * driver is left behind without its device.
+ */
+static int __init lkl_pci_init(void)
+{
+	int ret;
+	struct platform_device *dev;
+
+	/*register a platform driver*/
+	ret = platform_driver_register(&lkl_pci_driver);
+	if (ret != 0)
+		return ret;
+
+	dev = platform_device_alloc("lkl_pci", -1);
+	if (!dev) {
+		ret = -ENOMEM;
+		goto err_unregister;
+	}
+
+	ret = platform_device_add(dev);
+	if (ret != 0)
+		goto err_put;
+
+	return 0;
+
+err_put:
+	platform_device_put(dev);
+err_unregister:
+	platform_driver_unregister(&lkl_pci_driver);
+	return ret;
+}
+
+subsys_initcall(lkl_pci_init);
diff --git a/arch/lkl/include/asm/Kbuild b/arch/lkl/include/asm/Kbuild index b062fce7f6ce0f..0ed9b78068b6b8 100644 --- a/arch/lkl/include/asm/Kbuild +++ b/arch/lkl/include/asm/Kbuild @@ -14,8 +14,6 @@ generic-y += current.h generic-y += delay.h generic-y += device.h generic-y += div64.h -generic-y += dma.h -generic-y += dma-mapping.h generic-y += emergency-restart.h generic-y += errno.h generic-y += extable.h @@ -42,7 +40,7 @@ generic-y += module.h generic-y += msgbuf.h generic-y += param.h generic-y += parport.h -generic-y += pci.h +generic-y += pci_iomap.h generic-y += percpu.h generic-y += pgalloc.h generic-y += poll.h @@ -75,5 +73,6 @@ generic-y += topology.h generic-y += trace_clock.h generic-y += unaligned.h generic-y += user.h +generic-y += vga.h generic-y += word-at-a-time.h generic-y += kprobes.h diff --git a/arch/lkl/include/asm/dma-mapping.h b/arch/lkl/include/asm/dma-mapping.h new file mode 100644 index 00000000000000..10e75f00c8a528 ---
/dev/null +++ b/arch/lkl/include/asm/dma-mapping.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_LKL_DMA_MAPPING_H +#define _ASM_LKL_DMA_MAPPING_H + +extern const struct dma_map_ops lkl_dma_ops; + +static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) +{ + return &lkl_dma_ops; +} + +#endif diff --git a/arch/lkl/include/asm/dma.h b/arch/lkl/include/asm/dma.h new file mode 100644 index 00000000000000..61f3e5ed30d5de --- /dev/null +++ b/arch/lkl/include/asm/dma.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_LKL_DMA_H +#define _ASM_LKL_DMA_H + +#include + +#ifdef CONFIG_PCI +extern int isa_dma_bridge_buggy; +#else +#define isa_dma_bridge_buggy (0) +#endif + +#endif /* _ASM_LKL_DMA_H */ diff --git a/arch/lkl/include/asm/pci.h b/arch/lkl/include/asm/pci.h new file mode 100644 index 00000000000000..94b4265be23f1e --- /dev/null +++ b/arch/lkl/include/asm/pci.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_LKL_PCI_H +#define _ASM_LKL_PCI_H + +#include + +#define pcibios_assign_all_busses() 0 +#define PCIBIOS_MIN_IO 0x1000 +#define PCIBIOS_MIN_MEM 0x10000000 + +#endif /* _ASM_LKL_PCI_H */ diff --git a/arch/lkl/include/uapi/asm/host_ops.h b/arch/lkl/include/uapi/asm/host_ops.h index fdeb56225e4dd9..66aebd77e093d9 100644 --- a/arch/lkl/include/uapi/asm/host_ops.h +++ b/arch/lkl/include/uapi/asm/host_ops.h @@ -9,6 +9,40 @@ typedef unsigned long lkl_thread_t; struct lkl_jmp_buf { unsigned long buf[128]; }; +struct lkl_pci_dev; + +/** + * lkl_dev_pci_ops - PCI host operations + * + * These operations would be a wrapper of userspace PCI driver and + * must be provided by a host library or by the application.
+ * + * @add - add a new PCI device; returns a handle, or NULL on failure + * @remove - release resources + * @irq_init - allocate resources for interrupts + * @read - read the PCI Configuration Space + * @write - write the PCI Configuration Space + * @resource_alloc - map BARx and return the mapped address. x is resource_index + * + * @map_page - return the DMA address of pages; vaddr might not be page-aligned + * @unmap_page - cleanup DMA region if needed + * + */ +struct lkl_dev_pci_ops { + struct lkl_pci_dev *(*add)(const char *name, void *kernel_ram, + unsigned long ram_size); + void (*remove)(struct lkl_pci_dev *dev); + int (*irq_init)(struct lkl_pci_dev *dev, int irq); + int (*read)(struct lkl_pci_dev *dev, int where, int size, void *val); + int (*write)(struct lkl_pci_dev *dev, int where, int size, void *val); + void *(*resource_alloc)(struct lkl_pci_dev *dev, + unsigned long resource_size, + int resource_index); + unsigned long long (*map_page)(struct lkl_pci_dev *dev, void *vaddr, + unsigned long size); + void (*unmap_page)(struct lkl_pci_dev *dev, + unsigned long long dma_handle, unsigned long size); +}; /** * lkl_host_operations - host operations used by the Linux kernel @@ -54,6 +88,8 @@ struct lkl_jmp_buf { * * @mem_alloc - allocate memory * @mem_free - free memory + * @page_alloc - allocate page aligned memory + * @page_free - free memory allocated by page_alloc * * @timer_create - allocate a host timer that runs fn(arg) when the timer * fires.
@@ -83,6 +119,7 @@ struct lkl_jmp_buf { * @jmp_buf_longjmp - perform a jump back to the saved jump buffer * * @memcpy - copy memory + * @pci_ops - pointer to PCI host operations */ struct lkl_host_operations { const char *virtio_devices; @@ -114,6 +151,8 @@ struct lkl_host_operations { void* (*mem_alloc)(unsigned long); void (*mem_free)(void *); + void* (*page_alloc)(unsigned long size); + void (*page_free)(void *addr, unsigned long size); unsigned long long (*time)(void); @@ -131,6 +170,7 @@ struct lkl_host_operations { void (*jmp_buf_longjmp)(struct lkl_jmp_buf *jmpb, int val); void* (*memcpy)(void *dest, const void *src, unsigned long count); + struct lkl_dev_pci_ops *pci_ops; }; /** diff --git a/arch/lkl/mm/bootmem.c b/arch/lkl/mm/bootmem.c index 0baf948c28777f..4bcb6c3228de48 100644 --- a/arch/lkl/mm/bootmem.c +++ b/arch/lkl/mm/bootmem.c @@ -12,7 +12,13 @@ void __init bootmem_init(unsigned long mem_sz) { mem_size = mem_sz; - _memory_start = (unsigned long)lkl_ops->mem_alloc(mem_size); + if (lkl_ops->page_alloc) { + mem_size = PAGE_ALIGN(mem_size); + _memory_start = (unsigned long)lkl_ops->page_alloc(mem_size); + } else { + _memory_start = (unsigned long)lkl_ops->mem_alloc(mem_size); + } + memory_start = _memory_start; BUG_ON(!memory_start); memory_end = memory_start + mem_size; @@ -62,5 +68,8 @@ void free_initmem(void) void free_mem(void) { - lkl_ops->mem_free((void *)_memory_start); + if (lkl_ops->page_free) + lkl_ops->page_free((void *)_memory_start, mem_size); + else + lkl_ops->mem_free((void *)_memory_start); } diff --git a/tools/lkl/.gitignore b/tools/lkl/.gitignore index 79678598633638..9a1faed464c029 100644 --- a/tools/lkl/.gitignore +++ b/tools/lkl/.gitignore @@ -8,6 +8,7 @@ tests/valgrind*.xml *.dll tests/net-test tests/disk +tests/disk-vfio-pci Makefile.conf include/lkl_autoconf.h tests/autoconf.sh diff --git a/tools/lkl/Makefile.autoconf b/tools/lkl/Makefile.autoconf index 38a2473c70623f..5e11c3650a3e46 100644 --- a/tools/lkl/Makefile.autoconf +++
b/tools/lkl/Makefile.autoconf @@ -55,6 +55,7 @@ endef define posix_host $(call set_autoconf_var,POSIX,y) $(call set_autoconf_var,VIRTIO_NET,y) + $(if $(strip $(call find_include,linux/vfio.h)),$(call set_autoconf_var,VFIO_PCI,y)) LDFLAGS += -pie CFLAGS += -fPIC -pthread SOSUF := .so diff --git a/tools/lkl/Targets b/tools/lkl/Targets index e6394fae452657..9cb11bda20374d 100644 --- a/tools/lkl/Targets +++ b/tools/lkl/Targets @@ -23,5 +23,6 @@ LDLIBS_cptofs-$(LKL_HOST_CONFIG_NEEDS_LARGP) += -largp progs-y += tests/boot progs-y += tests/disk +progs-y += tests/disk-vfio-pci progs-y += tests/net-test diff --git a/tools/lkl/include/lkl.h b/tools/lkl/include/lkl.h index 35afe6ef4566ac..0c1bfcd0d727f1 100644 --- a/tools/lkl/include/lkl.h +++ b/tools/lkl/include/lkl.h @@ -434,6 +434,25 @@ long lkl_mount_dev(unsigned int disk_id, unsigned int part, const char *fs_type, int flags, const char *opts, char *mnt_str, unsigned int mnt_str_len); +/** + * lkl_mount_blkdev - mount a block device + * + * Like lkl_mount_dev, but mounts the device specified by dev. + * + * @dev - the device id (can be generated by LKL_MKDEV()) identifying the device + * to be mounted + * @fs_type - filesystem type + * @flags - mount flags + * @opts - additional filesystem specific mount options + * @mnt_str - a string that will be filled by this function with the path where + * the filesystem has been mounted + * @mnt_str_len - size of mnt_str + * @returns - 0 on success, a negative value on error + */ +long lkl_mount_blkdev(unsigned int dev, const char *fs_type, int flags, + const char *opts, char *mnt_str, + unsigned int mnt_str_len); + /** * lkl_umount_dev - umount a disk * @@ -450,6 +469,19 @@ long lkl_mount_dev(unsigned int disk_id, unsigned int part, const char *fs_type, long lkl_umount_dev(unsigned int disk_id, unsigned int part, int flags, long timeout_ms); +/** + * lkl_umount_blkdev - umount a block device + * + * Like lkl_umount_dev, but unmounts the device specified by dev. 
+ * + * @dev - the device id identifying the device to be unmounted + * @flags - umount flags + * @timeout_ms - timeout to wait for the kernel to flush closed files so that + * umount can succeed + * @returns - 0 on success, a negative value on error + */ +long lkl_umount_blkdev(unsigned int dev, int flags, long timeout_ms); + /** * lkl_umount_timeout - umount filesystem with timeout * diff --git a/tools/lkl/lib/Build b/tools/lkl/lib/Build index 719c7308c8305f..f9a47540ebf974 100644 --- a/tools/lkl/lib/Build +++ b/tools/lkl/lib/Build @@ -21,5 +21,6 @@ liblkl-$(LKL_HOST_CONFIG_VIRTIO_NET_MACVTAP) += virtio_net_macvtap.o liblkl-$(LKL_HOST_CONFIG_VIRTIO_NET_DPDK) += virtio_net_dpdk.o liblkl-$(LKL_HOST_CONFIG_VIRTIO_NET_VDE) += virtio_net_vde.o liblkl-$(LKL_HOST_CONFIG_VIRTIO_NET) += virtio_net_pipe.o +liblkl-$(LKL_HOST_CONFIG_VFIO_PCI) += vfio_pci.o liblkl-y += ../../perf/pmu-events/jsmn.o liblkl-y += config.o diff --git a/tools/lkl/lib/fs.c b/tools/lkl/lib/fs.c index 7040b3800ccbfe..9fefa1f5c542a7 100644 --- a/tools/lkl/lib/fs.c +++ b/tools/lkl/lib/fs.c @@ -200,22 +200,16 @@ int lkl_get_virtio_blkdev(int disk_id, unsigned int part, uint32_t *pdevid) return lkl_encode_dev_from_sysfs(sysfs_path, pdevid); } -long lkl_mount_dev(unsigned int disk_id, unsigned int part, - const char *fs_type, int flags, - const char *data, char *mnt_str, unsigned int mnt_str_len) +long lkl_mount_blkdev(unsigned int dev, const char *fs_type, int flags, + const char *data, char *mnt_str, unsigned int mnt_str_len) { char dev_str[] = { "/dev/xxxxxxxx" }; - unsigned int dev; int err; char _data[4096]; /* FIXME: PAGE_SIZE is not exported by LKL */ if (mnt_str_len < sizeof(dev_str)) return -LKL_ENOMEM; - err = lkl_get_virtio_blkdev(disk_id, part, &dev); - if (err < 0) - return err; - snprintf(dev_str, sizeof(dev_str), "/dev/%08x", dev); snprintf(mnt_str, mnt_str_len, "/mnt/%08x", dev); @@ -263,6 +257,21 @@ long lkl_mount_dev(unsigned int disk_id, unsigned int part, return 0; } +long
lkl_mount_dev(unsigned int disk_id, unsigned int part, + const char *fs_type, int flags, + const char *data, char *mnt_str, unsigned int mnt_str_len) +{ + unsigned int dev; + int err; + + err = lkl_get_virtio_blkdev(disk_id, part, &dev); + if (err < 0) + return err; + + return lkl_mount_blkdev(dev, fs_type, flags, data, mnt_str, + mnt_str_len); +} + long lkl_umount_timeout(char *path, int flags, long timeout_ms) { long incr = 10000000; /* 10 ms */ @@ -284,18 +293,12 @@ long lkl_umount_timeout(char *path, int flags, long timeout_ms) return err; } -long lkl_umount_dev(unsigned int disk_id, unsigned int part, int flags, - long timeout_ms) +long lkl_umount_blkdev(unsigned int dev, int flags, long timeout_ms) { char dev_str[] = { "/dev/xxxxxxxx" }; char mnt_str[] = { "/mnt/xxxxxxxx" }; - unsigned int dev; int err; - err = lkl_get_virtio_blkdev(disk_id, part, &dev); - if (err < 0) - return err; - snprintf(dev_str, sizeof(dev_str), "/dev/%08x", dev); snprintf(mnt_str, sizeof(mnt_str), "/mnt/%08x", dev); @@ -310,6 +313,19 @@ long lkl_umount_dev(unsigned int disk_id, unsigned int part, int flags, return lkl_sys_rmdir(mnt_str); } +long lkl_umount_dev(unsigned int disk_id, unsigned int part, int flags, + long timeout_ms) +{ + unsigned int dev; + int err; + + err = lkl_get_virtio_blkdev(disk_id, part, &dev); + if (err < 0) + return err; + + return lkl_umount_blkdev(dev, flags, timeout_ms); +} + struct lkl_dir { int fd; char buf[1024]; diff --git a/tools/lkl/lib/posix-host.c b/tools/lkl/lib/posix-host.c index 6738b1c2b6f1c8..cbbc54cf2d98f2 100644 --- a/tools/lkl/lib/posix-host.c +++ b/tools/lkl/lib/posix-host.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include "iomem.h" @@ -311,6 +312,28 @@ static long _gettid(void) #endif } +static void *page_alloc(unsigned long size) +{ + void *addr; + + addr = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (addr == MAP_FAILED) + return NULL; + + return addr; +} + +static 
void page_free(void *addr, unsigned long size) +{ + munmap((void *)addr, size); +} + +#ifdef LKL_HOST_CONFIG_VFIO_PCI +extern struct lkl_dev_pci_ops vfio_pci_ops; +#endif + struct lkl_host_operations lkl_host_ops = { .panic = panic, .thread_create = thread_create, @@ -338,6 +361,8 @@ struct lkl_host_operations lkl_host_ops = { .print = print, .mem_alloc = malloc, .mem_free = free, + .page_alloc = page_alloc, + .page_free = page_free, .ioremap = lkl_ioremap, .iomem_access = lkl_iomem_access, .virtio_devices = lkl_virtio_devs, @@ -345,6 +370,9 @@ struct lkl_host_operations lkl_host_ops = { .jmp_buf_set = jmp_buf_set, .jmp_buf_longjmp = jmp_buf_longjmp, .memcpy = memcpy, +#ifdef LKL_HOST_CONFIG_VFIO_PCI + .pci_ops = &vfio_pci_ops, +#endif }; static int fd_get_capacity(struct lkl_disk disk, unsigned long long *res) diff --git a/tools/lkl/lib/vfio_pci.c b/tools/lkl/lib/vfio_pci.c new file mode 100644 index 00000000000000..172e0175e6826d --- /dev/null +++ b/tools/lkl/lib/vfio_pci.c @@ -0,0 +1,401 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "iomem.h" + +struct lkl_pci_dev { + struct lkl_sem *thread_init_sem; + int irq; + lkl_thread_t int_thread; + int quit; + int fd; + int irq_fd; + struct vfio_device_info device_info; + struct vfio_region_info config_reg; + struct vfio_iommu_type1_dma_map dma_map; +}; + +/** + * vfio_pci_add - Create a new pci device + * + * The device should be assigned to VFIO by the host in advance. + * + * @name - PCI device name (as %x:%x:%x.%x format) + * @kernel_ram - the start address of kernel memory needed to be mapped for DMA. + * The address must be aligned to the page size. + * @ram_size - the size of kernel memory, should be page-aligned as well. 
+ */
+
+static struct lkl_pci_dev *vfio_pci_add(const char *name, void *kernel_ram,
+					unsigned long ram_size)
+{
+	struct lkl_pci_dev *dev;
+	char path[128];
+	char link[128];
+	char *group_name;
+	unsigned int segn, busn, devn, funcn;
+	int len;
+	/* -1 means "not opened", so the error path never closes fd 0 */
+	int container_fd = -1, group_fd = -1;
+	struct vfio_group_status group_status = { .argsz = sizeof(
+							  group_status) };
+	struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(
+							    iommu_info) };
+
+	dev = malloc(sizeof(*dev));
+	if (!dev)
+		return NULL;
+
+	memset(dev, 0, sizeof(*dev));
+	dev->fd = -1;
+	dev->irq_fd = -1;
+
+	dev->device_info.argsz = sizeof(struct vfio_device_info);
+	dev->config_reg.argsz = sizeof(struct vfio_region_info);
+	dev->dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+
+	container_fd = open("/dev/vfio/vfio", O_RDWR);
+	if (container_fd < 0)
+		goto error;
+
+	if (ioctl(container_fd, VFIO_GET_API_VERSION) != VFIO_API_VERSION ||
+	    ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU) == 0)
+		goto error;
+
+	/* %x stores into unsigned int */
+	if (sscanf(name, "vfio%x:%x:%x.%x", &segn, &busn, &devn, &funcn) != 4)
+		goto error;
+
+	snprintf(path, sizeof(path),
+		 "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/iommu_group", segn,
+		 busn, devn, funcn);
+
+	/*
+	 * readlink() does not terminate its result and may fill the whole
+	 * buffer, so keep one byte for the terminator and read the target
+	 * into a buffer separate from the path being resolved.
+	 */
+	len = readlink(path, link, sizeof(link) - 1);
+	if (len < 0)
+		goto error;
+
+	link[len] = '\0';
+
+	/* the last component of the link target is the IOMMU group number */
+	group_name = strrchr(link, '/');
+	if (!group_name)
+		goto error;
+
+	snprintf(path, sizeof(path), "/dev/vfio%s", group_name);
+
+	group_fd = open(path, O_RDWR);
+	if (group_fd < 0)
+		goto error;
+
+	if (ioctl(group_fd, VFIO_GROUP_GET_STATUS, &group_status) < 0)
+		goto error;
+
+	if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE))
+		goto error;
+
+	if (ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container_fd) < 0)
+		goto error;
+
+	if (ioctl(container_fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU) < 0)
+		goto error;
+
+	if (ioctl(container_fd, VFIO_IOMMU_GET_INFO, &iommu_info) < 0)
+		goto error;
+
+	/* if kernel_ram is null, assume the memory is already initialized
+	 * by another device, and skip this step.
+	 */
+	if (kernel_ram) {
+		dev->dma_map.vaddr = (uint64_t)kernel_ram;
+		dev->dma_map.size = ram_size;
+		dev->dma_map.iova = 0;
+		dev->dma_map.flags =
+			VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+		if (ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &dev->dma_map) < 0)
+			goto error;
+	}
+
+	snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x", segn, busn, devn,
+		 funcn);
+	dev->fd = ioctl(group_fd, VFIO_GROUP_GET_DEVICE_FD, path);
+
+	if (dev->fd < 0)
+		goto error;
+
+	if (ioctl(dev->fd, VFIO_DEVICE_GET_INFO, &dev->device_info) < 0)
+		goto error;
+
+	if (dev->device_info.num_regions <= VFIO_PCI_CONFIG_REGION_INDEX)
+		goto error;
+
+	dev->config_reg.index = VFIO_PCI_CONFIG_REGION_INDEX;
+
+	if (ioctl(dev->fd, VFIO_DEVICE_GET_REGION_INFO, &dev->config_reg) < 0)
+		goto error;
+
+	/*
+	 * NOTE(review): container_fd and group_fd stay open on success but
+	 * are not recorded anywhere, so vfio_pci_remove() cannot close them.
+	 */
+	return dev;
+
+error:
+	lkl_printf("lkl_vfio_pci: failed to create a PCI device for %s\n",
+		   name);
+	if (dev->fd >= 0)
+		close(dev->fd);
+	if (group_fd >= 0)
+		close(group_fd);
+	if (container_fd >= 0)
+		close(container_fd);
+	free(dev);
+	return NULL;
+}
+
+/*
+ * Stop the interrupt thread (if one was started) and release the device.
+ * @dev must not be used after this call.
+ */
+static void vfio_pci_remove(struct lkl_pci_dev *dev)
+{
+	dev->quit = 1;
+	/* int_thread stays 0 when irq_init was never run or failed early */
+	if (dev->int_thread)
+		lkl_host_ops.thread_join(dev->int_thread);
+	if (dev->irq_fd >= 0)
+		close(dev->irq_fd);
+	close(dev->fd);
+	free(dev);
+}
+
+/*
+ * Return 1 while bit 3 of the 16-bit word at config-space offset 6 is set
+ * (the interrupt thread uses this as the "interrupt still pending" flag),
+ * 0 otherwise or when the read fails.
+ */
+static int check_irq_status(struct lkl_pci_dev *dev)
+{
+	unsigned short status;
+
+	if (pread(dev->fd, &status, 2, dev->config_reg.offset + 6) != 2)
+		return 0;
+	return (status & (1 << 3)) ? 1 : 0;
+}
+
+/* Currently, we only support INTx.
*/
+/*
+ * Interrupt thread body. Sets up an eventfd as the INTx trigger, signals
+ * thread_init_sem once setup has finished (successfully or not), and then
+ * forwards interrupts to the kernel until dev->quit is set.
+ *
+ * On a setup failure dev->irq_fd is left at -1 so that
+ * vfio_pci_irq_init() can tell the difference after it wakes up.
+ */
+static void vfio_int_thread(void *_dev)
+{
+	eventfd_t icount;
+	struct lkl_pci_dev *dev = (struct lkl_pci_dev *)_dev;
+	struct timespec req = { 0, 1000 * 1000 };
+	struct vfio_irq_info irq = { .argsz = sizeof(irq) };
+	struct vfio_irq_set *irq_set;
+	char irq_set_buf[sizeof(struct vfio_irq_set) + sizeof(int)];
+	fd_set rfds;
+	int efd = -1;
+
+	if (dev->device_info.num_irqs <= VFIO_PCI_INTX_IRQ_INDEX)
+		goto init_error;
+
+	irq.index = VFIO_PCI_INTX_IRQ_INDEX;
+
+	if (ioctl(dev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq))
+		goto init_error;
+
+	if (irq.count != 1)
+		goto init_error;
+
+	irq_set = (struct vfio_irq_set *)irq_set_buf;
+	irq_set->argsz = sizeof(irq_set_buf);
+	irq_set->count = 1;
+	irq_set->flags =
+		VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+	irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
+	irq_set->start = 0;
+	efd = eventfd(0, EFD_CLOEXEC);
+	if (efd < 0)
+		goto init_error;
+	dev->irq_fd = efd;
+	*(int *)&irq_set->data = dev->irq_fd;
+
+	if (ioctl(dev->fd, VFIO_DEVICE_SET_IRQS, irq_set))
+		goto init_error;
+
+	lkl_host_ops.sem_up(dev->thread_init_sem);
+
+	while (1) {
+		/* We should wait until the driver actually handles
+		 * an interrupt by monitoring the PCI interrupt status bit.
+		 */
+		while (check_irq_status(dev) && !dev->quit) {
+			lkl_trigger_irq(dev->irq);
+			nanosleep(&req, NULL);
+		}
+
+		if (dev->quit)
+			return;
+
+		/* unmask interrupts */
+		irq_set->argsz = sizeof(*irq_set);
+		irq_set->count = 1;
+		irq_set->flags =
+			VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
+		irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
+		irq_set->start = 0;
+		if (ioctl(dev->fd, VFIO_DEVICE_SET_IRQS, irq_set))
+			goto handling_error;
+
+		/* Wait for next interrupt. */
+		while (1) {
+			struct timeval tv;
+			int rc;
+
+			FD_ZERO(&rfds);
+			FD_SET(dev->irq_fd, &rfds);
+			tv.tv_sec = 0;
+			tv.tv_usec = 100 * 1000;
+			rc = select(dev->irq_fd + 1, &rfds, NULL, NULL, &tv);
+			if (rc == -1)
+				goto handling_error;
+			if (rc) {
+				/* eventfd fired: consume the counter */
+				if (read(dev->irq_fd, &icount,
+					 sizeof(icount)) < 0)
+					goto handling_error;
+				break;
+			}
+			/* timeout: only leave when asked to quit */
+			if (dev->quit)
+				return;
+		}
+	}
+
+init_error:
+	lkl_printf("lkl_vfio_pci: failed to setup INTx for a device\n");
+	if (efd >= 0)
+		close(efd);
+	dev->irq_fd = -1;
+	/*
+	 * Always wake vfio_pci_irq_init(): it is blocked on this semaphore
+	 * and would otherwise hang forever when setup fails.
+	 */
+	lkl_host_ops.sem_up(dev->thread_init_sem);
+	return;
+handling_error:
+	lkl_printf("lkl_vfio_pci: unknown error in the interrupt handler\n");
+}
+
+/*
+ * Start the interrupt thread for @dev and wait until it has finished its
+ * setup. Returns 0 on success and -1 when the semaphore or the thread
+ * cannot be created, or when the thread reports that INTx setup failed.
+ */
+static int vfio_pci_irq_init(struct lkl_pci_dev *dev, int irq)
+{
+	dev->thread_init_sem = lkl_host_ops.sem_alloc(0);
+	if (!dev->thread_init_sem)
+		return -1;
+
+	dev->irq = irq;
+
+	dev->int_thread =
+		lkl_host_ops.thread_create(vfio_int_thread, (void *)dev);
+	if (!dev->int_thread) {
+		lkl_host_ops.sem_free(dev->thread_init_sem);
+		return -1;
+	}
+
+	/* wait until the interrupt handler thread is ready */
+	lkl_host_ops.sem_down(dev->thread_init_sem);
+	lkl_host_ops.sem_free(dev->thread_init_sem);
+
+	/* the thread leaves irq_fd negative when its setup failed */
+	if (dev->irq_fd < 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * DMA address of @vaddr: its offset from the start of the region recorded
+ * in dev->dma_map.vaddr.
+ */
+static unsigned long long vfio_map_page(struct lkl_pci_dev *dev, void *vaddr,
+					unsigned long size)
+{
+	return (unsigned long long)vaddr - dev->dma_map.vaddr;
+}
+
+/* Nothing to undo: vfio_map_page() only computes an offset. */
+static void vfio_unmap_page(struct lkl_pci_dev *dev,
+			    unsigned long long dma_handle, unsigned long size)
+{
+}
+
+/* Read @size bytes of config space at @where; returns pread()'s result. */
+static int vfio_pci_read(struct lkl_pci_dev *dev, int where, int size,
+			 void *val)
+{
+	return pread(dev->fd, val, size, dev->config_reg.offset + where);
+}
+
+/* Write @size bytes of config space at @where; returns pwrite()'s result. */
+static int vfio_pci_write(struct lkl_pci_dev *dev, int where, int size,
+			  void *val)
+{
+	return pwrite(dev->fd, val, size, dev->config_reg.offset + where);
+}
+
+static int pci_resource_read(void *data, int offset, void *res, int size)
+{
+	void *addr = data + offset;
+
+	switch (size) {
+	case 8:
+		*(uint64_t *)res = *(uint64_t *)addr;
+		break;
+	case 4:
+		*(uint32_t *)res = *(uint32_t *)addr;
+		break;
+	case 2:
+		*(uint16_t *)res = *(uint16_t *)addr;
+
break;
+	case 1:
+		*(uint8_t *)res = *(uint8_t *)addr;
+		break;
+	default:
+		return -LKL_EOPNOTSUPP;
+	}
+	return 0;
+}
+
+/*
+ * iomem write accessor for a mapped BAR: store @size bytes from @res at
+ * @data + @offset. Only 1, 2, 4 and 8 byte accesses are supported.
+ */
+static int pci_resource_write(void *data, int offset, void *res, int size)
+{
+	void *addr = data + offset;
+
+	switch (size) {
+	case 8:
+		*(uint64_t *)addr = *(uint64_t *)res;
+		break;
+	case 4:
+		*(uint32_t *)addr = *(uint32_t *)res;
+		break;
+	case 2:
+		*(uint16_t *)addr = *(uint16_t *)res;
+		break;
+	case 1:
+		*(uint8_t *)addr = *(uint8_t *)res;
+		break;
+	default:
+		return -LKL_EOPNOTSUPP;
+	}
+	return 0;
+}
+
+static const struct lkl_iomem_ops pci_resource_ops = {
+	.read = pci_resource_read,
+	.write = pci_resource_write,
+};
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+/*
+ * Map BAR @resource_index of @dev into the process and register it as an
+ * LKL iomem region.
+ *
+ * Returns the address produced by register_iomem(), or NULL when the index
+ * is out of range, the device does not expose the region, the region is
+ * smaller than @resource_size, or the mapping fails.
+ */
+static void *vfio_resource_alloc(struct lkl_pci_dev *dev,
+				 unsigned long resource_size,
+				 int resource_index)
+{
+	unsigned int region_index_list[] = {
+		VFIO_PCI_BAR0_REGION_INDEX, VFIO_PCI_BAR1_REGION_INDEX,
+		VFIO_PCI_BAR2_REGION_INDEX, VFIO_PCI_BAR3_REGION_INDEX,
+		VFIO_PCI_BAR4_REGION_INDEX, VFIO_PCI_BAR5_REGION_INDEX,
+	};
+	struct vfio_region_info reg = { .argsz = sizeof(reg) };
+	void *mmio_addr;
+	void *iomem;
+
+	if ((unsigned int)resource_index >= ARRAY_SIZE(region_index_list))
+		return NULL;
+
+	reg.index = region_index_list[resource_index];
+
+	if (dev->device_info.num_regions <= reg.index)
+		return NULL;
+
+	/* We assume the resource is a memory space. */
+
+	if (ioctl(dev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg) < 0)
+		return NULL;
+
+	if (reg.size < resource_size)
+		return NULL;
+
+	mmio_addr = mmap(NULL, resource_size, PROT_READ | PROT_WRITE,
+			 MAP_SHARED, dev->fd, reg.offset);
+
+	if (mmio_addr == MAP_FAILED)
+		return NULL;
+
+	/*
+	 * NOTE(review): register_iomem() is assumed to return NULL on
+	 * failure (the kernel side treats a NULL result as "not mapped");
+	 * drop the mapping in that case instead of leaking it.
+	 */
+	iomem = register_iomem(mmio_addr, resource_size, &pci_resource_ops);
+	if (!iomem)
+		munmap(mmio_addr, resource_size);
+
+	return iomem;
+}
+
+struct lkl_dev_pci_ops vfio_pci_ops = {
+	.add = vfio_pci_add,
+	.remove = vfio_pci_remove,
+	.irq_init = vfio_pci_irq_init,
+	.read = vfio_pci_read,
+	.write = vfio_pci_write,
+	.resource_alloc = vfio_resource_alloc,
+	.map_page = vfio_map_page,
+	.unmap_page = vfio_unmap_page,
+};
diff --git a/tools/lkl/tests/Build b/tools/lkl/tests/Build index ace86a3d34383e..86625df4415afe 100644 --- a/tools/lkl/tests/Build +++ b/tools/lkl/tests/Build @@ -1,3 +1,4 @@ boot-y += boot.o test.o disk-y += disk.o cla.o test.o +disk-vfio-pci-y += disk-vfio-pci.o cla.o test.o net-test-y += net-test.o cla.o test.o diff --git a/tools/lkl/tests/disk-vfio-pci.c b/tools/lkl/tests/disk-vfio-pci.c new file mode 100644 index 00000000000000..46e022764d6553 --- /dev/null +++ b/tools/lkl/tests/disk-vfio-pci.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test.h" +#include "cla.h"
+
+/* Values filled in from the command line by parse_args(). */
+static struct {
+	int printk;
+	const char *fstype;
+	const char *pciname;
+} cla;
+
+struct cl_arg args[] = {
+	{ "type", 't', "filesystem type", 1, CL_ARG_STR, &cla.fstype },
+	{ "pciname", 'n', "PCI device name (as %x:%x:%x.%x format)", 1,
+	  CL_ARG_STR, &cla.pciname },
+	{ 0 },
+};
+
+static char mnt_point[32];
+static char bootparams[128];
+
+/*
+ * Leave the mount point and unmount block device 259:0; succeeds only
+ * when both steps return 0.
+ */
+static int lkl_test_umount_dev(void)
+{
+	long ret, ret2;
+
+	ret = lkl_sys_chdir("/");
+
+	ret2 = lkl_umount_blkdev(LKL_MKDEV(259, 0), 0, 1000);
+
+	lkl_test_logf("%ld %ld", ret, ret2);
+
+	if (!ret && !ret2)
+		return TEST_SUCCESS;
+
+	return TEST_FAILURE;
+}
+
+struct lkl_dir *dir;
+
+static int
lkl_test_opendir(void)
+{
+	int err;
+
+	dir = lkl_opendir(mnt_point, &err);
+
+	lkl_test_logf("lkl_opendir(%s) = %d %s\n", mnt_point, err,
+		      lkl_strerror(err));
+
+	if (err == 0)
+		return TEST_SUCCESS;
+
+	return TEST_FAILURE;
+}
+
+static int lkl_test_readdir(void)
+{
+	struct lkl_linux_dirent64 *de = lkl_readdir(dir);
+	int wr = 0; /* sum of lkl_test_logf() return values so far */
+
+	while (de) {
+		wr += lkl_test_logf("%s ", de->d_name);
+		if (wr >= 70) {
+			lkl_test_logf("\n");
+			wr = 0;
+			break; /* stop listing once the count reaches 70 */
+		}
+		de = lkl_readdir(dir);
+	}
+
+	if (lkl_errdir(dir) == 0)
+		return TEST_SUCCESS;
+
+	return TEST_FAILURE;
+}
+
+LKL_TEST_CALL(mount_dev, lkl_mount_blkdev, 0, LKL_MKDEV(259, 0),
+	      cla.fstype, 0, NULL, mnt_point, sizeof(mnt_point))
+LKL_TEST_CALL(closedir, lkl_closedir, 0, dir);
+LKL_TEST_CALL(chdir_mnt_point, lkl_sys_chdir, 0, mnt_point);
+LKL_TEST_CALL(start_kernel, lkl_start_kernel, 0, &lkl_host_ops, bootparams);
+LKL_TEST_CALL(stop_kernel, lkl_sys_halt, 0);
+
+struct lkl_test tests[] = {
+	LKL_TEST(start_kernel), LKL_TEST(mount_dev),
+	LKL_TEST(chdir_mnt_point), LKL_TEST(opendir),
+	LKL_TEST(readdir), LKL_TEST(closedir),
+	LKL_TEST(umount_dev), LKL_TEST(stop_kernel),
+};
+
+int main(int argc, const char **argv)
+{
+	if (parse_args(argc, argv, args) < 0 || !cla.fstype || !cla.pciname)
+		return -1; /* bad arguments, or -t / -n was not given */
+
+	snprintf(bootparams, sizeof(bootparams),
+		 "mem=16M loglevel=8 lkl_pci=vfio%s", cla.pciname);
+
+	lkl_host_ops.print = lkl_test_log;
+
+	return lkl_test_run(tests, sizeof(tests) / sizeof(struct lkl_test),
+			    "disk-vfio-pci %s", cla.fstype);
+}
diff --git a/tools/lkl/tests/disk-vfio-pci.sh b/tools/lkl/tests/disk-vfio-pci.sh
new file mode 100755
index 00000000000000..fc888e59d9a604
--- /dev/null
+++ b/tools/lkl/tests/disk-vfio-pci.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: GPL-2.0
+
+script_dir=$(cd $(dirname ${BASH_SOURCE:-$0}); pwd)
+source $script_dir/test.sh
+
+pciname="0000:00:03.0"
+nvme_id="8086 5845"
+bin_name="disk-vfio-pci"
+
+function wait_guest()
+{
+    for i in `seq 300`; do
+        if $MYSSH exit 2> /dev/null; then
+            break
+        fi
+        sleep 1
+    done
+}
+
+function init()
+{
+    # initialize
+    dd if=/dev/zero of=/home/ubuntu/nvme.img bs=1024 count=102400
+    yes | sudo mkfs.$fstype /home/ubuntu/nvme.img
+    $MYSSH sudo modprobe vfio-pci
+    $MYSSH "sh -c 'echo $nvme_id |
+       sudo tee /sys/bus/pci/drivers/vfio-pci/new_id'"
+    $MYSSH "sh -c 'echo $pciname |
+       sudo tee /sys/bus/pci/drivers/nvme/unbind'"
+    $MYSSH "sh -c 'echo $pciname |
+       sudo tee /sys/bus/pci/drivers/vfio-pci/bind'"
+    $MYSSH sudo chown lkl:lkl /dev/vfio/3
+    $MYSCP $script_dir/$bin_name lkl@localhost:
+}
+
+function cleanup()
+{
+    $MYSSH "sh -c 'echo $pciname |
+       sudo tee /sys/bus/pci/drivers/vfio-pci/unbind'"
+    $MYSSH "sh -c 'echo $pciname |
+       sudo tee /sys/bus/pci/drivers/nvme/bind'"
+}
+
+function run()
+{
+    if [ -z "$LKL_QEMU_TEST" ]; then
+        lkl_test_plan 0 "disk-vfio-pci $fstype"
+        echo "vfio not supported"
+    else
+        lkl_test_plan 1 "disk-vfio-pci $fstype"
+        lkl_test_run 1 init
+        lkl_test_exec $MYSSH ./$bin_name -n $pciname -t $fstype
+        lkl_test_plan 1 "disk-vfio-pci $fstype"
+        lkl_test_run 1 cleanup
+    fi
+}
+
+if [ "$1" = "-t" ]; then
+    shift
+    fstype=$1
+    shift
+fi
+
+if [ -z "$fstype" ]; then
+    fstype="ext4"
+fi
+
+"$@"
diff --git a/tools/lkl/tests/run.py b/tools/lkl/tests/run.py
index f34d0eb3389700..b16f675d311349 100755
--- a/tools/lkl/tests/run.py
+++ b/tools/lkl/tests/run.py
@@ -62,7 +62,11 @@ def end(self, obj):
     'lklfuse.sh -t btrfs',
     'lklfuse.sh -t vfat',
     'lklfuse.sh -t xfs',
-    'hijack-test.sh'
+    'hijack-test.sh',
+    'disk-vfio-pci.sh -t ext4 run',
+    'disk-vfio-pci.sh -t btrfs run',
+    'disk-vfio-pci.sh -t vfat run',
+    'disk-vfio-pci.sh -t xfs run'
 ]
 
 parser = argparse.ArgumentParser(description='LKL test runner')