Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

runtime: agent: use PCI segments 1+ for blk devices #183

Merged
merged 5 commits into from
Apr 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
---
# This pod fails to start if Kata debug features are enabled.
# It starts successfully without Kata debug output.
#
# The policy generated by the genpolicy tool is also too large
# and therefore gets rejected by "kubectl apply".
apiVersion: v1
kind: Pod
metadata:
name: debug-failing-many-layers
spec:
terminationGracePeriodSeconds: 0
restartPolicy: Never
runtimeClassName: kata-cc
containers:
# 29 layers
- name: azure-vote-front
image: "mcr.microsoft.com/azuredocs/azure-vote-front:v1"
command:
- sh
- "-c"
- while true; do echo hello; sleep 10; done
# 11 layers
- name: footloose
image: "quay.io/footloose/ubuntu18.04:latest"
command:
- sh
- "-c"
- while true; do echo go; sleep 25; done
# 9 layers
- name: bootloose
image: "quay.io/k0sproject/bootloose-ubuntu22.04:latest"
command:
- sh
- "-c"
- while true; do echo go; sleep 25; done
# 6 layers
- name: nginx
image: "mcr.microsoft.com/cbl-mariner/base/nginx:1.22.1-9-cm2.0.20230904-amd64"
command:
- sh
- "-c"
- while true; do echo nginx; sleep 10; done
# 5 layers
- name: redis
image: "mcr.microsoft.com/oss/bitnami/redis:6.0.8"
command:
- sh
- "-c"
- while true; do echo redis; sleep 20; done
# 4 layers
- name: go
image: "mcr.microsoft.com/appsvc/go:1.19-bullseye_20230324.1"
command:
- sh
- "-c"
- while true; do echo go; sleep 25; done
# 4 layers
- name: sysbench-kata
image: "quay.io/kata-containers/sysbench-kata:latest"
command:
- sh
- "-c"
- while true; do echo go; sleep 25; done
36 changes: 36 additions & 0 deletions src/agent/samples/policy/yaml/pod/pod-many-layers.yaml

Large diffs are not rendered by default.

28 changes: 12 additions & 16 deletions src/agent/src/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,7 @@ where
// provided.
#[instrument]
pub fn pcipath_to_sysfs(root_bus_sysfs: &str, pcipath: &pci::Path) -> Result<String> {
// Support up to 10 PCI segments.
let mut bus = "000[0-9]:00".to_string();

let mut bus = "0000:00".to_string();
let mut relpath = String::new();

for i in 0..pcipath.len() {
Expand Down Expand Up @@ -227,9 +225,8 @@ struct VirtioBlkPciMatcher {
}

impl VirtioBlkPciMatcher {
fn new(relpath: &str) -> VirtioBlkPciMatcher {
let root_bus = create_pci_root_bus_pattern();
let re = format!(r"^{}{}/virtio[0-9]+/block/", root_bus, relpath);
fn new(sysfspath: &str) -> VirtioBlkPciMatcher {
let re = format!(r"^{sysfspath}/virtio[0-9]+/block/");

VirtioBlkPciMatcher {
rex: Regex::new(&re).expect("BUG: failed to compile VirtioBlkPciMatcher regex"),
Expand All @@ -246,11 +243,10 @@ impl UeventMatcher for VirtioBlkPciMatcher {
#[instrument]
pub async fn get_virtio_blk_pci_device_name(
sandbox: &Arc<Mutex<Sandbox>>,
pcipath: &pci::Path,
pciaddr: &pci::Address,
) -> Result<String> {
let root_bus_sysfs = format!("{}{}", SYSFS_DIR, create_pci_root_bus_pattern());
let sysfs_rel_path = pcipath_to_sysfs(&root_bus_sysfs, pcipath)?;
let matcher = VirtioBlkPciMatcher::new(&sysfs_rel_path);
let sysfs_path = pciaddr.get_sysfs_path();
let matcher = VirtioBlkPciMatcher::new(&sysfs_path);

let uev = wait_for_uevent(sandbox, matcher).await?;
Ok(format!("{}/{}", SYSTEM_DEV_PATH, &uev.devname))
Expand Down Expand Up @@ -350,7 +346,7 @@ struct PciMatcher {

impl PciMatcher {
fn new(relpath: &str) -> Result<PciMatcher> {
let root_bus = create_pci_root_bus_pattern();
let root_bus = create_pci_root_bus_path();
Ok(PciMatcher {
devpath: format!("{}{}", root_bus, relpath),
})
Expand All @@ -367,7 +363,7 @@ pub async fn wait_for_pci_device(
sandbox: &Arc<Mutex<Sandbox>>,
pcipath: &pci::Path,
) -> Result<pci::Address> {
let root_bus_sysfs = format!("{}{}", SYSFS_DIR, create_pci_root_bus_pattern());
let root_bus_sysfs = format!("{}{}", SYSFS_DIR, create_pci_root_bus_path());
let sysfs_rel_path = pcipath_to_sysfs(&root_bus_sysfs, pcipath)?;
let matcher = PciMatcher::new(&sysfs_rel_path)?;

Expand Down Expand Up @@ -770,7 +766,7 @@ async fn virtio_blk_device_handler(
device: &Device,
sandbox: &Arc<Mutex<Sandbox>>,
) -> Result<SpecUpdate> {
let pcipath = pci::Path::from_str(&device.id)?;
let pcipath = pci::Address::from_str(&device.id)?;
let vm_path = get_virtio_blk_pci_device_name(sandbox, &pcipath).await?;

Ok(DeviceInfo::new(&vm_path, true)
Expand Down Expand Up @@ -1490,7 +1486,7 @@ mod tests {
#[tokio::test]
async fn test_get_device_name() {
let devname = "vda";
let root_bus = create_pci_root_bus_pattern();
let root_bus = create_pci_root_bus_path();
let relpath = "/0000:00:0a.0/0000:03:0b.0";
let devpath = format!("{}{}/virtio4/block/{}", root_bus, relpath, devname);

Expand Down Expand Up @@ -1525,7 +1521,7 @@ mod tests {
#[tokio::test]
#[allow(clippy::redundant_clone)]
async fn test_virtio_blk_matcher() {
let root_bus = create_pci_root_bus_pattern();
let root_bus = create_pci_root_bus_path();
let devname = "vda";

let mut uev_a = crate::uevent::Uevent::default();
Expand Down Expand Up @@ -1610,7 +1606,7 @@ mod tests {
#[tokio::test]
#[allow(clippy::redundant_clone)]
async fn test_scsi_block_matcher() {
let root_bus = create_pci_root_bus_pattern();
let root_bus = create_pci_root_bus_path();
let devname = "sda";

let mut uev_a = crate::uevent::Uevent::default();
Expand Down
7 changes: 3 additions & 4 deletions src/agent/src/linux_abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,12 @@ pub const SYSFS_DIR: &str = "/sys";
target_arch = "x86_64",
target_arch = "x86"
))]
pub fn create_pci_root_bus_pattern() -> String {
// Support up to 10 PCI segments.
String::from("/devices/pci000[0-9]:00")
pub fn create_pci_root_bus_path() -> String {
String::from("/devices/pci0000:00")
}

#[cfg(target_arch = "aarch64")]
pub fn create_pci_root_bus_pattern() -> String {
pub fn create_pci_root_bus_path() -> String {
let ret = String::from("/devices/platform/4010000000.pcie/pci0000:00");

let acpi_root_bus_path = String::from("/devices/pci0000:00");
Expand Down
8 changes: 8 additions & 0 deletions src/agent/src/pci.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,14 @@ impl Address {
slotfn,
}
}

pub fn get_sysfs_path(&self) -> String {
// Example: /devices/pci0001:00/0001:00:01.0
format!(
"/devices/pci{:04x}:00/{:04x}:{:02x}:{}",
self.domain, self.domain, self.bus, self.slotfn
)
}
}

impl FromStr for Address {
Expand Down
26 changes: 4 additions & 22 deletions src/agent/src/rpc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ use tokio::sync::Mutex;

use std::ffi::{CString, OsStr};
use std::fmt::Debug;
use std::io;
use std::os::unix::ffi::OsStrExt;
use std::path::Path;
use std::sync::Arc;
Expand Down Expand Up @@ -52,13 +51,12 @@ use nix::sys::{stat, statfs};
use nix::unistd::{self, Pid};
use rustjail::process::ProcessOperations;

use crate::device::{add_devices, get_virtio_blk_pci_device_name, update_env_pci};
use crate::device::{add_devices, update_env_pci};
use crate::linux_abi::*;
use crate::metrics::get_metrics;
use crate::mount::baremount;
use crate::namespace::{NSTYPEIPC, NSTYPEPID, NSTYPEUTS};
use crate::network::setup_guest_dns;
use crate::pci;
use crate::random;
use crate::sandbox::Sandbox;
use crate::storage::{add_storages, update_ephemeral_mounts, STORAGE_HANDLERS};
Expand All @@ -77,7 +75,7 @@ use tracing_opentelemetry::OpenTelemetrySpanExt;

use tracing::instrument;

use libc::{self, c_char, c_ushort, pid_t, winsize, TIOCSWINSZ};
use libc::{self, c_ushort, pid_t, winsize, TIOCSWINSZ};
use std::fs;
use std::os::unix::prelude::PermissionsExt;
use std::process::{Command, Stdio};
Expand Down Expand Up @@ -1854,24 +1852,8 @@ fn do_copy_file(req: &CopyFileRequest) -> Result<()> {
Ok(())
}

async fn do_add_swap(sandbox: &Arc<Mutex<Sandbox>>, req: &AddSwapRequest) -> Result<()> {
let mut slots = Vec::new();
for slot in &req.PCIPath {
slots.push(pci::SlotFn::new(*slot, 0)?);
}
let pcipath = pci::Path::new(slots)?;
let dev_name = get_virtio_blk_pci_device_name(sandbox, &pcipath).await?;

let c_str = CString::new(dev_name)?;
let ret = unsafe { libc::swapon(c_str.as_ptr() as *const c_char, 0) };
if ret != 0 {
return Err(anyhow!(
"libc::swapon get error {}",
io::Error::last_os_error()
));
}

Ok(())
async fn do_add_swap(_sandbox: &Arc<Mutex<Sandbox>>, _req: &AddSwapRequest) -> Result<()> {
Err(anyhow!(nix::Error::ENOTSUP))
}

// Setup container bundle under CONTAINER_BASE, which is cleaned up
Expand Down
4 changes: 2 additions & 2 deletions src/agent/src/storage/block_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ impl StorageHandler for VirtioBlkPciHandler {
return Err(anyhow!("Invalid device {}", &storage.source));
}
} else {
let pcipath = pci::Path::from_str(&storage.source)?;
let dev_path = get_virtio_blk_pci_device_name(ctx.sandbox, &pcipath).await?;
let pciaddr = pci::Address::from_str(&storage.source)?;
let dev_path = get_virtio_blk_pci_device_name(ctx.sandbox, &pciaddr).await?;
storage.source = dev_path;
}

Expand Down
20 changes: 12 additions & 8 deletions src/runtime/virtcontainers/clh.go
Original file line number Diff line number Diff line change
Expand Up @@ -543,7 +543,7 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net

clh.vmconfig.Platform = chclient.NewPlatformConfig()
platform := clh.vmconfig.Platform
platform.SetNumPciSegments(2)
platform.SetNumPciSegments(10)
if clh.config.IOMMU {
platform.SetIommuSegments([]int32{0})
}
Expand All @@ -554,11 +554,6 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net
}
}

if clh.vmconfig.Platform == nil {
clh.vmconfig.Platform = chclient.NewPlatformConfig()
}
clh.vmconfig.Platform.SetNumPciSegments(10)

// Create the VM memory config via the constructor to ensure default values are properly assigned
clh.vmconfig.Memory = chclient.NewMemoryConfig(int64((utils.MemUnit(clh.config.MemorySize) * utils.MiB).ToBytes()))
// shared memory should be enabled if using vhost-user(kata uses virtiofsd)
Expand Down Expand Up @@ -898,7 +893,7 @@ func clhPciInfoToPath(pciInfo chclient.PciDeviceInfo) (types.PciPath, error) {
return types.PciPath{}, fmt.Errorf("Unexpected PCI address %q from clh hotplug", pciInfo.Bdf)
}

return types.PciPathFromString(tokens[0])
return types.PciPathFromString(pciInfo.Bdf)
}

func (clh *cloudHypervisor) hotplugAddBlockDevice(drive *config.BlockDrive) error {
Expand Down Expand Up @@ -944,7 +939,10 @@ func (clh *cloudHypervisor) hotplugAddBlockDevice(drive *config.BlockDrive) erro

// Hotplug block devices on PCI segments >= 1. PCI segment 0 is used
// for the network interface, any disks present at Guest boot time, etc.
clhDisk.SetPciSegment(int32(drive.Index)/32 + 1)
// Just bus 0 of each segment is used, and up to 31 devices can be
// plugged in to each bus.
danmihai1 marked this conversation as resolved.
Show resolved Hide resolved
pciSegment := int32(drive.Index)/31 + 1
clhDisk.SetPciSegment(pciSegment)

pciInfo, _, err := cl.VmAddDiskPut(ctx, clhDisk)

Expand All @@ -954,6 +952,12 @@ func (clh *cloudHypervisor) hotplugAddBlockDevice(drive *config.BlockDrive) erro

clh.devicesIds[driveID] = pciInfo.GetId()
drive.PCIPath, err = clhPciInfoToPath(pciInfo)
clh.Logger().
WithField("bdf", pciInfo.Bdf).
WithField("index", drive.Index).
WithField("pcipath", drive.PCIPath).
WithField("segment", pciSegment).
Debug("hotplugAddBlockDevice")

return err
}
Expand Down
55 changes: 48 additions & 7 deletions src/runtime/virtcontainers/types/pcipath.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ const (
// number), giving slots 0..31
pciSlotBits = 5
maxPciSlot = (1 << pciSlotBits) - 1

pcidomainBits = 16
maxPcidomain = (1 << pcidomainBits) - 1
)

// A PciSlot describes where a PCI device sits on a single bus
Expand All @@ -25,27 +28,65 @@ const (
//
// XXX In order to support multifunction device's we'll need to extend
// this to include the PCI 3-bit function number as well.
type PciSlot struct{ slot uint8 }
type PciSlot struct {
domain uint16
slot uint8
}

func PciSlotFromString(s string) (PciSlot, error) {
v, err := strconv.ParseUint(s, 16, pciSlotBits)
tokens := strings.Split(s, ":")
if len(tokens) == 3 {
return PciSlotFromBdfString(tokens)
} else {
device, err := PciSlotFromDeviceIndexString(s)
if err != nil {
return PciSlot{}, err
}

return PciSlot{domain: 0, slot: uint8(device)}, nil
}
}

func PciSlotFromDeviceIndexString(s string) (uint64, error) {
return strconv.ParseUint(s, 16, pciSlotBits)
}

func PciSlotFromBdfString(bdfTokens []string) (PciSlot, error) {
if bdfTokens[1] != "00" {
return PciSlot{}, fmt.Errorf("Unexpected PCI bus index %q", bdfTokens[1])
}

domain, err := strconv.ParseUint(bdfTokens[0], 16, pcidomainBits)
if err != nil {
return PciSlot{}, err
}

deviceTokens := strings.Split(bdfTokens[2], ".")
if len(deviceTokens) != 2 || deviceTokens[1] != "0" || len(deviceTokens[0]) != 2 {
return PciSlot{}, fmt.Errorf("Unexpected PCI BDF device format %q", bdfTokens[2])
}

device, err := PciSlotFromDeviceIndexString(deviceTokens[0])
if err != nil {
return PciSlot{}, err
}
// The 5 bit width passed to ParseUint ensures the value is <=
// maxPciSlot
return PciSlot{slot: uint8(v)}, nil

return PciSlot{domain: uint16(domain), slot: uint8(device)}, nil
}

func PciSlotFromInt(v int) (PciSlot, error) {
if v < 0 || v > maxPciSlot {
return PciSlot{}, fmt.Errorf("PCI slot 0x%x should be in range [0..0x%x]", v, maxPciSlot)
}
return PciSlot{slot: uint8(v)}, nil
return PciSlot{domain: 0, slot: uint8(v)}, nil
}

func (slot PciSlot) String() string {
return fmt.Sprintf("%02x", slot.slot)
if slot.domain == 0 {
return fmt.Sprintf("%02x", slot.slot)
} else {
return fmt.Sprintf("%04x:00:%02x.0", slot.domain, slot.slot)
}
}

// A PciPath describes where a PCI sits in a PCI hierarchy.
Expand Down
Loading
Loading