Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pkg/oc/cli/admin/admin.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
migratehpa "github.com/openshift/origin/pkg/oc/cli/admin/migrate/legacyhpa"
migratestorage "github.com/openshift/origin/pkg/oc/cli/admin/migrate/storage"
migratetemplateinstances "github.com/openshift/origin/pkg/oc/cli/admin/migrate/templateinstances"
"github.com/openshift/origin/pkg/oc/cli/admin/mustgather"
"github.com/openshift/origin/pkg/oc/cli/admin/network"
"github.com/openshift/origin/pkg/oc/cli/admin/node"
"github.com/openshift/origin/pkg/oc/cli/admin/policy"
Expand Down Expand Up @@ -62,6 +63,7 @@ func NewCommandAdmin(name, fullName string, f kcmdutil.Factory, streams genericc
Commands: []*cobra.Command{
upgrade.New(f, fullName, streams),
top.NewCommandTop(top.TopRecommendedName, fullName+" "+top.TopRecommendedName, f, streams),
mustgather.NewMustGatherCommand(f, streams),
},
},
{
Expand Down
309 changes: 309 additions & 0 deletions pkg/oc/cli/admin/mustgather/mustgather.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,309 @@
package mustgather

import (
"fmt"
"math/rand"
"time"

"github.com/spf13/cobra"

corev1 "k8s.io/api/core/v1"
rbacv1 "k8s.io/api/rbac/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/cli-runtime/pkg/genericclioptions"
"k8s.io/cli-runtime/pkg/genericclioptions/printers"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/klog"
kcmdutil "k8s.io/kubernetes/pkg/kubectl/cmd/util"
"k8s.io/kubernetes/pkg/kubectl/scheme"
"k8s.io/kubernetes/pkg/kubectl/util/templates"

"github.com/openshift/origin/pkg/oc/cli/rsync"
)

var (
mustGatherLong = templates.LongDesc(`
Launch a pod to gather debugging information

This command will launch a pod in a temporary namespace on your
cluster that gathers debugging information, using a copy of the active
client config context, and then downloads the gathered information.

Experimental: This command is under active development and may change without notice.
`)

mustGatherExample = templates.Examples(`
# gather default information using the default image and command, writing into ./must-gather.local.<rand>
oc adm must-gather
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We try not to embed the name in the examples, use %[1]s instead and then
Example: fmt.Sprintf(mustGatherExample, fullName)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We try not to embed the name in the examples, use %[1]s instead and then
Example: fmt.Sprintf(mustGatherExample, fullName)

@soltysh I don't think we're gaining much with that, do you? Clayton came to a similar conclusion.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know tools that are consuming us, and just trying to be good citizen and allow that consumption. Also I'm slowly accepting the fact that both kubectl and oc are primitives that folks use to build their opinionated flows on top (see https://www.youtube.com/watch?v=ytu3aUCwlSg for example how AirBnB is wrapping kubectl just for that). But that's not a strong requirement, yet it doesn't cost us that much to have it done this way.


# gather default information with a specific local folder to copy to
oc adm must-gather --dest-dir=/local/directory

# gather default information using a specific image, command, and pod-dir
oc adm must-gather --image=my/image:tag --source-dir=/pod/directory -- myspecial-command.sh
`)
)

func NewMustGatherCommand(f kcmdutil.Factory, streams genericclioptions.IOStreams) *cobra.Command {
o := NewMustGatherOptions(streams)
rsyncCommand := rsync.NewCmdRsync(rsync.RsyncRecommendedName, "", f, streams)
cmd := &cobra.Command{
Use: "must-gather",
Short: "Launch a new instance of a pod for gathering debug information",
Long: mustGatherLong,
Example: mustGatherExample,
Hidden: true,
Run: func(cmd *cobra.Command, args []string) {
kcmdutil.CheckErr(o.Complete(f, cmd, args))
kcmdutil.CheckErr(o.Run(rsyncCommand))
},
}

cmd.Flags().StringVar(&o.NodeName, "node-name", o.NodeName, "Set a specific node to use - by default a random master will be used")
cmd.Flags().StringVar(&o.Image, "image", o.Image, "Set a specific image to use, by default the OpenShift's must-gather image will be used.")
cmd.Flags().StringVar(&o.DestDir, "dest-dir", o.DestDir, "Set a specific directory on the local machine to write gathered data to.")
cmd.Flags().BoolVar(&o.Keep, "keep", o.Keep, "Do not delete temporary resources when command completes.")
cmd.Flags().MarkHidden("keep")

return cmd
}

func NewMustGatherOptions(streams genericclioptions.IOStreams) *MustGatherOptions {
return &MustGatherOptions{IOStreams: streams}
}

func (o *MustGatherOptions) Complete(f kcmdutil.Factory, cmd *cobra.Command, args []string) error {
if i := cmd.ArgsLenAtDash(); i != -1 && i < len(args) {
o.Command = args[i:]
} else {
o.Command = args
}
var err error
if o.Config, err = f.ToRESTConfig(); err != nil {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not quite sure why you're adding these new lines after every call, I can understand grouping them logically, but not after every call. For example I'd group all config + client together, etc.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

return err
}
if o.Client, err = kubernetes.NewForConfig(o.Config); err != nil {
return err
}
if len(o.DestDir) == 0 {
o.DestDir = fmt.Sprintf("must-gather.local.%06d", rand.Int63())
}
if len(o.Image) == 0 {
// TODO lookup cluster specific default
o.Image = "quay.io/openshift/origin-must-gather:v4.0"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Set the default in NewMustGatherOptions, this way the default will be nicely presented in the command's help.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't the real default, see #22528.

}
o.PrinterCreated, err = printers.NewTypeSetter(scheme.Scheme).WrapToPrinter(&printers.NamePrinter{Operation: "created"}, nil)
if err != nil {
return err
}
o.RsyncRshCmd = rsync.DefaultRsyncRemoteShellToUse(cmd.Parent())
return nil
}

type MustGatherOptions struct {
genericclioptions.IOStreams

Config *rest.Config
Client kubernetes.Interface

NodeName string
DestDir string
Image string
Command []string
Keep bool

RsyncRshCmd string

PrinterCreated printers.ResourcePrinter
}

// Run creates and runs a must-gather pod.d
func (o *MustGatherOptions) Run(rsyncCmd *cobra.Command) error {
if len(o.Image) == 0 {
return fmt.Errorf("missing an image")
}

var err error

// create namespace
ns, err := o.Client.CoreV1().Namespaces().Create(&corev1.Namespace{
ObjectMeta: metav1.ObjectMeta{
GenerateName: "openshift-must-gather-",
Labels: map[string]string{
"openshift.io/run-level": "0",
},
Annotations: map[string]string{
"oc.openshift.io/command": "oc adm must-gather",
},
},
})
if err != nil {
return err
}
if !o.Keep {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add printf indicating what namespace we created. namespace/<name> created

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

defer func() {
err = o.Client.CoreV1().Namespaces().Delete(ns.Name, nil)
}()
} else {
o.PrinterCreated.PrintObj(ns, o.Out)
}

clusterRoleBinding, err := o.Client.RbacV1().ClusterRoleBindings().Create(o.newClusterRoleBinding(ns.Name))
if err != nil {
return err
}
if !o.Keep {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

printf indicating what clusterrolebinding.rbac.authorization.k8s.io we created

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

defer func() {
err = o.Client.RbacV1().ClusterRoleBindings().Delete(clusterRoleBinding.Name, &metav1.DeleteOptions{})
}()
} else {
o.PrinterCreated.PrintObj(clusterRoleBinding, o.Out)
}

// create pod
pod, err := o.Client.CoreV1().Pods(ns.Name).Create(o.newPod(o.NodeName))
if err != nil {
return err
}

// wait for pod to be running (gather has completed)
if err := o.waitForPodRunning(pod); err != nil {
return err
}

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a // TODO, confirm that must-gather has finish

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

must-gather is run in an init container. Pod is Pending while it's running, Running if successful, and something else if something went wrong.

// copy the gathered files into the local destination dir
err = o.copyFilesFromPod(pod)
return err
}

func (o *MustGatherOptions) copyFilesFromPod(pod *corev1.Pod) error {
rsyncOptions := &rsync.RsyncOptions{
Namespace: pod.Namespace,
Source: &rsync.PathSpec{PodName: pod.Name, Path: "/must-gather/"},
ContainerName: "copy",
Destination: &rsync.PathSpec{PodName: "", Path: o.DestDir},
Client: o.Client,
Config: o.Config,
RshCmd: fmt.Sprintf("%s --namespace=%s", o.RsyncRshCmd, pod.Namespace),
IOStreams: o.IOStreams,
}
rsyncOptions.Strategy = rsync.NewDefaultCopyStrategy(rsyncOptions)
return rsyncOptions.RunRsync()

}

func (o *MustGatherOptions) waitForPodRunning(pod *corev1.Pod) error {
phase := pod.Status.Phase
err := wait.PollImmediate(time.Second, 10*time.Minute, func() (bool, error) {
var err error
if pod, err = o.Client.CoreV1().Pods(pod.Namespace).Get(pod.Name, metav1.GetOptions{}); err != nil {
klog.Error(err)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't log the error here, if you care, log it at higher debug levels, it's not an error per se.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

removed in PR #22561

return false, nil
}
phase = pod.Status.Phase
return phase != corev1.PodPending, nil
})
if err != nil {
return err
}
if phase != corev1.PodRunning {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The switch from pending to running in most cases should be fast, but why wait for pending and then check its status after wait, and not just wait for running in wait.PollImmediate ?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might go from Pending to Failed (or even Success or Completed). With a RestartPolicy: Never, I would be waiting for a transition that might never happen if I look for Running specifically. This way we fail ASAP, instead of timing out the wait.PollIntermediate call.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right but the transition from Pending to Running might not be as fast as this check and you will fail then, b/c pod is still in pending. Maybe you should have a more explicit checks in place then.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My bad, I misread the condition in wait.PollImmediate, it's 👍

return fmt.Errorf("pod is not running: %v", phase)
}
return nil
}

func (o *MustGatherOptions) newClusterRoleBinding(ns string) *rbacv1.ClusterRoleBinding {
return &rbacv1.ClusterRoleBinding{
ObjectMeta: metav1.ObjectMeta{
GenerateName: "must-gather-",
Annotations: map[string]string{
"oc.openshift.io/command": "oc adm must-gather",
},
},
RoleRef: rbacv1.RoleRef{
APIGroup: "rbac.authorization.k8s.io",
Kind: "ClusterRole",
Name: "cluster-admin",
},
Subjects: []rbacv1.Subject{
{
Kind: "ServiceAccount",
Name: "default",
Namespace: ns,
},
},
}
}

// newPod creates a pod with 2 containers with a shared volume mount:
// - gather: init container that runs gather command
// - copy: no-op container we can exec into
func (o *MustGatherOptions) newPod(node string) *corev1.Pod {
zero := int64(0)
ret := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
GenerateName: "must-gather-",
Labels: map[string]string{
"app": "must-gather",
},
},
Spec: corev1.PodSpec{
NodeName: node,
RestartPolicy: corev1.RestartPolicyNever,
Volumes: []corev1.Volume{
{
Name: "must-gather-output",
VolumeSource: corev1.VolumeSource{
EmptyDir: &corev1.EmptyDirVolumeSource{},
},
},
},
InitContainers: []corev1.Container{
{
Name: "gather",
Image: o.Image,
Command: []string{"/bin/bash", "-c", "for resource in $RESOURCES ; do openshift-must-gather inspect ${resource} --base-dir /must-gather ; done"},
Env: []corev1.EnvVar{
{
Name: "RESOURCES",
Value: "clusteroperators certificatesigningrequests nodes machines machineconfigs ns/default ns/openshift ns/kube-system persistentvolumes volumeattachments clusternetworks hostsubnets clusterautoscaler machineautoscaler",
},
},
VolumeMounts: []corev1.VolumeMount{
{
Name: "must-gather-output",
MountPath: "/must-gather",
ReadOnly: false,
},
},
},
},
Containers: []corev1.Container{
{
Name: "copy",
Image: o.Image,
Command: []string{"/bin/bash", "-c", "trap : TERM INT; sleep infinity & wait"},
VolumeMounts: []corev1.VolumeMount{
{
Name: "must-gather-output",
MountPath: "/must-gather",
ReadOnly: false,
},
},
},
},
TerminationGracePeriodSeconds: &zero,
Tolerations: []corev1.Toleration{
{
Operator: "Exists",
},
},
},
}
if len(o.Command) > 0 {
ret.Spec.Containers[0].Command = o.Command
}
return ret
}
5 changes: 5 additions & 0 deletions test/cmd/admin.sh
Original file line number Diff line number Diff line change
Expand Up @@ -392,4 +392,9 @@ os::cmd::expect_success_and_text "oc adm prune images" "Dry run enabled - no mod
echo "images: ok"
os::test::junit::declare_suite_end

# oc adm must-gather
os::test::junit::declare_suite_start "cmd/admin/must-gather"
os::cmd::expect_success "oc adm must-gather --help"
os::test::junit::declare_suite_end

os::test::junit::declare_suite_end