Other Pod(s) Status Clean up & Pod(s) stuck in Terminating state #89

Open · wants to merge 3 commits into base: master
1 change: 1 addition & 0 deletions Dockerfile
@@ -19,3 +19,4 @@ USER kube-operator
COPY --from=build /build/bin/kube-cleanup-operator .

ENTRYPOINT ["./kube-cleanup-operator"]

12 changes: 9 additions & 3 deletions cmd/main.go
@@ -49,6 +49,8 @@ func main() {
deleteFailedAfter := flag.Duration("delete-failed-after", 0, "Delete jobs and pods in failed state after X duration (golang duration format, e.g 5m), 0 - never delete")
deleteOrphanedAfter := flag.Duration("delete-orphaned-pods-after", 1*time.Hour, "Delete orphaned pods. Pods without an owner in non-running state (golang duration format, e.g 5m), 0 - never delete")
deleteEvictedAfter := flag.Duration("delete-evicted-pods-after", 15*time.Minute, "Delete pods in evicted state (golang duration format, e.g 5m), 0 - never delete")
deleteTerminatedAfter := flag.Duration("delete-terminated-pods-after", 0, "Delete pods in terminated/failed state (golang duration format, e.g 5m), 0 - never delete")
deleteTerminatingAfter := flag.Duration("delete-terminating-pods-after", 0, "Delete pods which are stuck in terminating state (golang duration format, e.g 5m), 0 - never delete")
deletePendingAfter := flag.Duration("delete-pending-pods-after", 0, "Delete pods in pending state after X duration (golang duration format, e.g 5m), 0 - never delete")
ignoreOwnedByCronjob := flag.Bool("ignore-owned-by-cronjobs", false, "[EXPERIMENTAL] Do not cleanup pods and jobs created by cronjobs")

@@ -58,9 +60,9 @@ func main() {
legacyMode := flag.Bool("legacy-mode", true, "Legacy mode: `true` - use old `keep-*` flags, `false` - enable new `delete-*-after` flags")

dryRun := flag.Bool("dry-run", false, "Print only, do not delete anything.")

labelSelector := flag.String("label-selector", "", "Delete only jobs and pods that meet label selector requirements")

flag.Parse()
setupLogging()

@@ -74,13 +76,15 @@ func main() {
optsInfo.WriteString(fmt.Sprintf("\tdelete-pending-after: %s\n", *deletePendingAfter))
optsInfo.WriteString(fmt.Sprintf("\tdelete-orphaned-after: %s\n", *deleteOrphanedAfter))
optsInfo.WriteString(fmt.Sprintf("\tdelete-evicted-after: %s\n", *deleteEvictedAfter))
optsInfo.WriteString(fmt.Sprintf("\tdelete-terminated-after: %s\n", *deleteTerminatedAfter))
optsInfo.WriteString(fmt.Sprintf("\tdelete-terminating-after: %s\n", *deleteTerminatingAfter))
optsInfo.WriteString(fmt.Sprintf("\tignore-owned-by-cronjobs: %v\n", *ignoreOwnedByCronjob))

optsInfo.WriteString(fmt.Sprintf("\n\tlegacy-mode: %v\n", *legacyMode))
optsInfo.WriteString(fmt.Sprintf("\tkeep-successful: %d\n", *legacyKeepSuccessHours))
optsInfo.WriteString(fmt.Sprintf("\tkeep-failures: %d\n", *legacyKeepFailedHours))
optsInfo.WriteString(fmt.Sprintf("\tkeep-pending: %d\n", *legacyKeepPendingHours))

optsInfo.WriteString(fmt.Sprintf("\tlabel-selector: %s\n", *labelSelector))
log.Println(optsInfo.String())

@@ -133,6 +137,8 @@ func main() {
*deletePendingAfter,
*deleteOrphanedAfter,
*deleteEvictedAfter,
*deleteTerminatedAfter,
*deleteTerminatingAfter,
*ignoreOwnedByCronjob,
*labelSelector,
stopCh,
3 changes: 0 additions & 3 deletions go.sum
@@ -1,5 +1,4 @@
cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
cloud.google.com/go v0.34.0 h1:eOI3/cP2VTU6uZLDYAoic+eyzzB9YyGmJ7eIjl8rOPg=
cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
cloud.google.com/go v0.38.0 h1:ROfEUZz+Gh5pa62DJWXSaonyu3StP6EA6lPEXPI6mCo=
cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU=
@@ -111,7 +110,6 @@ github.com/valyala/histogram v1.0.1 h1:FzA7n2Tz/wKRMejgu3PV1vw3htAklTjjuoI6z3d4K
github.com/valyala/histogram v1.0.1/go.mod h1:lQy0xA4wUz2+IUnf97SivorsJIp8FxsnRd6x25q7Mto=
go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU=
golang.org/x/crypto v0.0.0-20190211182817-74369b46fc67/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2 h1:VklqNMn3ovrHsnt90PveolxSbWFaJdECFbxSq0Mqo2M=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20200220183623-bac4c82f6975 h1:/Tl7pH94bvbAAHBdZJT947M/+gp0+CqQXDtMRC0fseo=
golang.org/x/crypto v0.0.0-20200220183623-bac4c82f6975/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
@@ -163,7 +161,6 @@ golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3
golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/appengine v1.4.0 h1:/wp5JvzpHIxhs/dumFmF7BXTf3Z+dd4uXta4kVyO508=
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/appengine v1.5.0 h1:KxkO13IPW4Lslp2bz+KHP2E3gtFlrIGNThxkZQ3g+4c=
google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
70 changes: 48 additions & 22 deletions pkg/controller/controller.go
@@ -44,15 +44,17 @@ type Kleaner struct {
jobInformer cache.SharedIndexInformer
kclient *kubernetes.Clientset

deleteSuccessfulAfter time.Duration
deleteFailedAfter time.Duration
deletePendingAfter time.Duration
deleteOrphanedAfter time.Duration
deleteEvictedAfter time.Duration
deleteSuccessfulAfter time.Duration
deleteFailedAfter time.Duration
deletePendingAfter time.Duration
deleteOrphanedAfter time.Duration
deleteEvictedAfter time.Duration
deleteTerminatedAfter time.Duration
deleteTerminatingAfter time.Duration

ignoreOwnedByCronjob bool
labelSelector string

labelSelector string

dryRun bool
ctx context.Context
@@ -61,7 +63,7 @@ type Kleaner struct {

// NewKleaner creates a new NewKleaner
func NewKleaner(ctx context.Context, kclient *kubernetes.Clientset, namespace string, dryRun bool, deleteSuccessfulAfter,
deleteFailedAfter, deletePendingAfter, deleteOrphanedAfter, deleteEvictedAfter time.Duration, ignoreOwnedByCronjob bool,
deleteFailedAfter, deletePendingAfter, deleteOrphanedAfter, deleteEvictedAfter time.Duration, deleteTerminatedAfter time.Duration, deleteTerminatingAfter time.Duration, ignoreOwnedByCronjob bool,
labelSelector string,
stopCh <-chan struct{}) *Kleaner {
jobInformer := cache.NewSharedIndexInformer(
@@ -96,17 +98,19 @@ func NewKleaner(ctx context.Context, kclient *kubernetes.Clientset, namespace st
cache.Indexers{},
)
kleaner := &Kleaner{
dryRun: dryRun,
kclient: kclient,
ctx: ctx,
stopCh: stopCh,
deleteSuccessfulAfter: deleteSuccessfulAfter,
deleteFailedAfter: deleteFailedAfter,
deletePendingAfter: deletePendingAfter,
deleteOrphanedAfter: deleteOrphanedAfter,
deleteEvictedAfter: deleteEvictedAfter,
ignoreOwnedByCronjob: ignoreOwnedByCronjob,
labelSelector: labelSelector,
dryRun: dryRun,
kclient: kclient,
ctx: ctx,
stopCh: stopCh,
deleteSuccessfulAfter: deleteSuccessfulAfter,
deleteFailedAfter: deleteFailedAfter,
deletePendingAfter: deletePendingAfter,
deleteOrphanedAfter: deleteOrphanedAfter,
deleteEvictedAfter: deleteEvictedAfter,
deleteTerminatedAfter: deleteTerminatedAfter,
deleteTerminatingAfter: deleteTerminatingAfter,
ignoreOwnedByCronjob: ignoreOwnedByCronjob,
labelSelector: labelSelector,
}
jobInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
UpdateFunc: func(old, new interface{}) {
@@ -171,16 +175,19 @@ func (c *Kleaner) Process(obj interface{}) {
}
case *corev1.Pod:
pod := t
// skip pods that are already in the deleting process
if !pod.DeletionTimestamp.IsZero() {

if c.deleteTerminatingAfter > 0 && shouldDeleteTerminatingPod(t, c.deleteOrphanedAfter, c.deletePendingAfter, c.deleteEvictedAfter, c.deleteTerminatingAfter, c.deleteSuccessfulAfter, c.deleteFailedAfter) {
c.DeletePodWithForce(t)
} else {
// skip pods that are already in the deleting process
return
}
// skip pods related to jobs created by cronjobs if `ignoreOwnedByCronjob` is set
if c.ignoreOwnedByCronjob && podRelatedToCronJob(pod, c.jobInformer.GetStore()) {
return
}
// normal cleanup flow
if shouldDeletePod(t, c.deleteOrphanedAfter, c.deletePendingAfter, c.deleteEvictedAfter, c.deleteSuccessfulAfter, c.deleteFailedAfter) {
if shouldDeletePod(t, c.deleteOrphanedAfter, c.deletePendingAfter, c.deleteEvictedAfter, c.deleteTerminatedAfter, c.deleteSuccessfulAfter, c.deleteFailedAfter) {
c.DeletePod(t)
}
}
@@ -216,3 +223,22 @@ func (c *Kleaner) DeletePod(pod *corev1.Pod) {
}
metrics.GetOrCreateCounter(metricName(podDeletedMetric, pod.Namespace)).Inc()
}

// DeletePodWithForce force-deletes a pod that is stuck in the Terminating state.
// Note: this only removes the stuck pod; it does not address the root cause of why the pod is stuck.
func (c *Kleaner) DeletePodWithForce(pod *corev1.Pod) {
Owner: as we discussed in the issue, I'd prefer not to have force-deletion in the codebase.

Author: @lwolf yes, but we can keep it behind the flag. It is useful when a Calico issue blocks the workloads on a node: it cleans up the pods that are stuck in the Terminating state until the real issue is fixed.

Owner: no, the issue should be solved by node draining, not force-deletion.
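
For context on that suggestion: draining evicts pods through the Eviction API rather than deleting them with a zero grace period. A rough, hypothetical sketch of what an eviction-based helper could look like in this controller (not part of this PR; it assumes the vendored client-go exposes the policy/v1 Eviction client — older versions expose the same Evict call under policy/v1beta1 — plus the corresponding policyv1 and metav1 imports):

// Hypothetical sketch only: evict the stuck pod gracefully instead of force-deleting it.
// Eviction honours PodDisruptionBudgets and the pod's terminationGracePeriodSeconds,
// which is the mechanism `kubectl drain` relies on.
func (c *Kleaner) evictPod(pod *corev1.Pod) error {
	eviction := &policyv1.Eviction{
		ObjectMeta: metav1.ObjectMeta{Name: pod.Name, Namespace: pod.Namespace},
	}
	return c.kclient.PolicyV1().Evictions(pod.Namespace).Evict(c.ctx, eviction)
}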

if c.dryRun {
log.Printf("dry-run: Pod '%s:%s' would have been deleted", pod.Namespace, pod.Name)
return
}
log.Printf("Deleting terminating pod with force '%s/%s'", pod.Namespace, pod.Name)
var po metav1.DeleteOptions
gracePeriodSeconds := int64(0)
po.GracePeriodSeconds = &gracePeriodSeconds
if err := c.kclient.CoreV1().Pods(pod.Namespace).Delete(c.ctx, pod.Name, po); ignoreNotFound(err) != nil {
log.Printf("failed to delete pod '%s:%s': %v", pod.Namespace, pod.Name, err)
metrics.GetOrCreateCounter(metricName(podDeletedFailedMetric, pod.Namespace)).Inc()
return
}
metrics.GetOrCreateCounter(metricName(podDeletedMetric, pod.Namespace)).Inc()
}
34 changes: 33 additions & 1 deletion pkg/controller/pod.go
@@ -24,13 +24,39 @@ func podRelatedToCronJob(pod *corev1.Pod, jobStore cache.Store) bool {
return false
}

func shouldDeletePod(pod *corev1.Pod, orphaned, pending, evicted, successful, failed time.Duration) bool {
func shouldDeleteTerminatingPod(pod *corev1.Pod, orphaned, pending, evicted, terminating, successful, failed time.Duration) bool {
Owner: why do you need to pass the orphaned, pending, evicted, successful, and failed durations if you only use terminating?
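
If only the terminating threshold is consulted, the signature could be trimmed to something like this sketch (a suggestion, not what the PR currently does; it reuses the existing podFinishTime helper from this file):

// Sketch: a pod that already carries a DeletionTimestamp only needs the terminating threshold.
func shouldDeleteTerminatingPod(pod *corev1.Pod, terminating time.Duration) bool {
	if pod.DeletionTimestamp.IsZero() || terminating <= 0 {
		return false
	}
	finishTime := podFinishTime(pod)
	return !finishTime.IsZero() && time.Since(finishTime) >= terminating
}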

// terminating pods which got hanged, those with or without owner references, but in Evicted state
// - uses c.deleteEvictedAfter, this one is tricky, because there is no timestamp of eviction.
Owner: I assume this comment is just a copy-paste from shouldDeletePod, because it doesn't make any sense here.

// So, basically it will be removed as soon as discovered

if !pod.DeletionTimestamp.IsZero() {
podFinishTime := podFinishTime(pod)
if !podFinishTime.IsZero() {
age := time.Since(podFinishTime)
if terminating > 0 && age >= terminating {
log.Println("Pod(s) Which Are In Terminating State")
Owner: log.Printf instead of 3 loglines

Author: @lwolf I added these logs for troubleshooting and thought I had removed them; I will update this.
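
For reference, the three log lines above could collapse into a single call along these lines (sketch; pod and age are already in scope at that point):

log.Printf("pod %s/%s is stuck in Terminating state (finished %s ago), cleaning it up", pod.Namespace, pod.Name, age)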

log.Println(pod.Name)
log.Println("End - Pod(s) Which Are In Terminating State")
return true
}
}
}
return false
}

func shouldDeletePod(pod *corev1.Pod, orphaned, pending, evicted, terminated, successful, failed time.Duration) bool {
// evicted pods, those with or without owner references, but in Evicted state
// - uses c.deleteEvictedAfter, this one is tricky, because there is no timestamp of eviction.
// So, basically it will be removed as soon as discovered
if pod.Status.Phase == corev1.PodFailed && pod.Status.Reason == "Evicted" && evicted > 0 {
return true
}
if pod.Status.Phase == corev1.PodFailed && pod.Status.Reason == "OutOfpods" && evicted > 0 {
Owner: please add a comment about the OutOfpods and OutOfcpu reasons, or a link to the docs describing their behavior.
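
A comment along the following lines could sit above the two checks (a sketch based on my understanding of kubelet admission failures; the exact wording should be confirmed against the upstream docs the owner mentions):

// "OutOfpods" and "OutOfcpu" are set by the kubelet when it rejects a pod at admission
// time because the node has run out of the corresponding resource (for example when the
// scheduler and the kubelet briefly disagree about remaining capacity). Such pods never
// run and stay in the Failed phase, so they are treated like evicted pods and removed
// as soon as they are discovered.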

return true
}
if pod.Status.Phase == corev1.PodFailed && pod.Status.Reason == "OutOfcpu" && evicted > 0 {
return true
}
owners := getPodOwnerKinds(pod)
podFinishTime := podFinishTime(pod)
if !podFinishTime.IsZero() {
@@ -60,6 +86,12 @@ func shouldDeletePod(pod *corev1.Pod, orphaned, pending, evicted, successful, fa
return false
}
}
if pod.Status.Phase == corev1.PodFailed && pod.Status.Reason == "Terminated" && terminated > 0 {
age := time.Since(podFinishTime)
if terminated > 0 && age >= terminated {
return true
}
}
if pod.Status.Phase == corev1.PodPending && pending > 0 {
t := podLastTransitionTime(pod)
if t.IsZero() {
3 changes: 2 additions & 1 deletion pkg/controller/pod_test.go
@@ -17,6 +17,7 @@ func TestKleaner_DeletePod(t *testing.T) {
evicted time.Duration
successful time.Duration
failed time.Duration
terminated time.Duration
expected bool
}{
"expired orphaned pods should be deleted": {
@@ -155,7 +156,7 @@
}
for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
result := shouldDeletePod(tc.podSpec, tc.orphaned, tc.pending, tc.evicted, tc.successful, tc.failed)
result := shouldDeletePod(tc.podSpec, tc.orphaned, tc.pending, tc.evicted, tc.terminated, tc.successful, tc.failed)
Owner: please add a test scenario for every case that you're adding

Author: @lwolf yes, I will add test cases for each scenario and push them to this PR one by one.
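
As a starting point, one possible table entry for the new Terminated reason could look like the sketch below. The field names mirror the surrounding cases; the pod fixture sets the finish time both via the Ready condition and via the container's terminated state so that podFinishTime picks it up regardless of which source it reads, and it assumes the test file already imports corev1 and metav1.

"failed pods with reason Terminated should be deleted once older than the terminated duration": {
	podSpec: &corev1.Pod{
		Status: corev1.PodStatus{
			Phase:  corev1.PodFailed,
			Reason: "Terminated",
			// finish time ~2h in the past, expressed both ways so the helper finds it
			Conditions: []corev1.PodCondition{
				{
					Type:               corev1.PodReady,
					Status:             corev1.ConditionFalse,
					LastTransitionTime: metav1.NewTime(time.Now().Add(-2 * time.Hour)),
				},
			},
			ContainerStatuses: []corev1.ContainerStatus{
				{
					State: corev1.ContainerState{
						Terminated: &corev1.ContainerStateTerminated{
							FinishedAt: metav1.NewTime(time.Now().Add(-2 * time.Hour)),
						},
					},
				},
			},
		},
	},
	terminated: time.Hour,
	expected:   true,
},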

if result != tc.expected {
t.Fatalf("failed, expected %v, got %v", tc.expected, result)
}