Other Pod(s) Status Clean up & Pod(s) stuck in Terminating state #89
base: master
Dockerfile:

@@ -19,3 +19,4 @@ USER kube-operator
 COPY --from=build /build/bin/kube-cleanup-operator .

 ENTRYPOINT ["./kube-cleanup-operator"]
Go controller source (pod cleanup logic):
@@ -24,13 +24,39 @@ func podRelatedToCronJob(pod *corev1.Pod, jobStore cache.Store) bool {
 	return false
 }

-func shouldDeletePod(pod *corev1.Pod, orphaned, pending, evicted, successful, failed time.Duration) bool {
+func shouldDeleteTerminatingPod(pod *corev1.Pod, orphaned, pending, evicted, terminating, successful, failed time.Duration) bool {

Review comment (lwolf): why do you need to pass …
+	// terminating pods which got hanged, those with or without owner references, but in Evicted state
+	// - uses c.deleteEvictedAfter, this one is tricky, because there is no timestamp of eviction.

Review comment (lwolf): I assume this comment is just a copy-paste from the …

+	// So, basically it will be removed as soon as discovered
+	if !pod.DeletionTimestamp.IsZero() {
+		podFinishTime := podFinishTime(pod)
+		if !podFinishTime.IsZero() {
+			age := time.Since(podFinishTime)
+			if terminating > 0 && age >= terminating {
+				log.Println("Pod(s) Which Are In Terminating State")

Reply (author): @lwolf I added these logs for troubleshooting and thought I had removed those lines; I will update this.

+				log.Println(pod.Name)
+				log.Println("End - Pod(s) Which Are In Terminating State")
+				return true
+			}
+		}
+	}
+	return false
+}
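The question above about the unused parameters aside, the heart of the new function is the `DeletionTimestamp` check. For comparison, here is a minimal sketch of a stuck-terminating test driven by the deletion timestamp and grace period themselves rather than by `podFinishTime`; `isStuckTerminating` and its `threshold` parameter are illustrative names, not part of this PR:

```go
package controller

import (
	"time"

	corev1 "k8s.io/api/core/v1"
)

// isStuckTerminating is an illustrative alternative, not code from this PR.
// A pod is "terminating" once DeletionTimestamp is set; if that timestamp
// plus its grace period lies further in the past than the configured
// threshold, the pod is likely stuck (unreachable kubelet, a finalizer that
// never clears, etc.).
func isStuckTerminating(pod *corev1.Pod, threshold time.Duration) bool {
	if threshold <= 0 || pod.DeletionTimestamp == nil || pod.DeletionTimestamp.IsZero() {
		return false
	}
	deadline := pod.DeletionTimestamp.Time
	if pod.DeletionGracePeriodSeconds != nil {
		deadline = deadline.Add(time.Duration(*pod.DeletionGracePeriodSeconds) * time.Second)
	}
	return time.Since(deadline) >= threshold
}
```

Keying off `DeletionTimestamp` directly also sidesteps the open question of what `podFinishTime` returns for a pod that never actually finished.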
+
+func shouldDeletePod(pod *corev1.Pod, orphaned, pending, evicted, terminated, successful, failed time.Duration) bool {
 	// evicted pods, those with or without owner references, but in Evicted state
 	// - uses c.deleteEvictedAfter, this one is tricky, because there is no timestamp of eviction.
 	// So, basically it will be removed as soon as discovered
 	if pod.Status.Phase == corev1.PodFailed && pod.Status.Reason == "Evicted" && evicted > 0 {
 		return true
 	}
+	if pod.Status.Phase == corev1.PodFailed && pod.Status.Reason == "OutOfpods" && evicted > 0 {

Review comment (lwolf): please add comment about …

+		return true
+	}
+	if pod.Status.Phase == corev1.PodFailed && pod.Status.Reason == "OutOfcpu" && evicted > 0 {
+		return true
+	}
 	owners := getPodOwnerKinds(pod)
 	podFinishTime := podFinishTime(pod)
 	if !podFinishTime.IsZero() {
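Regarding the request above: `OutOfpods` and `OutOfcpu` are reasons the kubelet sets when it rejects a pod at admission time because the node has run out of pod slots or CPU after scheduling. Like `Evicted`, such failures carry no eviction timestamp. A comment along these lines would address the request (the wording is a suggestion, not taken from the PR):

```go
// "OutOfpods"/"OutOfcpu" are kubelet admission rejections: the scheduler
// placed the pod, but by the time the kubelet admitted it the node had run
// out of pod slots or CPU. As with "Evicted" there is no timestamp for the
// rejection, so such pods are removed as soon as they are discovered,
// gated by the same evicted/deleteEvictedAfter setting.
```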
@@ -60,6 +86,12 @@ func shouldDeletePod(pod *corev1.Pod, orphaned, pending, evicted, successful, failed time.Duration) bool {
 			return false
 		}
 	}
+	if pod.Status.Phase == corev1.PodFailed && pod.Status.Reason == "Terminated" && terminated > 0 {
+		age := time.Since(podFinishTime)
+		if terminated > 0 && age >= terminated {
+			return true
+		}
+	}
 	if pod.Status.Phase == corev1.PodPending && pending > 0 {
 		t := podLastTransitionTime(pod)
 		if t.IsZero() {
Go test source:
@@ -17,6 +17,7 @@ func TestKleaner_DeletePod(t *testing.T) {
 		evicted    time.Duration
 		successful time.Duration
 		failed     time.Duration
+		terminated time.Duration
 		expected   bool
 	}{
 		"expired orphaned pods should be deleted": {
@@ -155,7 +156,7 @@
 	}
 	for name, tc := range testCases {
 		t.Run(name, func(t *testing.T) {
-			result := shouldDeletePod(tc.podSpec, tc.orphaned, tc.pending, tc.evicted, tc.successful, tc.failed)
+			result := shouldDeletePod(tc.podSpec, tc.orphaned, tc.pending, tc.evicted, tc.terminated, tc.successful, tc.failed)

Review comment (lwolf): please add a test scenario for every case that you're adding.

Reply (author): @lwolf yes, I will add test cases for each scenario and push the changes one by one for this PR.

 			if result != tc.expected {
 				t.Fatalf("failed, expected %v, got %v", tc.expected, result)
 			}
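As a starting point for the scenarios requested above, a table entry for the new `Terminated` branch might look like the sketch below. The shape of `podSpec` and the assumption that `podFinishTime` reads the container's `FinishedAt` are inferred from the surrounding table-driven test, not verified against the repository; it also assumes the test file imports `metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"`.

```go
"expired terminated pods should be deleted": {
	podSpec: &corev1.Pod{
		Status: corev1.PodStatus{
			Phase:  corev1.PodFailed,
			Reason: "Terminated",
			ContainerStatuses: []corev1.ContainerStatus{{
				State: corev1.ContainerState{
					Terminated: &corev1.ContainerStateTerminated{
						// finished two hours ago, well past the 1h threshold
						FinishedAt: metav1.NewTime(time.Now().Add(-2 * time.Hour)),
					},
				},
			}},
		},
	},
	terminated: time.Hour, // delete "Terminated" pods older than one hour
	expected:   true,
},
```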
Review comment (lwolf): As we discussed in the issue, I'd prefer not to have force-deletion in the codebase.

Reply (author): @lwolf yes, but we could keep it behind a flag. It would be useful when a Calico issue blocks workloads on a node: it would clear the pod(s) stuck in the Terminating state until the real issue is fixed.

Review comment (lwolf): No, the issue should be solved by node draining, not force-deletion.
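For context, "force-deletion" here means deleting the pod object with a zero grace period, which removes it from the API server even when the kubelet on a broken node never confirms that the containers stopped. A minimal client-go sketch of what the flag-gated behavior would amount to (illustrative only; `forceDeletePod` is a hypothetical name, and the maintainer explicitly declines to ship this):

```go
package controller

import (
	"context"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// forceDeletePod is the programmatic equivalent of
// `kubectl delete pod <name> --grace-period=0 --force`. With a zero grace
// period the API server removes the object immediately instead of waiting
// for the kubelet to acknowledge termination.
func forceDeletePod(ctx context.Context, client kubernetes.Interface, namespace, name string) error {
	zero := int64(0)
	return client.CoreV1().Pods(namespace).Delete(ctx, name, metav1.DeleteOptions{
		GracePeriodSeconds: &zero,
	})
}
```

The maintainer's objection is that this only removes the API object; nothing on the node is actually stopped, which is why draining (and fixing) the node is the safer remedy.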