In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

In [2]:
data = pd.read_json('../../../Data/Preprocessed Data/kind:feature/merged_data_with_comments.json')

In [3]:
data.head()

Unnamed: 0,comments_url,id,title,body,issue_url,pr_url,labels,pr_number,filename,status,additions,deletions,changes,all_comments
0,https://api.github.com/repos/kubernetes/kubern...,275859420,Kubelet flag precedence order vs files/ConfigM...,See https://docs.google.com/document/d/18-MsCh...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,"[area/kubelet, area/kubelet-api]",56097,"['cmd/kubelet/kubelet.go', 'hack/make-rules/te...","['modified', 'modified', 'modified', 'modified...","[7, 1, 21, 1, 1, 1, 1, 1, 1, 15, 139, 146]","[1, 0, 4, 0, 0, 1, 1, 1, 1, 0, 9, 0]","[8, 1, 25, 1, 1, 2, 2, 2, 2, 15, 148, 146]",
1,https://api.github.com/repos/kubernetes/kubead...,262492428,Individual control of preflight checks,Many times users know better than kubeadm arou...,https://github.com/kubernetes/kubeadm/issues/480,https://github.com/kubernetes/kubernetes/pull/...,[area/kubeadm],56072,['cmd/kubeadm/app/apis/kubeadm/validation/BUIL...,"['modified', 'modified', 'modified', 'modified...","[1, 26, 29, 20, 21, 1, 3, 17, 4, 2, 6, 3, 5, 6...","[0, 1, 0, 14, 15, 0, 2, 9, 0, 0, 1, 7, 1, 0, 0...","[1, 27, 29, 34, 36, 1, 5, 26, 4, 2, 7, 10, 6, ...",New example in 1.8.0:\r\nsystemctl start kubel...
2,https://api.github.com/repos/kubernetes/kubern...,275470204,seccomp is an alpha feature and not feature gated,see #55983,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,"[area/kubelet, area/kubelet-api]",55983,"['cmd/kubelet/app/options/options.go', 'cmd/ku...","['modified', 'modified', 'modified', 'modified...","[5, 6, 0, 0, 0, 0, 0, 0, 5]","[1, 3, 1, 1, 2, 4, 2, 2, 3]","[6, 9, 1, 1, 2, 4, 2, 2, 8]",
3,https://api.github.com/repos/kubernetes/kubead...,272308417,Use ComponentConfig for the kube-proxy,Important feature for v1.9; dependency for IPv...,https://github.com/kubernetes/kubeadm/issues/527,https://github.com/kubernetes/kubernetes/pull/...,[area/ipv6],55972,"['cmd/kubeadm/app/apis/kubeadm/BUILD', 'cmd/ku...","['modified', 'modified', 'modified', 'modified...","[1, 1, 36, 7, 3, 25, 7, 29, 27, 4, 6, 18, 15, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, ...","[1, 1, 36, 7, 3, 25, 7, 29, 27, 4, 6, 18, 15, ...",
4,https://api.github.com/repos/kubernetes/kubern...,251361039,Add kubeadm config for setting kube-proxy Bind...,<!-- This form is for bug reports and feature ...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/ipv6],55972,"['cmd/kubeadm/app/apis/kubeadm/BUILD', 'cmd/ku...","['modified', 'modified', 'modified', 'modified...","[1, 1, 36, 7, 3, 25, 7, 29, 27, 4, 6, 18, 15, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, ...","[1, 1, 36, 7, 3, 25, 7, 29, 27, 4, 6, 18, 15, ...",/sig cluster-lifecycle\r\n/area ipv6


# Text Preprocessing

Merge Title, Description, and Comments

In [4]:
# make new column with title, and body all together
data['all_text'] = data['title'] + ' ' + data['body'] 

Lowercasing

In [5]:
# lowercase everything
data['all_text'] = data['all_text'].str.lower()

Line break removal

In [6]:
# remove line breaks (\r, \n)
data['all_text'] = data['all_text'].str.replace('\r', ' ')
data['all_text'] = data['all_text'].str.replace('\n', ' ')

Remove Non-alphanumeric character

In [7]:
# remove non-alphanumeric characters such as punctuation, symbols, emojis, etc.
data['all_text'] = data['all_text'].str.replace(r'[^a-zA-Z0-9 ]', '')

Change the datatype to string

In [8]:
# change datatype to string
data['all_text'] = data['all_text'].astype(str)

In [9]:
# print the first 5 rows of the all_text and all_comments
print(data['all_text'].head())

0    kubelet flag precedence order vs files/configm...
1    individual control of preflight checks many ti...
2    seccomp is an alpha feature and not feature ga...
3    use componentconfig for the kube-proxy importa...
4    add kubeadm config for setting kube-proxy bind...
Name: all_text, dtype: object


In [10]:
# Remove high frequency words with threshold 0.5
threshold = 0.5
print(f"Processing with threshold: {threshold}")

# Create and fit CountVectorizer
cv = CountVectorizer(max_df=threshold)
cv.fit(data['all_text'])

# Get vocabulary from fitted vectorizer
vocabulary = cv.vocabulary_

# Filter each document to only keep words in the vocabulary
filtered_texts = []
for text in data['all_text']:
    words = text.split()
    filtered_words = [word for word in words if word in vocabulary]
    filtered_texts.append(' '.join(filtered_words))

# Store filtered texts in same column
data['all_text'] = filtered_texts

# Print statistics
print(f"  Vocabulary size: {len(vocabulary)}")
print(f"  Average words per document: {sum(len(text.split()) for text in filtered_texts) / len(filtered_texts):.2f}")# Export vocabulary to file

vocab_df = pd.DataFrame(list(vocabulary.items()), columns=['Word', 'Index'])
vocab_df = vocab_df.sort_values('Index')
vocab_df.to_csv('vocabulary.csv', index=False)


Processing with threshold: 0.5
  Vocabulary size: 10084
  Average words per document: 84.37


## Analyze the label distribution

Check the label by length

In [34]:
# Check the length (number of labels) for each issue
label_lengths = data['labels'].apply(len)

# Print label length statistics
print("Label Length Distribution:")
print(f"Mean: {label_lengths.mean():.2f}")
print(f"Min: {label_lengths.min()}")
print(f"Max: {label_lengths.max()}")

# Get counts of each label length
length_counts = Counter(label_lengths)

# Print distribution
print("\nNumber of issues by label count:")
for length, count in sorted(length_counts.items()):
    print(f"{length} label(s): {count} issues ({count/len(data)*100:.2f}%)")

Label Length Distribution:
Mean: 1.76
Min: 1
Max: 16

Number of issues by label count:
1 label(s): 508 issues (58.39%)
2 label(s): 233 issues (26.78%)
3 label(s): 61 issues (7.01%)
4 label(s): 33 issues (3.79%)
5 label(s): 12 issues (1.38%)
6 label(s): 9 issues (1.03%)
7 label(s): 8 issues (0.92%)
8 label(s): 1 issues (0.11%)
10 label(s): 1 issues (0.11%)
11 label(s): 1 issues (0.11%)
13 label(s): 1 issues (0.11%)
14 label(s): 1 issues (0.11%)
16 label(s): 1 issues (0.11%)


Filter the label length (max 5)

In [35]:
# Original size of the dataset
original_size = len(data)

# Filter data to only include entries with 5 or fewer labels
data = data[data['labels'].apply(len) <= 5]

# Print information about the filtered dataset
print(f"Dataset size after filtering: {len(data)} entries")

print(f"Percentage of original data kept: {len(data)/original_size*100:.2f}%")

Dataset size after filtering: 847 entries
Percentage of original data kept: 97.36%


In [29]:
# Recalculate the label length statistics
filtered_label_lengths = data['labels'].apply(len)
filtered_length_counts = Counter(filtered_label_lengths)

print("\nFiltered Label Length Distribution:")
print(f"Mean: {filtered_label_lengths.mean():.2f}")
print(f"Min: {filtered_label_lengths.min()}")
print(f"Max: {filtered_label_lengths.max()}")

# Print distribution
print("\nNumber of issues by label count:")
for length, count in sorted(filtered_length_counts.items()):
    print(f"{length} label(s): {count} issues ({count/len(data)*100:.2f}%)")


Filtered Label Length Distribution:
Mean: 1.41
Min: 1
Max: 5

Number of issues by label count:
1 label(s): 894 issues (72.33%)
2 label(s): 251 issues (20.31%)
3 label(s): 46 issues (3.72%)
4 label(s): 22 issues (1.78%)
5 label(s): 23 issues (1.86%)


Check the label distribution

In [36]:
# Flatten the list of labels if it's a list of lists
flattened_labels = [label for sublist in data['labels'] for label in sublist]

# Print labels distribution count
label_counts = Counter(flattened_labels)
print("Label Distribution Count:")
for label, count in label_counts.items():
    print(f"{label}: {count}")

Label Distribution Count:
area/kubelet: 157
area/kubelet-api: 5
area/kubeadm: 85
area/ipv6: 4
area/hw-accelerators: 2
area/provider/openstack: 3
area/client-libraries: 2
area/apiserver: 156
area/etcd: 1
area/kubectl: 132
area/code-generation: 61
area/test: 433
area/conformance: 21
area/cloudprovider: 52
area/provider/azure: 14
area/dependency: 51
area/ipvs: 9
area/e2e-test-framework: 86
area/custom-resources: 5
area/admission-control: 5
area/security: 1
area/test-infra: 1
area/release-eng: 22
area/provider/gcp: 15
area/code-organization: 2
area/batch: 1
area/workload-api/job: 1
area/network-policy: 3
area/kube-proxy: 13
area/stable-metrics: 4
area/logging: 2


Remove labels that have occurence < 5

In [37]:
# Get original data size for comparison
original_size = len(data)

# Filter the labels that have less than 5 occurrences
rare_labels = [label for label, count in label_counts.items() if count < 5]
common_labels = [label for label, count in label_counts.items() if count >= 5]

print(f"Before filtering: {len(flattened_labels)} total label occurrences")
print(f"Removing {len(rare_labels)} rare label types with fewer than 5 occurrences:")
for label in rare_labels:
    print(f"  - {label}: {label_counts[label]} occurrences")

# Update data to keep only issues that have at least one common label
data['labels'] = data['labels'].apply(lambda x: [label for label in x if label in common_labels])
data = data[data['labels'].apply(len) > 0]

# Print statistics after filtering
remaining_size = len(data)
removed_count = original_size - remaining_size
removed_percentage = (removed_count / original_size) * 100

print(f"\nAfter filtering:")
print(f"  - Original dataset size: {original_size} issues")
print(f"  - Remaining dataset size: {remaining_size} issues")
print(f"  - Removed {removed_count} issues ({removed_percentage:.2f}%)")

# Update flattened_labels to reflect the current state
flattened_labels = [label for sublist in data['labels'] for label in sublist]
print(f"  - Remaining label occurrences: {len(flattened_labels)}")

Before filtering: 1349 total label occurrences
Removing 13 rare label types with fewer than 5 occurrences:
  - area/ipv6: 4 occurrences
  - area/hw-accelerators: 2 occurrences
  - area/provider/openstack: 3 occurrences
  - area/client-libraries: 2 occurrences
  - area/etcd: 1 occurrences
  - area/security: 1 occurrences
  - area/test-infra: 1 occurrences
  - area/code-organization: 2 occurrences
  - area/batch: 1 occurrences
  - area/workload-api/job: 1 occurrences
  - area/network-policy: 3 occurrences
  - area/stable-metrics: 4 occurrences
  - area/logging: 2 occurrences

After filtering:
  - Original dataset size: 847 issues
  - Remaining dataset size: 835 issues
  - Removed 12 issues (1.42%)
  - Remaining label occurrences: 1322


In [38]:
# Print labels distribution count
label_counts = Counter(flattened_labels)
print("Label Distribution Count:")

for i, (label, count) in enumerate(sorted(label_counts.items(), key=lambda x: x[1], reverse=True)):
    print(f"{i+1}. {label}: {count}")

Label Distribution Count:
1. area/test: 433
2. area/kubelet: 157
3. area/apiserver: 156
4. area/kubectl: 132
5. area/e2e-test-framework: 86
6. area/kubeadm: 85
7. area/code-generation: 61
8. area/cloudprovider: 52
9. area/dependency: 51
10. area/release-eng: 22
11. area/conformance: 21
12. area/provider/gcp: 15
13. area/provider/azure: 14
14. area/kube-proxy: 13
15. area/ipvs: 9
16. area/kubelet-api: 5
17. area/custom-resources: 5
18. area/admission-control: 5


In [39]:
# Set the random seed for reproducibility (same seed used in DeBERTa script)
import numpy as np
np.random.seed(42)

# Prepare for train-test split
from sklearn.preprocessing import MultiLabelBinarizer

# Encode multi-labels using MultiLabelBinarizer
print("Encoding labels...")
mlb = MultiLabelBinarizer()
labels_encoded = mlb.fit_transform(data['labels'])

# Split data into training and validation sets (80% training, 20% validation)
split_idx = int(len(data) * 0.8)
train_data = data.iloc[:split_idx].reset_index(drop=True)
val_data = data.iloc[split_idx:].reset_index(drop=True)
train_labels = labels_encoded[:split_idx]
val_labels = labels_encoded[split_idx:]

print(f"Training samples: {len(train_data)}, Validation samples: {len(val_data)}")



Encoding labels...
Training samples: 668, Validation samples: 167


In [40]:
# export the train_data and val_data to csv
train_data.to_csv('train_data.csv', index=False)
val_data.to_csv('val_data.csv', index=False)

# # export the train_labels and val_labels to csv
# # Convert numpy arrays to DataFrame before exporting to csv
# train_labels_df = pd.DataFrame(train_labels)
# val_labels_df = pd.DataFrame(val_labels)
# train_labels_df.to_csv('train_labels.csv', index=False)
# val_labels_df.to_csv('val_labels.csv', index=False)