In [None]:
import pandas as pd
from spacy.cli import download
from sklearn.feature_extraction.text import CountVectorizer
import spacy
import Counter

In [27]:
data = pd.read_json('../../Data/Preprocessed Data/kind:bug/merged_data_with_comments.json')

In [28]:
data.head()

Unnamed: 0,comments_url,id,title,body,issue_url,pr_url,labels,pr_number,filename,status,additions,deletions,changes,all_comments
0,https://api.github.com/repos/kubernetes/kubern...,2639668210,kubelet crash: fatal error: concurrent map writes,### What happened?\n\nWhile looking into three...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/kubelet],128657,"['pkg/kubelet/cm/container_manager_linux.go', ...","['modified', 'modified', 'modified', 'modified']","[3, 4, 18, 75]","[3, 3, 6, 0]","[6, 7, 24, 75]",/sig node Thought to search for similar errors...
1,https://api.github.com/repos/kubernetes/kubern...,2617512099,[FG:InPlacePodVerticalScaling] failed to verif...,### What happened?\n\nOne line bug description...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/kubelet],126620,"['pkg/kubelet/status/state/checkpoint.go', 'pk...","['modified', 'modified', 'added']","[42, 23, 166]","[26, 17, 0]","[68, 40, 166]",This issue is currently awaiting triage.\n\nIf...
2,https://api.github.com/repos/kubernetes/kubern...,2604613192,Restore build-tag flag for code-generator,### What happened?\n\nThe `build-tag` flag is ...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/code-generation],128259,['staging/src/k8s.io/code-generator/cmd/conver...,"['modified', 'modified', 'modified', 'modified...","[12, 1, 1, 12, 1, 1]","[2, 1, 1, 1, 1, 1]","[14, 2, 2, 13, 2, 2]",@p0lyn0mial FYI /sig api-machinery /triage acc...
3,https://api.github.com/repos/kubernetes/kubern...,2596132738,[Failing Tests] ci-crio-cgroupv1-node-e2e-conf...,### Which jobs are failing?\n* master-blocking...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,"[area/kubeadm, area/dependency]",128175,"['go.mod', 'go.sum', 'hack/unwanted-dependenci...","['modified', 'modified', 'modified', 'modified...","[1, 3, 1, 21, 9, 9, 4, 0, 10, 2, 2, 5, 8, 1]","[1, 2, 0, 98, 8, 7, 6, 62, 11, 1, 4, 8, 11, 1]","[2, 5, 1, 119, 17, 16, 10, 62, 21, 3, 6, 13, 1...",@drewhagen: The provided milestone is not vali...
4,https://api.github.com/repos/kubernetes/kubern...,2591358936,Crash on kube manager's service-lb-controller ...,### What happened?\n\nIf kube manager is start...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/cloudprovider],128182,['cmd/kube-controller-manager/app/controllerma...,"['modified', 'modified', 'modified']","[26, 12, 5]","[0, 1, 4]","[26, 13, 9]",This issue is currently awaiting triage.\n\nIf...


# Text Preprocessing

Merge Title, Description, and Comments

In [29]:
# make new column with title, body, and comments all together
# data['all_text'] = data['title'] + ' ' + data['body'] + ' ' + data['all_comments']
data['all_text'] = data['title'] + ' ' + data['body'] 

Lowercasing

In [30]:
# lowercase everything
data['all_text'] = data['all_text'].str.lower()

Line break removal

In [31]:
# remove line breaks (\r, \n)
data['all_text'] = data['all_text'].str.replace('\r', ' ')
data['all_text'] = data['all_text'].str.replace('\n', ' ')

Remove Non-alphanumeric character

In [32]:
# remove non-alphanumeric characters such as punctuation, symbols, emojis, etc.
data['all_text'] = data['all_text'].str.replace(r'[^a-zA-Z0-9 ]', '')

Change the datatype to string

In [33]:
# change datatype to string
data['all_text'] = data['all_text'].astype(str)

Stopwords Removal

In [34]:
# remove stopwords using spaCy
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    download('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')

data['all_text'] = data['all_text'].apply(lambda x: ' '.join([word.text for word in nlp(x) if not word.is_stop]))

Data Lemmatization

In [35]:
# lematize the text using spaCy
data['all_text'] = data['all_text'].apply(lambda x: ' '.join([word.lemma_ for word in nlp(x)]))

Remove High Frequency Words

In [None]:
# Remove high frequency words with different thresholds
threshold = 0.9
print(f"Processing with threshold: {threshold}")

# Create and fit CountVectorizer
cv = CountVectorizer(max_df=threshold)
cv.fit(data['all_text'])

# Get vocabulary from fitted vectorizer
vocabulary = cv.vocabulary_

# Filter each document to only keep words in the vocabulary
filtered_texts = []
for text in data['all_text']:
    words = text.split()
    filtered_words = [word for word in words if word in vocabulary]
    filtered_texts.append(' '.join(filtered_words))

# Store filtered texts
data[f'all_text'] = filtered_texts

# Print statistics
print(f"  Vocabulary size: {len(vocabulary)}")
print(f"  Average words per document: {sum(len(text.split()) for text in filtered_texts) / len(filtered_texts):.2f}")


Processing with threshold: 0.5
  Vocabulary size: 26632
  Average words per document: 160.55
Processing with threshold: 0.6
  Vocabulary size: 26641
  Average words per document: 164.91
Processing with threshold: 0.7
  Vocabulary size: 26652
  Average words per document: 174.07
Processing with threshold: 0.8
  Vocabulary size: 26659
  Average words per document: 178.25
Processing with threshold: 0.9
  Vocabulary size: 26664
  Average words per document: 188.45


Remove comments_url, id, title, body, issue_url, pr_url, and all_comments

In [37]:
data.head(5)

Unnamed: 0,comments_url,id,title,body,issue_url,pr_url,labels,pr_number,filename,status,additions,deletions,changes,all_comments,all_text,all_text_0.5,all_text_0.6,all_text_0.7,all_text_0.8,all_text_0.9
0,https://api.github.com/repos/kubernetes/kubern...,2639668210,kubelet crash: fatal error: concurrent map writes,### What happened?\n\nWhile looking into three...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/kubelet],128657,"['pkg/kubelet/cm/container_manager_linux.go', ...","['modified', 'modified', 'modified', 'modified']","[3, 4, 18, 75]","[3, 3, 6, 0]","[6, 7, 24, 75]",/sig node Thought to search for similar errors...,kubelet crash : fatal error : concurrent map w...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...
1,https://api.github.com/repos/kubernetes/kubern...,2617512099,[FG:InPlacePodVerticalScaling] failed to verif...,### What happened?\n\nOne line bug description...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/kubelet],126620,"['pkg/kubelet/status/state/checkpoint.go', 'pk...","['modified', 'modified', 'added']","[42, 23, 166]","[26, 17, 0]","[68, 40, 166]",This issue is currently awaiting triage.\n\nIf...,[ fg : inplacepodverticalscale ] fail verify p...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...
2,https://api.github.com/repos/kubernetes/kubern...,2604613192,Restore build-tag flag for code-generator,### What happened?\n\nThe `build-tag` flag is ...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/code-generation],128259,['staging/src/k8s.io/code-generator/cmd/conver...,"['modified', 'modified', 'modified', 'modified...","[12, 1, 1, 12, 1, 1]","[2, 1, 1, 1, 1, 1]","[14, 2, 2, 13, 2, 2]",@p0lyn0mial FYI /sig api-machinery /triage acc...,restore build - tag flag code - generator # # ...,restore build tag flag code generator build ta...,restore build tag flag code generator build ta...,restore build tag flag code generator build ta...,restore build tag flag code generator build ta...,restore build tag flag code generator happen b...
3,https://api.github.com/repos/kubernetes/kubern...,2596132738,[Failing Tests] ci-crio-cgroupv1-node-e2e-conf...,### Which jobs are failing?\n* master-blocking...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,"[area/kubeadm, area/dependency]",128175,"['go.mod', 'go.sum', 'hack/unwanted-dependenci...","['modified', 'modified', 'modified', 'modified...","[1, 3, 1, 21, 9, 9, 4, 0, 10, 2, 2, 5, 8, 1]","[1, 2, 0, 98, 8, 7, 6, 62, 11, 1, 4, 8, 11, 1]","[2, 5, 1, 119, 17, 16, 10, 62, 21, 3, 6, 13, 1...",@drewhagen: The provided milestone is not vali...,[ fail test ] ci-crio-cgroupv1-node-e2e-confor...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...
4,https://api.github.com/repos/kubernetes/kubern...,2591358936,Crash on kube manager's service-lb-controller ...,### What happened?\n\nIf kube manager is start...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/cloudprovider],128182,['cmd/kube-controller-manager/app/controllerma...,"['modified', 'modified', 'modified']","[26, 12, 5]","[0, 1, 4]","[26, 13, 9]",This issue is currently awaiting triage.\n\nIf...,crash kube manager service - lb - controller v...,crash kube manager service lb controller kube ...,crash kube manager service lb controller kube ...,crash kube manager service lb controller kube ...,crash kube manager service lb controller kube ...,crash kube manager service lb controller happe...


In [None]:
from collections import Counter

# Get the labels from the data
labels = data['labels']

# Filter out empty labels and create a flat list
filtered_labels = [label_list for label_list in labels if isinstance(label_list, list) and label_list]

# Count the number of labels distribution
label_distribution = Counter([label for labels in filtered_labels for label in labels])
print('\nLabel Distribution:')
for i, (label, count) in enumerate(sorted(label_distribution.items(), key=lambda x: x[1], reverse=True)):
    print(f'{i}. {label}: {count}')

In [None]:
label_length_distribution = Counter([len(labels) for labels in filtered_labels])
print('\nLabel count per row distribution:')
for label in sorted(label_length_distribution.keys()):
    print(f'Label: {label}, count: {label_length_distribution[label]}')

In [None]:
# Filter out rows with 'nan' text in the all_text column
before_nan_filter = len(data)
data = data[~data['all_text'].apply(lambda x: x.startswith('nan') if isinstance(x, str) else False)]
nan_removed = before_nan_filter - len(data)
print(f"Removed {nan_removed} rows with 'nan' text ({nan_removed/before_nan_filter*100:.2f}% of data)")

In [None]:
# check nan data
print(f"Number of rows with NaN in 'all_text': {data['all_text'].isna().sum()}")

In [None]:
# Define a minimum frequency threshold for labels
min_label_freq = 5  

# Count the total number of unique labels before filtering
total_labels_before = len(label_distribution)

# Get the frequent labels based on the threshold
frequent_labels = [label for label, count in label_distribution.items() if count >= min_label_freq]

# Filter the labels in the data
data['filtered_labels'] = data['labels'].apply(lambda x: [label for label in x if label in frequent_labels] if isinstance(x, list) else [])

# Create a mask for rows with non-empty filtered labels
empty_labels_mask = data['filtered_labels'].apply(len) > 0

# Report on the filtering
labels_removed = total_labels_before - len(frequent_labels)
print(f"Removed {labels_removed} infrequent labels ({labels_removed/total_labels_before*100:.2f}% of labels)")
print(f"Number of labels remaining: {len(frequent_labels)} ({len(frequent_labels)/total_labels_before*100:.2f}% of labels)")
print(f"Number of rows with labels before filtering: {len(data)}")
print(f"Number of rows with labels after filtering: {empty_labels_mask.sum()}")
print(f"Removed {len(data) - empty_labels_mask.sum()} rows due to empty labels")

In [None]:
# Print the label distribution of filtered_labels
filtered_label_distribution = Counter([label for labels in data['filtered_labels'] for label in labels])
print('\nFiltered Label Distribution:')
for i, (label, count) in enumerate(sorted(filtered_label_distribution.items(), key=lambda x: x[1], reverse=True)):
    print(f'{i}. {label}: {count}')

In [None]:
# Calculate the distribution of label count per row
label_count_distribution = Counter([len(labels) for labels in data['filtered_labels']])

# Print the distribution
print('Distribution of labels per row:')
for count in sorted(label_count_distribution.keys()):
    num_rows = label_count_distribution[count]
    percentage = (num_rows / len(data)) * 100
    print(f'{count} label(s): {num_rows} rows ({percentage:.2f}%)')

In [None]:
# Filter data to keep only rows with appropriate number of labels
before_length_filter = len(data)

# Define maximum number of labels per row
max_label_len = 5  

# Create mask for rows with appropriate number of labels
# Keep rows with at least 1 label and no more than max_label_len
length_mask = data['filtered_labels'].apply(lambda x: 0 < len(x) <= max_label_len)

# Apply the mask to the data
data = data[length_mask]

# Calculate statistics
samples_removed_by_length = before_length_filter - len(data)
print(f"Removed {samples_removed_by_length} samples with too many or zero labels "
    f"({samples_removed_by_length/before_length_filter*100:.2f}% of data)")

### Filter the token length

In [None]:
# Calculate token lengths for all samples
token_lengths = data['all_text'].apply(lambda x: len(x.split()))

# Get statistics
mean_length = token_lengths.mean()
std_length = token_lengths.std()
min_length = token_lengths.min()
max_length = token_lengths.max()

# Define upper and lower bounds (3 standard deviations)
upper_bound = mean_length + 3.0 * std_length
lower_bound = max(1, mean_length - 3.0 * std_length)  # Ensure lower bound is at least 1

print(f"Token length statistics:")
print(f"Mean: {mean_length:.2f}")
print(f"Standard deviation: {std_length:.2f}")
print(f"Min: {min_length}")
print(f"Max: {max_length}")
print(f"Lower bound: {lower_bound:.2f}")
print(f"Upper bound: {upper_bound:.2f}")

# Create mask for samples within bounds
before_length_filter = len(data)
std_mask = (token_lengths >= lower_bound) & (token_lengths <= upper_bound)

# Apply the mask to the data
data = data[std_mask].reset_index(drop=True)

# Calculate statistics
samples_removed_by_length = before_length_filter - len(data)
print(f"Removed {samples_removed_by_length} samples with outlier token lengths "
    f"({samples_removed_by_length/before_length_filter*100:.2f}% of data)")

In [None]:
# Calculate token lengths for all samples after filtering
token_lengths = data['all_text'].apply(lambda x: len(x.split()))

# Get statistics
mean_length = token_lengths.mean()
median_length = token_lengths.median()
min_length = token_lengths.min()
max_length = token_lengths.max()

# Create a histogram with KDE
plt.figure(figsize=(12, 6))
sns.histplot(token_lengths, kde=True)
plt.axvline(mean_length, color='red', linestyle='--', label=f'Mean: {mean_length:.2f}')
plt.axvline(median_length, color='green', linestyle='--', label=f'Median: {median_length:.2f}')

# Add labels and title
plt.xlabel('Number of Tokens')
plt.ylabel('Frequency')
plt.title('Token Length Distribution After Filtering')
plt.legend()

# Print statistics
print(f"Token length statistics after filtering:")
print(f"Mean: {mean_length:.2f}")
print(f"Median: {median_length:.2f}")
print(f"Min: {min_length}")
print(f"Max: {max_length}")
print(f"Standard deviation: {token_lengths.std():.2f}")

# Calculate percentiles
percentiles = [10, 25, 50, 75, 90, 95, 99]
for p in percentiles:
    print(f"{p}th percentile: {np.percentile(token_lengths, p):.2f}")

plt.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# For multi-label classification, we need to encode the labels
mlb = MultiLabelBinarizer()
labels_encoded = mlb.fit_transform(data['filtered_labels'])

# Split data into training and validation sets (80% training, 20% validation)
# Instead of splitting just the text and labels, split the entire dataframe
train_df, val_df = train_test_split(
    data, test_size=0.2, random_state=42, stratify=None
)

# Get the texts and encoded labels for the train and validation sets
train_texts = train_df['all_text'].tolist()
val_texts = val_df['all_text'].tolist()

# Encode the labels for train and validation sets
train_labels = mlb.transform(train_df['filtered_labels'])
val_labels = mlb.transform(val_df['filtered_labels'])

print(f"Training samples: {len(train_texts)}, Validation samples: {len(val_texts)}")
print(f"Number of unique labels: {len(mlb.classes_)}")
print(f"Label names: {mlb.classes_}")

In [None]:
# Move filtered_labels to labels column and drop filtered_labels column
data['labels'] = data['filtered_labels']
data = data.drop(columns=['filtered_labels'])

# Display the first few rows to verify the changes
print("Column names after update:", data.columns.tolist())
data[['labels']].head()

In [None]:
# print before filtering vs after filtering
print(f"Original data size: {len(before_nan_filter)}")
print(f"Data size after filtering: {len(data)}")
print(f"Percentage of data kept: {(len(data) / before_nan_filter) * 100:.2f}%")

In [None]:
from collections import Counter
import seaborn as sns
import numpy as np

import matplotlib.pyplot as plt

# Get the final label distribution
final_label_distribution = Counter([label for labels in data['labels'] for label in labels])

# Print label distribution
print('Final Label Distribution:')
for i, (label, count) in enumerate(sorted(final_label_distribution.items(), key=lambda x: x[1], reverse=True)):
    print(f'{i}. {label}: {count} ({count/len(data)*100:.2f}%)')

# Calculate label length distribution
label_length_distribution = Counter([len(labels) for labels in data['labels']])

# Print label length distribution
print('\nFinal Label Length Distribution:')
for length in sorted(label_length_distribution.keys()):
    count = label_length_distribution[length]
    print(f'{length} label(s): {count} rows ({count/len(data)*100:.2f}%)')

# Create visualizations
plt.figure(figsize=(15, 10))

# Plot top 20 labels
plt.subplot(2, 1, 1)
top_labels = dict(sorted(final_label_distribution.items(), key=lambda x: x[1], reverse=True)[:20])
sns.barplot(x=list(top_labels.keys()), y=list(top_labels.values()))
plt.xticks(rotation=45, ha='right')
plt.title('Top 20 Labels Distribution')
plt.ylabel('Count')
plt.tight_layout()

# Plot label length distribution
plt.subplot(2, 1, 2)
sns.barplot(x=list(label_length_distribution.keys()), 
            y=list(label_length_distribution.values()))
plt.title('Label Length Distribution')
plt.xlabel('Number of Labels')
plt.ylabel('Number of Samples')
plt.tight_layout()

plt.subplots_adjust(hspace=0.5)
plt.show()

# Export the Data

In [17]:
data.head()

Unnamed: 0,comments_url,id,title,body,issue_url,pr_url,labels,pr_number,filename,status,additions,deletions,changes,all_comments,all_text,all_text_0.5,all_text_0.6,all_text_0.7,all_text_0.8,all_text_0.9
0,https://api.github.com/repos/kubernetes/kubern...,2639668210,kubelet crash: fatal error: concurrent map writes,### What happened?\n\nWhile looking into three...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/kubelet],128657,"['pkg/kubelet/cm/container_manager_linux.go', ...","['modified', 'modified', 'modified', 'modified']","[3, 4, 18, 75]","[3, 3, 6, 0]","[6, 7, 24, 75]",/sig node Thought to search for similar errors...,kubelet crash : fatal error : concurrent map w...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...,kubelet crash fatal error concurrent map write...
1,https://api.github.com/repos/kubernetes/kubern...,2617512099,[FG:InPlacePodVerticalScaling] failed to verif...,### What happened?\n\nOne line bug description...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/kubelet],126620,"['pkg/kubelet/status/state/checkpoint.go', 'pk...","['modified', 'modified', 'added']","[42, 23, 166]","[26, 17, 0]","[68, 40, 166]",This issue is currently awaiting triage.\n\nIf...,[ fg : inplacepodverticalscale ] fail verify p...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...,fg inplacepodverticalscale fail verify pod sta...
2,https://api.github.com/repos/kubernetes/kubern...,2604613192,Restore build-tag flag for code-generator,### What happened?\n\nThe `build-tag` flag is ...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/code-generation],128259,['staging/src/k8s.io/code-generator/cmd/conver...,"['modified', 'modified', 'modified', 'modified...","[12, 1, 1, 12, 1, 1]","[2, 1, 1, 1, 1, 1]","[14, 2, 2, 13, 2, 2]",@p0lyn0mial FYI /sig api-machinery /triage acc...,restore build - tag flag code - generator # # ...,restore build tag flag code generator build ta...,restore build tag flag code generator build ta...,restore build tag flag code generator build ta...,restore build tag flag code generator build ta...,restore build tag flag code generator happen b...
3,https://api.github.com/repos/kubernetes/kubern...,2596132738,[Failing Tests] ci-crio-cgroupv1-node-e2e-conf...,### Which jobs are failing?\n* master-blocking...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,"[area/kubeadm, area/dependency]",128175,"['go.mod', 'go.sum', 'hack/unwanted-dependenci...","['modified', 'modified', 'modified', 'modified...","[1, 3, 1, 21, 9, 9, 4, 0, 10, 2, 2, 5, 8, 1]","[1, 2, 0, 98, 8, 7, 6, 62, 11, 1, 4, 8, 11, 1]","[2, 5, 1, 119, 17, 16, 10, 62, 21, 3, 6, 13, 1...",@drewhagen: The provided milestone is not vali...,[ fail test ] ci-crio-cgroupv1-node-e2e-confor...,fail impact multiple job job fail master block...,fail impact multiple job job fail master block...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...,fail test impact multiple job job fail master ...
4,https://api.github.com/repos/kubernetes/kubern...,2591358936,Crash on kube manager's service-lb-controller ...,### What happened?\n\nIf kube manager is start...,https://github.com/kubernetes/kubernetes/issue...,https://github.com/kubernetes/kubernetes/pull/...,[area/cloudprovider],128182,['cmd/kube-controller-manager/app/controllerma...,"['modified', 'modified', 'modified']","[26, 12, 5]","[0, 1, 4]","[26, 13, 9]",This issue is currently awaiting triage.\n\nIf...,crash kube manager service - lb - controller v...,crash kube manager service lb controller kube ...,crash kube manager service lb controller kube ...,crash kube manager service lb controller kube ...,crash kube manager service lb controller kube ...,crash kube manager service lb controller happe...


In [None]:
# export cleaned data 
train_df.to_csv('preprocessed_train_data.csv', index=False)
val_df.to_csv('preprocessed_val_data.csv', index=False)
# data.to_json('preprocessed_cleaned_data_with_changed_files.json')