## Setting Up Kaggle API Credentials

In [None]:
import json
import os

from google.colab import userdata

# Read the Kaggle credentials stored under KAGGLE_USER / KAGGLE_KEY
username = userdata.get('KAGGLE_USER')
key = userdata.get('KAGGLE_KEY')

# Write ~/.kaggle/kaggle.json from Python rather than through a shell `echo`,
# so a quote or other shell metacharacter inside a credential cannot corrupt
# the JSON document.
kaggle_dir = os.path.expanduser('~/.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_json_path = os.path.join(kaggle_dir, 'kaggle.json')
with open(kaggle_json_path, 'w') as credentials_file:
    json.dump({'username': username, 'key': key}, credentials_file)

# Restrict the file to its owner (mode 600). The same home-relative path is
# used for writing and for chmod instead of a hard-coded /root location.
os.chmod(kaggle_json_path, 0o600)

In [None]:
# Download dataset
# Fetch the archive of the 'liver-ultrasound-detection' competition with the Kaggle CLI
!kaggle competitions download -c liver-ultrasound-detection
# Extract the archive; because of `&&` the zip is deleted only if unzip succeeded.
# NOTE(review): assumes the download landed in /content (the working directory) — confirm
!unzip /content/liver-ultrasound-detection.zip && rm -rf /content/liver-ultrasound-detection.zip

In [None]:
import os
import pandas as pd
import cv2
import matplotlib.pyplot as plt
from collections import defaultdict

In [None]:
def count_classes_in_folder(annotation_folder):
    """Count how often each class ID occurs across a folder of annotation files.

    Every entry of ``annotation_folder`` whose name ends with ``.txt`` is read
    line by line, and the first whitespace-separated token of each line is
    parsed as an integer class ID. Blank or whitespace-only lines are skipped.

    Args:
        annotation_folder: Path of the directory holding the ``.txt`` files.

    Returns:
        A ``defaultdict(int)`` mapping class ID to the number of lines that
        start with that ID.

    Raises:
        ValueError: If the first token of a non-blank line is not an integer.
    """
    class_counts = defaultdict(int)

    for filename in os.listdir(annotation_folder):
        if not filename.endswith('.txt'):  # only .txt files are annotations
            continue

        file_path = os.path.join(annotation_folder, filename)
        with open(file_path, 'r') as file:
            for line in file:
                fields = line.split()
                if not fields:
                    # Blank line (e.g. a trailing newline): nothing to count
                    continue
                class_counts[int(fields[0])] += 1

    return class_counts

def plot_class_distribution(class_counts, class_names):
    """Show a bar chart with one bar per class ID found in ``class_counts``.

    Args:
        class_counts: Mapping of class ID to its count.
        class_names: Sequence indexed by class ID that supplies the bar labels.
    """
    # Walk the class IDs in ascending order and collect bar heights and labels
    ordered_ids = sorted(class_counts)
    heights = []
    for cid in ordered_ids:
        heights.append(class_counts[cid])
    tick_labels = []
    for cid in ordered_ids:
        tick_labels.append(class_names[cid])

    # Draw the bars and decorate the axes
    plt.figure(figsize=(10, 6))
    bar_container = plt.bar(tick_labels, heights, color='skyblue')
    plt.xlabel('Class Names')
    plt.ylabel('Counts')
    plt.title('Class Distribution')
    plt.xticks(rotation=45)

    # Write each count just above the horizontal centre of its bar
    for rect, value in zip(bar_container, heights):
        x_center = rect.get_x() + rect.get_width() / 2
        plt.text(x_center, rect.get_height(), str(value),
                 ha='center', va='bottom')

    plt.show()



In [None]:
# Example usage: count the class IDs found in the annotation folder and plot them
annotation_folder = '/content/train/train/annotations'
# The list is indexed by class ID: plot_class_distribution looks up class_names[class_id]
class_names = ['FFC', 'FFS', 'HCC', 'cyst', 'hemangioma', 'dysplastic', 'CCA'] # Add your class names here

class_counts = count_classes_in_folder(annotation_folder)
plot_class_distribution(class_counts, class_names)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the data from Excel file
data = pd.read_excel('/content/mapping.xlsx')

# Group by 'Source' and 'Type', and count occurrences
# Result: one row per 'Source', one column per 'Type'; combinations that
# never occur are filled with 0.
class_counts = data.groupby(['Source', 'Type']).size().unstack(fill_value=0)

# Plot the bar chart
# (pandas draws one group of bars per row, i.e. per source, with one bar per type)
class_counts.plot(kind='bar', figsize=(10, 6))
plt.xlabel('Source')
plt.ylabel('Counts')
plt.title('Class Distribution by Source')
plt.xticks(rotation=45)
plt.legend(title='Type')
plt.show()


In [None]:
import os
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict

def count_classes_in_files(annotation_files, annotation_folder):
    """Count how often each class ID occurs in the listed annotation files.

    Each non-missing entry of ``annotation_files`` is joined onto
    ``annotation_folder`` and read. The first whitespace-separated token of
    every non-blank line is parsed as an integer class ID. Missing files are
    reported with a warning and skipped.

    Args:
        annotation_files: Iterable of annotation file names; entries that
            pandas treats as missing (NaN/None) are ignored.
        annotation_folder: Directory that contains the annotation files.

    Returns:
        A ``defaultdict(int)`` mapping class ID to the number of lines that
        start with that ID.

    Raises:
        ValueError: If the first token of a non-blank line is not an integer.
    """
    class_counts = defaultdict(int)

    for annotation_file in annotation_files:
        if pd.isna(annotation_file):
            continue

        # str() because a non-string cell value may come out of the table
        file_path = os.path.join(annotation_folder, str(annotation_file))
        try:
            with open(file_path, 'r') as file:
                lines = file.readlines()
        except FileNotFoundError:
            print(f"Warning: Annotation file '{annotation_file}' not found. Skipping.")
            continue

        for line in lines:
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline): nothing to count
                continue
            class_counts[int(fields[0])] += 1

    return class_counts

def plot_class_distribution(class_counts, class_names):
    """Render the per-class counts as a labelled bar chart.

    Args:
        class_counts: Mapping of class ID to its count.
        class_names: Sequence indexed by class ID that supplies the bar labels.
    """
    # Bars appear in ascending class-ID order
    ids_in_order = sorted(class_counts)
    bar_heights = [class_counts[cid] for cid in ids_in_order]
    bar_labels = [class_names[cid] for cid in ids_in_order]

    plt.figure(figsize=(10, 6))
    drawn_bars = plt.bar(bar_labels, bar_heights, color='skyblue')
    plt.xlabel('Class Names')
    plt.ylabel('Counts')
    plt.title('Class Distribution')
    plt.xticks(rotation=45)

    # Annotate every bar with its count, centred on top of the bar
    for drawn_bar, height in zip(drawn_bars, bar_heights):
        mid_x = drawn_bar.get_x() + drawn_bar.get_width() / 2
        top_y = drawn_bar.get_height()
        plt.text(mid_x, top_y, str(height), ha='center', va='bottom')

    plt.show()

# Read the data from Excel file
data = pd.read_excel('/content/mapping.xlsx')

# Get annotation file names and annotation folder
# (count_classes_in_files skips entries that are NaN)
annotation_files = data['Annotation File'].tolist()
annotation_folder = '/content/train/train/annotations'

# Example usage
# class_names is indexed by class ID when the bar labels are built
class_names = ['FFC', 'FFS', 'HCC', 'cyst', 'hemangioma', 'dysplastic', 'CCA']  # Add your class names here

class_counts = count_classes_in_files(annotation_files, annotation_folder)
plot_class_distribution(class_counts, class_names)


In [None]:
def plot_class_distribution_by_source(data):
    """Show one bar chart per source with the row count of each type.

    Args:
        data: DataFrame with 'Source' and 'Type' columns.
    """
    # Count rows per (Source, Type) pair, then pivot the types into columns
    pair_sizes = data.groupby(['Source', 'Type']).size()
    per_source = pair_sizes.unstack(fill_value=0)

    # Each row of the pivoted table becomes its own figure
    for source_name, type_counts in per_source.iterrows():
        plt.figure(figsize=(8, 6))
        type_counts.plot(kind='bar', color='skyblue')
        plt.xlabel('Class Names')
        plt.ylabel('Counts')
        plt.title(f'Class Distribution for Source: {source_name}')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

In [None]:
# Read the data from Excel file
data = pd.read_excel('/content/mapping.xlsx')

# Plot class distribution for each source
# (one separate figure per distinct 'Source' value)
plot_class_distribution_by_source(data)

In [None]:
import os
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict

def count_classes_in_files(annotation_files, annotation_folder, data):
    """Count annotation lines per ``"<Source>.<Type>"`` label.

    Each non-missing entry of ``annotation_files`` is joined onto
    ``annotation_folder`` and read. Every non-blank line of the file adds one
    to the counter keyed by the 'Source' and 'Type' values of the first row
    of ``data`` whose 'Annotation File' equals that entry. Missing files are
    reported with a warning and skipped.

    Args:
        annotation_files: Iterable of annotation file names; entries that
            pandas treats as missing (NaN/None) are ignored.
        annotation_folder: Directory that contains the annotation files.
        data: DataFrame with 'Annotation File', 'Source' and 'Type' columns.

    Returns:
        A ``defaultdict(int)`` mapping ``"<Source>.<Type>"`` to the number of
        annotation lines found for it.

    Raises:
        IndexError: If a readable file has no matching row in ``data``.
        ValueError: If the first token of a non-blank line is not an integer.
    """
    class_counts = defaultdict(int)

    for annotation_file in annotation_files:
        if pd.isna(annotation_file):
            continue

        # str() because a non-string cell value may come out of the table
        file_path = os.path.join(annotation_folder, str(annotation_file))
        try:
            with open(file_path, 'r') as file:
                lines = file.readlines()
        except FileNotFoundError:
            print(f"Warning: Annotation file '{annotation_file}' not found. Skipping.")
            continue

        # Build the row mask once and reuse it for both column lookups
        is_this_file = data['Annotation File'] == annotation_file
        source = data.loc[is_this_file, 'Source'].iloc[0]
        class_name = data.loc[is_this_file, 'Type'].iloc[0]
        combined_name = f"{source}.{class_name}"

        for line in lines:
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline): nothing to count
                continue
            # The class ID is not part of the label, but parsing it keeps the
            # ValueError on malformed annotation lines.
            int(fields[0])
            class_counts[combined_name] += 1

    return class_counts

def plot_class_distribution(class_counts):
    """Show a bar chart with one bar per key of ``class_counts``.

    Args:
        class_counts: Mapping of label to count; bars follow the mapping's
            iteration order.
    """
    # Split the mapping into parallel label / count lists
    names = []
    totals = []
    for name, total in class_counts.items():
        names.append(name)
        totals.append(total)

    plt.figure(figsize=(12, 6))
    rects = plt.bar(names, totals, color='skyblue')
    plt.xlabel('Class Names')
    plt.ylabel('Counts')
    plt.title('Class Distribution')
    plt.xticks(rotation=45)

    # Put the count above the centre of every bar
    for rect, total in zip(rects, totals):
        centre_x = rect.get_x() + rect.get_width() / 2
        plt.text(centre_x, rect.get_height(), str(total),
                 ha='center', va='bottom')

    plt.show()

# Read the data from Excel file
data = pd.read_excel('/content/mapping.xlsx')

# Get annotation file names and annotation folder
# (count_classes_in_files skips entries that are NaN)
annotation_files = data['Annotation File'].tolist()
annotation_folder = '/content/train/train/annotations'

# Example usage
# The returned counts are keyed by "<Source>.<Type>" strings, which become the bar labels
class_counts = count_classes_in_files(annotation_files, annotation_folder, data)
plot_class_distribution(class_counts)


In [None]:
# Load the mapping table from CSV
df_mapping = pd.read_csv('/content/mapping.csv')

In [None]:
# Show the mapping table as the cell's output
df_mapping

### Mapping each image file to its dataset split

In [None]:
import pandas as pd
import glob
import os
import numpy as np

# Read the CSV file into a pandas dataframe
df = pd.read_csv('/content/mapping.csv')

# Image folders of the three dataset splits
folder_path_train = '/content/train/train/images'
folder_path_val = '/content/val/val/images'
folder_path_test = '/content/test/test/images'

# List every entry of each split folder
files_in_folder_train = glob.glob(os.path.join(folder_path_train, '*'))
files_in_folder_val = glob.glob(os.path.join(folder_path_val, '*'))
files_in_folder_test = glob.glob(os.path.join(folder_path_test, '*'))

# Reduce each listing to a set of basenames once, so that looking up a file
# name is a constant-time membership test instead of a scan over the whole
# listing for every row of the mapping table.
image_names_train = {os.path.basename(file) for file in files_in_folder_train}
image_names_val = {os.path.basename(file) for file in files_in_folder_val}
image_names_test = {os.path.basename(file) for file in files_in_folder_test}


def file_exists_in_folder_train(filename):
    """Return True if an entry named ``filename`` is in the train image folder."""
    return filename in image_names_train


def file_exists_in_folder_val(filename):
    """Return True if an entry named ``filename`` is in the val image folder."""
    return filename in image_names_val


def file_exists_in_folder_test(filename):
    """Return True if an entry named ``filename`` is in the test image folder."""
    return filename in image_names_test


# Label every row with the split whose image folder contains its 'Image File'.
# The folders are checked in the order train, val, test; a row whose image is
# in none of them gets None.
paths = []

for file_name in df['Image File']:
    if file_exists_in_folder_train(file_name):
        paths.append('Train')
    elif file_exists_in_folder_val(file_name):
        paths.append('Val')
    elif file_exists_in_folder_test(file_name):
        paths.append('Test')
    else:
        paths.append(None)

# Store the split label next to each row
df['PATH'] = paths


In [None]:
# Show the split label ('Train', 'Val', 'Test' or None) assigned to each row
df['PATH']

In [None]:
# 'PATH' already holds the split label ('Train', 'Val', 'Test' or None) for
# every row, so the column needs no further update here. The statement that
# used to sit in this cell assigned the non-null 'PATH' values to themselves.

# Display the dataframe
print(df)

In [None]:
# Save the table, including the 'PATH' column, to mapping2.csv without the index column
df.to_csv('mapping2.csv',index=False)