In [1]:
#!/usr/bin/env python
import pandas as pd
import os

def create_mini_tsv(input_file, output_file, fraction=0.1, header=0):
    """
    Write the leading fraction of a TSV file's rows to a new TSV file.

    The whole input is loaded with pandas, the first ``int(total_rows * fraction)``
    data rows are kept (the product is truncated toward zero), and the result is
    written tab-separated without the index column. A one-line summary is printed.

    Parameters:
      input_file (str): Path to the original TSV file.
      output_file (str): Path for the mini TSV file to be created.
      fraction (float): Fraction of the rows to keep, between 0 and 1 inclusive
        (default is 0.1 for 10%).
      header (int or None): Forwarded to ``pandas.read_csv``. The default 0 treats
        the first line as column names: it is not counted as a data row and is
        written back as the first line of the output. Pass None for a file that
        has no header line, so that the first line is kept as data and no header
        line is written.

    Raises:
      ValueError: If ``fraction`` is outside [0, 1] (NaN included).
    """
    # Reject out-of-range fractions up front: a negative value would make the
    # slice below drop rows from the END of the file, and a value above 1 would
    # report more saved rows than the file contains.
    if not 0.0 <= fraction <= 1.0:
        raise ValueError(
            f"fraction must be between 0 and 1 inclusive, got {fraction!r}"
        )

    # Read the TSV file.
    # NOTE(review): with the default header=0 a headerless input loses its first
    # record to the column names -- confirm whether the dataset has a header line.
    data = pd.read_csv(input_file, sep="\t", header=header)
    total_rows = len(data)

    # Number of leading rows to keep; int() truncates, so e.g. 999 * 0.1 -> 99.
    n_rows = int(total_rows * fraction)
    mini_data = data.iloc[:n_rows, :]

    # Save the mini dataset as a TSV file; only emit a header line when the
    # input was read with one, otherwise pandas' integer column labels would be
    # written as a spurious first line.
    mini_data.to_csv(output_file, sep="\t", index=False, header=(header is not None))
    print(f"Saved {n_rows} rows out of {total_rows} to {output_file}")

if __name__ == '__main__':
    # Entry point for this cell: build a 10% subsample of the Wafer training
    # split. The recorded cell output shows this branch running when the cell
    # is executed.
    # Define input and output file names (absolute paths; edit both to target
    # another dataset or machine).
    input_file = "/home/workspaces/polimikel/data/UCR_dataset/Wafer/Wafer_TRAIN.tsv"         # Source TSV; change this if needed
    output_file = "/home/workspaces/polimikel/data/UCR_dataset/Wafer/Wafer_TRAIN_mini.tsv"     # Destination TSV for the subsample
    create_mini_tsv(input_file, output_file, fraction=0.1)


Saved 99 rows out of 999 to /home/workspaces/polimikel/data/UCR_dataset/Wafer/Wafer_TRAIN_mini.tsv


In [2]:
#!/usr/bin/env python
import os
import pandas as pd

def analyze_tsv(file_path):
    """
    Report the dimensions of a tab-separated file.

    The file is parsed with pandas using its first line as the header, so that
    line is not counted as a row.

    Parameters:
      file_path (str): Path to the TSV file.

    Returns:
      tuple or None: ``(number of rows, number of columns)`` on success. If
      loading fails for any reason, the error is printed and None is returned
      instead of raising.
    """
    try:
        dimensions = pd.read_csv(file_path, header=0, sep="\t").shape
    except Exception as err:
        # Best-effort by design: report the problem and let the caller decide.
        print(f"Error loading {file_path}: {err}")
        dimensions = None
    return dimensions

def main():
    """
    Print the dimensions of the full and the mini Wafer training TSV files.

    The two paths are handled in order, full file first. A path that does not
    exist is reported as not found. An existing file is loaded through
    analyze_tsv and its row and column counts are printed; when analyze_tsv
    returns a falsy result, no dimensions line is printed for that file.
    """
    # File paths to inspect (adjust if needed): full split first, then the mini one.
    tsv_paths = (
        "/home/workspaces/polimikel/data/UCR_dataset/Wafer/Wafer_TRAIN.tsv",
        "/home/workspaces/polimikel/data/UCR_dataset/Wafer/Wafer_TRAIN_mini.tsv",
    )

    for tsv_path in tsv_paths:
        if not os.path.exists(tsv_path):
            print(f"File not found: {tsv_path}")
            continue

        shape = analyze_tsv(tsv_path)
        if shape:
            print(f"File '{tsv_path}' dimensions: {shape[0]} rows, {shape[1]} columns")

# Entry point for this cell: report the dimensions of both TSV files.
if __name__ == "__main__":
    main()


File '/home/workspaces/polimikel/data/UCR_dataset/Wafer/Wafer_TRAIN.tsv' dimensions: 999 rows, 153 columns
File '/home/workspaces/polimikel/data/UCR_dataset/Wafer/Wafer_TRAIN_mini.tsv' dimensions: 99 rows, 153 columns
