In [None]:
import os
import pandas as pd

# Field separator used for every CSV read and written by this cell
# (switch to ';' etc. if the files are not comma-separated).
delimiter = ','

# Folder holding the source CSV files whose Ticket_ID gaps are filled.
input_directory  = r"C:\Users\maste\Downloads\dataloader_v60.0.2\output\test_casenumber - Copy"
# Folder holding the CSV file(s) that supply the placeholder (dummy) row.
dummy_directory = r"D:\output"
# Folder receiving the combined, chunked output files.
output_directory = r"C:\Users\maste\Downloads\dataloader_v60.0.2\output\test_casenumber - Copy\output1"

# Make sure the output folder is there before anything is written to it.
os.makedirs(output_directory, exist_ok=True)

# Function to fetch the placeholder row from the dummy files
def read_dummy_files(dummy_directory, delimiter):
    """Return the first data row found in the dummy CSV files as a dict.

    Files whose name ends with '.csv' are visited in sorted name order so the
    result is reproducible. Only the first row of each file is parsed, and the
    search stops at the first file that actually contains a data row.

    Args:
        dummy_directory: Folder that contains the dummy CSV file(s).
        delimiter: Field separator passed to ``pd.read_csv``.

    Returns:
        A ``{column name: value}`` dict for the first data row, or ``None``
        when no CSV file with at least one data row exists.
    """
    for name in sorted(os.listdir(dummy_directory)):
        if not name.endswith('.csv'):
            continue
        try:
            # nrows=1: only the first row is ever used, so do not parse the
            # rest of the file. Value types are inferred from that row alone.
            first_rows = pd.read_csv(
                os.path.join(dummy_directory, name),
                delimiter=delimiter,
                nrows=1,
            )
        except pd.errors.EmptyDataError:
            # Zero-byte file: nothing to take a row from, try the next one.
            continue
        if not first_rows.empty:
            return first_rows.iloc[0].to_dict()
        # Header-only file: keep looking instead of failing on iloc[0].
    return None

# Load the placeholder row that is later copied for every missing Ticket_ID
dummy_row = read_dummy_files(
    dummy_directory=dummy_directory,
    delimiter=delimiter,
)

# Build the legacy ticket ID that corresponds to a ticket ID
def generate_legacy_ticket_id(ticket_id):
    """Return 'TTB' followed by the zero-padded number taken from ``ticket_id``.

    The first three characters of ``ticket_id`` are treated as a prefix and
    discarded; the remainder is parsed as an integer and padded with leading
    zeros back to its original width.
    """
    prefix_length = 3
    digits_width = len(ticket_id) - prefix_length
    number = int(ticket_id[prefix_length:])
    padded_number = str(number).zfill(digits_width)
    return 'TTB' + padded_number

# Collect all Ticket_IDs from existing files, grouped by filename marker
all_ticket_ids_27 = set()
all_ticket_ids_79 = set()
csv_files_27 = []
csv_files_79 = []

# (marker, file list, ID set) triples; order matters because a filename that
# contains both markers is assigned to the first matching group only.
_ticket_groups = (
    ('_27', csv_files_27, all_ticket_ids_27),
    ('_79', csv_files_79, all_ticket_ids_79),
)

# Sorted listing keeps the file order (and therefore the "last file" that
# process_files names its output after) the same on every run.
for file in sorted(os.listdir(input_directory)):
    if not file.endswith('.csv'):
        continue
    for _marker, _group_files, _group_ids in _ticket_groups:
        if _marker not in file:
            continue
        _group_files.append(file)
        _ticket_column = pd.read_csv(
            os.path.join(input_directory, file),
            usecols=['Ticket_ID'],
            delimiter=delimiter,
        )['Ticket_ID']
        # Blank Ticket_ID cells are read as NaN; drop them so the ID sets hold
        # only real IDs and can be sorted and sliced later without errors.
        _group_ids.update(_ticket_column.dropna().unique())
        break

def process_files(csv_files, all_ticket_ids, file_suffix, *, input_dir=None,
                  output_dir=None, sep=None, placeholder_row=None,
                  chunk_size=300000, nominal_column='nominal'):
    """Combine CSV files, fill gaps in the Ticket_ID sequence and write chunks.

    Steps:
      1. Build every 'TTB<number>' ID between the smallest and largest known
         ID and determine which of them are missing.
      2. Create one placeholder row per missing ID from ``placeholder_row``.
      3. Concatenate all input files plus the placeholder rows, sort by
         'Ticket_ID' and set 'Legacy_ticket_id' equal to 'Ticket_ID'.
      4. Render whole-number floats without the trailing '.0' (every column
         except ``nominal_column``) and replace NaN with empty strings.
      5. Write the result as '<last input file name>_part<N>.csv' files of at
         most ``chunk_size`` rows each.

    Ticket IDs are assumed to be strings with a three-character prefix and a
    fixed-width numeric tail (e.g. 'TTB0001234'); the smallest/largest ID is
    taken from the lexicographic sort, which is only correct for equal widths.

    Args:
        csv_files: Names of the input CSV files (relative to ``input_dir``).
        all_ticket_ids: Iterable of Ticket_ID strings present in those files.
        file_suffix: Label used only in the progress messages.
        input_dir: Folder with the input files; defaults to the module-level
            ``input_directory``.
        output_dir: Folder for the output files; defaults to the module-level
            ``output_directory``.
        sep: CSV delimiter; defaults to the module-level ``delimiter``.
        placeholder_row: Dict used as template for missing IDs; defaults to
            the module-level ``dummy_row``.
        chunk_size: Maximum number of rows per output file.
        nominal_column: Column that is left untouched by the '.0' clean-up.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1, or if IDs are missing
            but no placeholder row is available.
    """
    # Fall back to the notebook-level configuration for anything not passed in.
    input_dir = input_directory if input_dir is None else input_dir
    output_dir = output_directory if output_dir is None else output_dir
    sep = delimiter if sep is None else sep
    placeholder_row = dummy_row if placeholder_row is None else placeholder_row

    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    csv_files = list(csv_files)
    known_ids = sorted(all_ticket_ids)
    if not csv_files or not known_ids:
        # Nothing to combine; previously this crashed with an IndexError.
        print(f"No files or Ticket_IDs for suffix '{file_suffix}'; nothing to process.")
        return

    # Generate the full range of Ticket_IDs and find the ones not present.
    digits_width = len(known_ids[0]) - 3
    min_id = int(known_ids[0][3:])
    max_id = int(known_ids[-1][3:])
    present_ids = set(known_ids)
    missing_ids = [
        candidate
        for candidate in (
            f'TTB{str(number).zfill(digits_width)}'
            for number in range(min_id, max_id + 1)
        )
        if candidate not in present_ids
    ]

    # Read every input once and concatenate in a single call (no quadratic
    # re-copying of an ever-growing frame).
    frames = [
        pd.read_csv(os.path.join(input_dir, name), delimiter=sep)
        for name in csv_files
    ]
    if missing_ids:
        if placeholder_row is None:
            raise ValueError(
                "Ticket_IDs are missing but no placeholder (dummy) row is available"
            )
        frames.append(pd.DataFrame([
            {**placeholder_row, 'Ticket_ID': ticket_id, 'Legacy_ticket_id': ticket_id}
            for ticket_id in missing_ids
        ]))

    combined_df = pd.concat(frames, ignore_index=True).sort_values(by='Ticket_ID')

    # Ensure all Legacy_ticket_id values match Ticket_ID
    combined_df['Legacy_ticket_id'] = combined_df['Ticket_ID']

    def _format_float(value):
        # Only whole-number floats lose their '.0'; other floats keep their
        # digits (a plain str.replace('.0', '') would turn 10.05 into '105').
        # NaN is passed through so the fillna below can blank it.
        if isinstance(value, float) and not pd.isna(value):
            return str(int(value)) if value.is_integer() else str(value)
        return value

    for col in combined_df.columns:
        if col != nominal_column:
            combined_df[col] = combined_df[col].apply(_format_float)

    # Replace NaN with empty strings
    combined_df = combined_df.fillna('')

    # Output files are named after the last input file of the group.
    base_name = os.path.basename(csv_files[-1]).split(".csv")[0]

    # Stepping by chunk_size yields exactly ceil(rows / chunk_size) files, so
    # no header-only file is written when the row count is an exact multiple.
    for part_number, start in enumerate(range(0, len(combined_df), chunk_size), start=1):
        chunk_df = combined_df.iloc[start:start + chunk_size]
        output_filename = os.path.join(output_dir, f'{base_name}_part{part_number}.csv')
        chunk_df.to_csv(output_filename, index=False, sep=sep)

    print(f"Processing for files with suffix '{file_suffix}' completed.")

# Process the files for both suffixes
process_files(csv_files_27, all_ticket_ids_27, '27')
process_files(csv_files_79, all_ticket_ids_79, '79')


def _part_sort_key(name):
    """Return (group, part number) for a '*_part<N>.csv' name, else None.

    The group is the second '_'-separated piece of the file name and the part
    number is the integer after the last '_part'. Names that do not follow
    this pattern yield None so they can be skipped instead of raising.
    """
    if not name.endswith('.csv'):
        return None
    _stem, marker, part = name[:-len('.csv')].rpartition('_part')
    if not marker or not part.isdecimal():
        return None
    return (name.split('_')[1], int(part))


# To read the files in order, sort the chunk files by group and part number;
# anything else that happens to live in the output folder is ignored.
_part_keys = {name: _part_sort_key(name) for name in os.listdir(output_directory)}
output_files = sorted(
    (name for name, key in _part_keys.items() if key is not None),
    key=_part_keys.get,
)

for file in output_files:
    df = pd.read_csv(os.path.join(output_directory, file), delimiter=delimiter)
    print(f"Reading file: {file}")
    # Process the DataFrame as needed


In [None]:
import os
import pandas as pd

def clean_csv_files(input_folder, output_folder):
    """Clean every '.csv' file in ``input_folder`` and save it to ``output_folder``.

    For each file:
      * an existing 'Legacy_Ticket_ID' column is dropped,
      * 'Legacy_ticket_id' (if present) is renamed to 'Legacy_Ticket_ID',
      * NaN values become empty strings,
      * every value is converted to text, with whole-number floats written
        without the trailing '.0' (other floats keep all their digits).

    The cleaned file is written under the same file name, without the index.

    Args:
        input_folder: Folder containing the CSV files to clean.
        output_folder: Folder receiving the cleaned files; created if missing.
    """
    # Create the output folder if it does not exist
    os.makedirs(output_folder, exist_ok=True)

    def _as_clean_text(value):
        # Strip '.0' only from whole-number floats; a blanket
        # str.replace('.0', '') would also mangle values such as 10.05.
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    # DataFrame.map replaced the deprecated DataFrame.applymap in newer
    # pandas releases; use whichever one this installation provides.
    elementwise = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap

    # Loop through all CSV files in the input folder (sorted for stable logs)
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(input_folder, filename)
        data = pd.read_csv(file_path)

        # Remove the 'Legacy_Ticket_ID' column if it exists
        if 'Legacy_Ticket_ID' in data.columns:
            data = data.drop(columns=['Legacy_Ticket_ID'])

        # Rename the 'Legacy_ticket_id' column to 'Legacy_Ticket_ID' if it exists
        if 'Legacy_ticket_id' in data.columns:
            data = data.rename(columns={'Legacy_ticket_id': 'Legacy_Ticket_ID'})

        # Blank out NaN first so it is not rendered as the text 'nan' below
        data = data.fillna('')
        data = elementwise(data, _as_clean_text)

        # Save the cleaned data to the output folder
        cleaned_file_path = os.path.join(output_folder, filename)
        data.to_csv(cleaned_file_path, index=False)
        print(f"Processed and saved: {cleaned_file_path}")

# Source folder: the chunked files written by the previous cell
input_folder = r"C:\Users\maste\Downloads\dataloader_v60.0.2\output\test_casenumber - Copy\output1"
# Destination folder for the cleaned copies
output_folder = r"D:\cleaned"

# Clean every CSV file found in the source folder
clean_csv_files(input_folder=input_folder, output_folder=output_folder)

In [None]:
# to check number of rows
import os
import pandas as pd

# Define the folder containing the CSV files
folder_path = 'path/to/your/folder'

# Column names of the report, fixed up front so the sheet keeps its header
# even when the folder contains no CSV files.
report_columns = ['Filename', 'Number of Rows']

# Create a list to store the file details
file_details = []

# Iterate over each CSV file in the folder (sorted so the report order is stable)
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)
        # Read the CSV file
        df = pd.read_csv(file_path)
        # Get the number of rows (excluding header)
        num_rows = len(df)
        # Append the details to the list
        file_details.append({'Filename': filename, 'Number of Rows': num_rows})

# Create a DataFrame from the list of file details
file_details_df = pd.DataFrame(file_details, columns=report_columns)

# Save the DataFrame to an Excel file
output_file_path = 'file_details.xlsx'
file_details_df.to_excel(output_file_path, index=False)

print(f'File details saved to {output_file_path}')
