In [None]:
import os
import pandas as pd


# Folder scanned for the '*_27.csv' and '*_79.csv' input files.
input_directory  = r"C:\Users\maste\Downloads\dataloader_v60.0.2\output\test_casenumber - Copy"
# Folder whose CSV files supply the template ("dummy") row.
dummy_directory = r"D:\output"
# Folder that receives the chunked output files; created below if it does not exist.
output_directory = r"C:\Users\maste\Downloads\dataloader_v60.0.2\output\test_casenumber - Copy\output1"

# Define the delimiter
delimiter = ','  # used for every CSV read and write in this cell


os.makedirs(output_directory, exist_ok=True)


def read_dummy_files(dummy_directory, delimiter):
    """Return the first data row found in a CSV under ``dummy_directory``.

    Files are visited in sorted name order so the result does not depend on
    the platform's directory listing order. Files that do not end in ``.csv``
    and CSV files without any data row are skipped. Reading stops at the first
    usable file instead of parsing every CSV in the folder.

    Args:
        dummy_directory: Folder containing the template CSV file(s).
        delimiter: Field separator passed to ``pandas.read_csv``.

    Returns:
        A dict mapping column name to the value in the first row, or ``None``
        when no CSV with at least one data row exists.
    """
    for name in sorted(os.listdir(dummy_directory)):
        if not name.endswith('.csv'):
            continue
        df = pd.read_csv(os.path.join(dummy_directory, name), delimiter=delimiter)
        if df.empty:
            # Header-only file: nothing to use as a template, try the next one.
            continue
        return df.iloc[0].to_dict()
    return None


# Template row taken from a CSV in dummy_directory; process_files copies it for every missing Ticket_ID.
dummy_row = read_dummy_files(dummy_directory, delimiter)


def generate_legacy_ticket_id(ticket_id):
    """Rebuild a ticket id as ``'TTB'`` plus its zero-padded numeric part.

    The first three characters of ``ticket_id`` are discarded, the remainder
    is parsed as an integer and padded back to its original width.

    Raises:
        ValueError: If the text after the first three characters is not an integer.
    """
    digits = ticket_id[3:]
    width = len(digits)
    number = int(digits)
    return 'TTB' + str(number).zfill(width)


# Per-suffix state: every distinct Ticket_ID seen, and the files they were read from.
all_ticket_ids_27 = set()
all_ticket_ids_79 = set()
csv_files_27 = []
csv_files_79 = []

# Route each file-name suffix to the (file list, id set) pair it feeds.
_suffix_targets = (
    ('_27.csv', csv_files_27, all_ticket_ids_27),
    ('_79.csv', csv_files_79, all_ticket_ids_79),
)

for entry in os.listdir(input_directory):
    for suffix, matched_files, seen_ids in _suffix_targets:
        if not entry.endswith(suffix):
            continue
        matched_files.append(entry)
        # Only the Ticket_ID column is needed for this first pass.
        id_column = pd.read_csv(os.path.join(input_directory, entry), usecols=['Ticket_ID'], delimiter=delimiter)
        seen_ids.update(id_column['Ticket_ID'].unique())
        break

def _strip_float_suffix(value):
    """Render a float as text without a trailing ``.0``; pass anything else through."""
    if not isinstance(value, float) or value != value:
        # Non-floats are untouched; NaN stays NaN so the later fillna('') can blank it.
        return value
    text = str(value)
    # Only a *trailing* '.0' is dropped, so 10.05 stays '10.05' instead of becoming '105'.
    return text[:-2] if text.endswith('.0') else text


def process_files(csv_files, all_ticket_ids, file_suffix, chunk_size=300000):
    """Fill gaps in the Ticket_ID sequence and write the result in chunks.

    Reads every file in ``csv_files`` from the module-level ``input_directory``,
    adds one placeholder row (a copy of the module-level ``dummy_row``) for each
    Ticket_ID missing between the smallest and largest known id, sorts by
    ``Ticket_ID``, sets ``Legacy_ticket_id`` equal to ``Ticket_ID``, removes the
    trailing ``.0`` from float values (except in the ``nominal`` column), blanks
    missing values and writes ``<last input name>_part<N>.csv`` files to the
    module-level ``output_directory``.

    Args:
        csv_files: File names (relative to ``input_directory``) to combine.
        all_ticket_ids: Collection of Ticket_ID strings present in those files.
        file_suffix: Label used only in the progress messages.
        chunk_size: Maximum number of rows per output file.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ids are missing
            but no dummy row is available to build placeholders from.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not csv_files or not all_ticket_ids:
        print(f"No input files or Ticket_IDs for suffix '{file_suffix}'; nothing to process.")
        return

    # Generate the full range of Ticket_IDs.
    # NOTE(review): min/max come from a lexicographic sort, which matches numeric
    # order only when all ids share the same zero-padded width - confirm for new data.
    known_ids = sorted(all_ticket_ids)
    width = len(known_ids[0]) - 3
    min_id = int(known_ids[0][3:])
    max_id = int(known_ids[-1][3:])
    full_range_ids = [f'TTB{str(i).zfill(width)}' for i in range(min_id, max_id + 1)]
    missing_ids = set(full_range_ids) - set(known_ids)

    # Read all inputs first and concatenate once (concat inside a loop is quadratic).
    frames = [
        pd.read_csv(os.path.join(input_directory, name), delimiter=delimiter)
        for name in csv_files
    ]

    if missing_ids:
        if dummy_row is None:
            raise ValueError("Ticket_IDs are missing but no dummy row is available to fill them")
        missing_rows = []
        for ticket_id in missing_ids:
            placeholder_row = dummy_row.copy()
            placeholder_row['Ticket_ID'] = ticket_id
            missing_rows.append(placeholder_row)
        frames.append(pd.DataFrame(missing_rows))

    combined_df = pd.concat(frames, ignore_index=True).sort_values(by='Ticket_ID')

    # Applies to real and placeholder rows alike.
    combined_df['Legacy_ticket_id'] = combined_df['Ticket_ID']

    # Remove .0 from all values except nominal column
    for col in combined_df.columns:
        if col != 'nominal':
            combined_df[col] = combined_df[col].apply(_strip_float_suffix)

    combined_df = combined_df.fillna('')

    # Ceiling division: an exact multiple of chunk_size must not yield an empty extra file.
    num_chunks = max(1, -(-len(combined_df) // chunk_size))
    # Output files are named after the last input file.
    base_name = os.path.basename(csv_files[-1]).split(".csv")[0]

    for i in range(num_chunks):
        chunk_df = combined_df.iloc[i * chunk_size: (i + 1) * chunk_size]
        output_filename = os.path.join(output_directory, f'{base_name}_part{i + 1}.csv')
        chunk_df.to_csv(output_filename, index=False, sep=delimiter)

    print(f"Processing for files with suffix '{file_suffix}' completed.")


# Run the gap-filling step once per file suffix.
for _suffix, _files, _ids in (
    ('27', csv_files_27, all_ticket_ids_27),
    ('79', csv_files_79, all_ticket_ids_79),
):
    process_files(_files, _ids, _suffix)


def _part_sort_key(name):
    """Order output files by their second '_'-separated token, then by part number."""
    part_number = int(name.split('part')[1].split('.')[0])
    return name.split('_')[1], part_number


output_files = sorted(os.listdir(output_directory), key=_part_sort_key)

# Read every generated part back in, in order.
for part_name in output_files:
    df = pd.read_csv(os.path.join(output_directory, part_name), delimiter=delimiter)
    print(f"Reading file: {part_name}")
    # Process the DataFrame as needed


In [None]:
import os
import pandas as pd

def clean_csv_files(input_folder, output_folder):
    """Clean every ``.csv`` file in ``input_folder`` and save it to ``output_folder``.

    For each file: drop an existing ``Legacy_Ticket_ID`` column, rename
    ``Legacy_ticket_id`` to ``Legacy_Ticket_ID``, replace missing values with
    empty strings, convert every value to text and strip a trailing ``.0``
    from float values. The cleaned file keeps the input file name.

    Args:
        input_folder: Folder containing the CSV files to clean.
        output_folder: Folder that receives the cleaned files; created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)

    def _as_clean_text(value):
        """Stringify ``value``; floats lose a trailing '.0' (and only a trailing one)."""
        text = str(value)
        if isinstance(value, float) and text.endswith('.0'):
            # '7.0' -> '7', while '10.05' is left intact (a blanket replace gave '105').
            return text[:-2]
        return text

    # DataFrame.map replaces the deprecated applymap on newer pandas; fall back on older ones.
    elementwise = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap

    # Loop through all files in the input folder
    for filename in os.listdir(input_folder):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(input_folder, filename)
        data = pd.read_csv(file_path)

        # Remove the 'Legacy_Ticket_ID' column if it exists
        if 'Legacy_Ticket_ID' in data.columns:
            data = data.drop(columns=['Legacy_Ticket_ID'])

        # Rename the 'Legacy_ticket_id' column to 'Legacy_Ticket_ID' if it exists
        if 'Legacy_ticket_id' in data.columns:
            data = data.rename(columns={'Legacy_ticket_id': 'Legacy_Ticket_ID'})

        # Replace all NaN values with empty strings
        data = data.fillna('')

        # Remove '.0' at the end of all values
        data = elementwise(data, _as_clean_text)

        # Save the cleaned data to the output folder
        cleaned_file_path = os.path.join(output_folder, filename)
        data.to_csv(cleaned_file_path, index=False)
        print(f"Processed and saved: {cleaned_file_path}")

# Define the input and output folders.
# input_folder is the same path as output_directory in the first cell, so this
# step cleans the chunk files written there.
input_folder = r"C:\Users\maste\Downloads\dataloader_v60.0.2\output\test_casenumber - Copy\output1"
output_folder = r"D:\cleaned"

# Call the function to clean all CSV files; cleaned copies keep their file names.
clean_csv_files(input_folder, output_folder)

# Cleanse the Details column

In [None]:
import pandas as pd

def process_csv_data(file_path):
    """Group the lines of a details export into ``(ticket_id, details)`` pairs.

    A line starts a new record when it begins with ``'TTB'`` and has at least
    four comma-separated fields; the first field is the ticket id and
    everything after the third comma is the first line of the details text.
    Every other line is treated as a continuation of the current record.
    Lines are stripped and joined with newlines. Byte-order marks are removed
    from every line.

    Lines that begin with ``'TTB'`` but have fewer than four fields are kept
    as continuation text; they are not dropped and do not split the record.

    Args:
        file_path: Path of the UTF-8 encoded file to parse.

    Returns:
        A list of ``(ticket_id, details)`` tuples in file order. Text that
        appears before the first record (for example a header line) is
        returned as an entry whose ticket id is ``None``.
    """
    entries = []
    current_entry = []
    current_ticket_id = None

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            # Remove BOM from each line
            line = raw_line.replace('\ufeff', '')

            parts = line.split(',', 3) if line.startswith('TTB') else []
            if len(parts) > 3:
                # New record: flush whatever was collected for the previous one.
                if current_entry:
                    entries.append((current_ticket_id, '\n'.join(current_entry)))
                current_ticket_id = parts[0]
                current_entry = [parts[3].strip()]
            else:
                current_entry.append(line.strip())

    if current_entry:
        entries.append((current_ticket_id, '\n'.join(current_entry)))

    return entries

file_path = r"C:\Users\maste\Downloads\bricare_case_januari2023_2_details.csv"
processed_data = process_csv_data(file_path)

df_final = pd.DataFrame(processed_data, columns=['Ticket ID', 'Details'])

# If the first Details value begins with its own Ticket ID, cut that prefix
# together with the two characters that follow it.
first_ticket = df_final.iloc[0]['Ticket ID']
first_details = df_final.iloc[0]['Details']
if first_ticket and first_details.startswith(first_ticket):
    df_final.at[0, 'Details'] = first_details[len(first_ticket) + 2:]

# Keep only the first ten records for the sample export.
df_final = df_final.head(10)
output_path = "details_20230101_20230101.csv"
df_final.to_csv(output_path, index=False)

print(f"Processed data saved to {output_path}")


## Merge the details into the main file

In [None]:
import pandas as pd

# Just take 10 lines as an example.
# NOTE: this overwrites the source file in place, keeping only its first 10 rows.
path=r"D:\dataquality\bricare_20230101_20230101.csv"
df=pd.read_csv(path)
df.iloc[:10].to_csv(path,index=False)

In [None]:
import pandas as pd


# Inputs: the main export and the per-ticket details file; output: the merged export.
file_path_1 = r"D:\dataquality2\bricare_uat20230101_20230101.csv"
file_path_2 = r"D:\dataquality2\details_uat_20230101_20230101.csv"
output_file_path = r"D:\dataquality2\bricare_uat_20230101_20230101.csv"

df_tenline_bricare = pd.read_csv(file_path_1)
df_detail_bricare_10line = pd.read_csv(file_path_2)

# Rename the two details columns so the join key is called 'Ticket_ID' on both sides.
df_detail_bricare_10line.columns = ['Ticket_ID', 'Details']

# Left join: every row of the main export is kept, with Details attached where available.
merged_df = df_tenline_bricare.merge(df_detail_bricare_10line, on='Ticket_ID', how='left')

# Place 'Details' at position 3 (fourth column), keeping the other columns in order.
column_to_move = "Details"
other_columns = [col for col in merged_df if col != column_to_move]
merged_df = merged_df[other_columns[:3] + [column_to_move] + other_columns[3:]]

merged_df.to_csv(output_file_path, index=False)

In [None]:
import os
import pandas as pd

def read_and_save_csv_files(source_directory, destination_directory, summary_file_path):
    """Re-save every CSV from one folder into another and write a row-count summary.

    Each ``.csv`` file in ``source_directory`` is loaded with pandas and written
    to ``destination_directory`` under the same name (without the index), so the
    copy is a re-serialised version of the original rather than a byte copy.
    A summary with one row per file (``Filename``, ``Total Rows``) is written
    to ``summary_file_path`` as an Excel workbook.

    Args:
        source_directory: Folder containing the CSV files to read.
        destination_directory: Folder that receives the copies; created if missing.
        summary_file_path: Path of the Excel summary file to write.
    """
    os.makedirs(destination_directory, exist_ok=True)

    # Filter for CSV files; sorted so the summary order is deterministic.
    csv_files = sorted(name for name in os.listdir(source_directory) if name.endswith('.csv'))

    summary_data = []

    for csv_file in csv_files:
        source_file_path = os.path.join(source_directory, csv_file)
        df = pd.read_csv(source_file_path)

        destination_file_path = os.path.join(destination_directory, csv_file)
        df.to_csv(destination_file_path, index=False)
        print(f"Saved {csv_file} to {destination_file_path}")

        summary_data.append({'Filename': csv_file, 'Total Rows': len(df)})

    # Explicit columns keep the header row even when no CSV file was found.
    summary_df = pd.DataFrame(summary_data, columns=['Filename', 'Total Rows'])

    summary_df.to_excel(summary_file_path, index=False)
    print(f"Summary file saved to {summary_file_path}")


# TODO: fill in these three paths before running this cell.
source_directory = ""       # folder containing the CSV files to copy
destination_directory = ""  # folder that receives the copies
summary_file_path = ""      # Excel file that receives the per-file row counts

if source_directory and destination_directory and summary_file_path:
    read_and_save_csv_files(source_directory, destination_directory, summary_file_path)
else:
    print("Set source_directory, destination_directory and summary_file_path before running this cell.")
