# Case 0

In [1]:
import os
import pandas as pd

# Source folder holding the exported CSV files, and the folder that receives
# the gap-filled output (a sub-folder of the source folder).
# NOTE(review): both are absolute paths on one user's machine — adjust before running elsewhere.
input_directory = r"C:\Users\maste\Downloads\dataloader_v60.0.2\output\test_casenumber - Copy"
output_directory = r"C:\Users\maste\Downloads\dataloader_v60.0.2\output\test_casenumber - Copy\output1"

# Field separator passed to every read_csv / to_csv call in this cell
delimiter = ','  # Change this to the appropriate delimiter if necessary (e.g., ';' for semicolon)

# Create the output directory up front; exist_ok=True makes re-running the cell harmless
os.makedirs(output_directory, exist_ok=True)

# Build the legacy ticket ID that corresponds to a ticket ID
def generate_legacy_ticket_id(ticket_id):
    """Return 'TTB' followed by the numeric part of ``ticket_id``, zero-padded.

    The first three characters of ``ticket_id`` are treated as the prefix and
    the remainder is parsed as an integer; the number is padded back to the
    original digit count. Raises ValueError if the remainder is not an integer.
    """
    prefix_length = 3
    digits = ticket_id[prefix_length:]
    width = len(ticket_id) - prefix_length
    return 'TTB' + format(int(digits), f'0{width}d')

# Collect all Ticket_IDs from existing files
def collect_ticket_ids(input_directory, file_suffix, delimiter):
    """Collect the distinct Ticket_IDs found in the matching CSV files.

    A file matches when its name contains ``file_suffix`` and ends with '.csv'.
    NOTE(review): the suffix test is a plain substring match, so '_27' also
    matches a name such as 'x_2700.csv' — confirm against the naming scheme.

    Args:
        input_directory: Folder to scan (not recursive).
        file_suffix: Substring that selects the files, e.g. '_27'.
        delimiter: Field separator of the CSV files.

    Returns:
        (sorted list of Ticket_IDs, list of matching file names). File names
        are returned in sorted order so that callers relying on
        ``csv_files[0]`` get the same result on every run.
    """
    ticket_ids = set()
    csv_files = []

    # os.listdir returns entries in arbitrary order; sort for reproducibility
    for file in sorted(os.listdir(input_directory)):
        if file_suffix in file and file.endswith('.csv'):
            csv_files.append(file)
            df = pd.read_csv(os.path.join(input_directory, file), usecols=['Ticket_ID'], delimiter=delimiter)
            # Blank Ticket_ID cells are read as NaN (a float), which cannot be
            # sorted together with strings — drop them before collecting
            ticket_ids.update(df['Ticket_ID'].dropna().unique())

    return sorted(ticket_ids), csv_files

# Gather the known Ticket_IDs and the matching file names for each suffix
all_ticket_ids_27, csv_files_27 = collect_ticket_ids(
    input_directory=input_directory, file_suffix='_27', delimiter=delimiter
)
all_ticket_ids_79, csv_files_79 = collect_ticket_ids(
    input_directory=input_directory, file_suffix='_79', delimiter=delimiter
)

def _format_float_cell(value):
    """Render a float without a spurious '.0'; return any other value unchanged."""
    if not isinstance(value, float):
        return value
    number = float(value)
    if number != number:  # NaN marks a missing cell
        return ''
    # Only whole numbers lose their fraction; 10.05 must stay '10.05'
    return str(int(number)) if number.is_integer() else str(number)


def process_files(csv_files, all_ticket_ids, file_suffix, delimiter, output_directory,
                  input_dir=None, chunk_size=300000):
    """Fill gaps in the Ticket_ID sequence with placeholder rows and write chunked CSVs.

    Every ID between the smallest and largest entry of ``all_ticket_ids`` that
    is not present gets a placeholder row with fixed values. All input files
    are combined with those rows, sorted by Ticket_ID, and written to
    ``output_directory`` as '<first input name>_part<N>.csv'.

    Args:
        csv_files: File names (relative to the input folder) to combine.
        all_ticket_ids: Sorted list of existing IDs ('TTB' + zero-padded digits).
        file_suffix: Label used only in the progress message.
        delimiter: Field separator for reading and writing.
        output_directory: Existing folder that receives the output files.
        input_dir: Folder containing ``csv_files``; defaults to the
            notebook-level ``input_directory`` variable.
        chunk_size: Maximum number of data rows per output file.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    if not csv_files or not all_ticket_ids:
        print(f"No input found for suffix '{file_suffix}'; nothing to process.")
        return

    source_dir = input_directory if input_dir is None else input_dir

    # Full ID range, padded to the same digit count as the existing IDs
    id_width = len(all_ticket_ids[0]) - 3
    min_id = int(all_ticket_ids[0][3:])
    max_id = int(all_ticket_ids[-1][3:])
    existing_ids = set(all_ticket_ids)

    # Fixed values for the gap rows
    fixed_values = {
        'Call_Type_ID': 1000,
        'Create_Date': '1/1/1999',
        'status': 'Cancelled',
        'TanggalClosed': '1/1/1999',
        'record_type': 'Case Migration'
    }

    missing_rows = []
    for number in range(min_id, max_id + 1):
        ticket_id = f'TTB{number:0{id_width}d}'
        if ticket_id not in existing_ids:
            missing_rows.append({**fixed_values, 'Ticket_ID': ticket_id, 'Legacy_ticket_id': ticket_id})

    # Read everything first and concatenate once (concat in a loop is quadratic)
    frames = [pd.read_csv(os.path.join(source_dir, file), delimiter=delimiter) for file in csv_files]
    if missing_rows:
        frames.append(pd.DataFrame(missing_rows))
    combined_df = pd.concat(frames, ignore_index=True).sort_values(by='Ticket_ID')

    # Ensure all Legacy_ticket_id values match Ticket_ID
    combined_df['Legacy_ticket_id'] = combined_df['Ticket_ID']

    # Integer columns turn into floats once placeholder rows add NaN; write
    # them back without '.0'. The 'nominal' column is deliberately left as is.
    for col in combined_df.columns:
        if col != 'nominal':  # Replace 'nominal' with the actual name of your nominal column
            combined_df[col] = combined_df[col].map(_format_float_cell)

    # Replace remaining NaN with empty strings
    combined_df = combined_df.fillna('').replace('nan', '')

    # Ceiling division: an exact multiple of chunk_size must not yield an empty extra file
    num_chunks = max(1, (len(combined_df) + chunk_size - 1) // chunk_size)
    base_name = os.path.basename(csv_files[0]).split(".csv")[0]

    for i in range(num_chunks):
        chunk_df = combined_df.iloc[i * chunk_size: (i + 1) * chunk_size]
        output_filename = os.path.join(output_directory, f'{base_name}_part{i + 1}.csv')
        chunk_df.to_csv(output_filename, index=False, sep=delimiter)

    print(f"Processing for files with suffix '{file_suffix}' completed.")

# Process the files for both suffixes
process_files(csv_files_27, all_ticket_ids_27, '27', delimiter, output_directory)
process_files(csv_files_79, all_ticket_ids_79, '79', delimiter, output_directory)


def _part_file_sort_key(filename):
    """Return (second '_'-separated field, part number) for '<name>_part<N>.csv', else None."""
    if not filename.endswith('.csv'):
        return None
    _, marker, part = filename[:-len('.csv')].rpartition('_part')
    if not marker or not part.isdecimal():
        return None
    return (filename.split('_')[1], int(part))


# Read the generated files back in order. Only names of the form
# '<name>_part<N>.csv' are considered, so unrelated or stale files in the
# output folder no longer break the sort key.
output_files = sorted(
    (name for name in os.listdir(output_directory) if _part_file_sort_key(name) is not None),
    key=_part_file_sort_key,
)

for file in output_files:
    df = pd.read_csv(os.path.join(output_directory, file), delimiter=delimiter)
    print(f"Reading file: {file}")
    # Process the DataFrame as needed


Processing for files with suffix '27' completed.
Processing for files with suffix '79' completed.
Reading file: bricare_20200101_20200101_27_kosong_part1.csv
Reading file: bricare_20230101_20230101_79_this_part1.csv


# Case 0.1

In [2]:
import os
import pandas as pd

# Source folder holding the exported CSV files, and the folder that receives
# the gap-filled output (a sub-folder of the source folder).
# NOTE(review): both are absolute paths on one user's machine — adjust before running elsewhere.
input_directory = r"C:\Users\maste\Downloads\dataloader_v60.0.2\output\test_casenumber - Copy"
output_directory = r"C:\Users\maste\Downloads\dataloader_v60.0.2\output\test_casenumber - Copy\output1"

# Field separator passed to every read_csv / to_csv call in this cell
delimiter = ','  # Change this to the appropriate delimiter if necessary (e.g., ';' for semicolon)

# Create the output directory up front; exist_ok=True makes re-running the cell harmless
os.makedirs(output_directory, exist_ok=True)

# Build the legacy ticket ID that corresponds to a ticket ID
def generate_legacy_ticket_id(ticket_id):
    """Return 'TTB' plus the integer after the 3-character prefix, zero-padded to its original width.

    Raises ValueError if the text after the prefix is not an integer.
    """
    number = int(ticket_id[3:])
    pad_to = len(ticket_id) - 3
    return 'TTB{:0{width}d}'.format(number, width=pad_to)

# Collect all Ticket_IDs from existing files
def collect_ticket_ids(input_directory, file_suffix, delimiter):
    """Collect the distinct Ticket_IDs found in the matching CSV files.

    A file matches when its name contains ``file_suffix`` and ends with '.csv'.
    NOTE(review): the suffix test is a plain substring match, so '_79' also
    matches a name such as 'x_7900.csv' — confirm against the naming scheme.

    Args:
        input_directory: Folder to scan (not recursive).
        file_suffix: Substring that selects the files, e.g. '_79'.
        delimiter: Field separator of the CSV files.

    Returns:
        (sorted list of Ticket_IDs, list of matching file names). File names
        are returned in sorted order so that callers relying on
        ``csv_files[0]`` get the same result on every run.
    """
    ticket_ids = set()
    csv_files = []

    # os.listdir returns entries in arbitrary order; sort for reproducibility
    for file in sorted(os.listdir(input_directory)):
        if file_suffix in file and file.endswith('.csv'):
            csv_files.append(file)
            df = pd.read_csv(os.path.join(input_directory, file), usecols=['Ticket_ID'], delimiter=delimiter)
            # Blank Ticket_ID cells are read as NaN (a float), which cannot be
            # sorted together with strings — drop them before collecting
            ticket_ids.update(df['Ticket_ID'].dropna().unique())

    return sorted(ticket_ids), csv_files

# Gather the known Ticket_IDs and the matching file names for suffix '_79'
all_ticket_ids_79, csv_files_79 = collect_ticket_ids(
    input_directory=input_directory, file_suffix='_79', delimiter=delimiter
)

def _format_float_cell(value):
    """Render a float without a spurious '.0'; return any other value unchanged."""
    if not isinstance(value, float):
        return value
    number = float(value)
    if number != number:  # NaN marks a missing cell
        return ''
    # Only whole numbers lose their fraction; 10.05 must stay '10.05'
    return str(int(number)) if number.is_integer() else str(number)


def process_files(csv_files, all_ticket_ids, file_suffix, delimiter, output_directory,
                  input_dir=None, chunk_size=300000):
    """Fill gaps in the Ticket_ID sequence with placeholder rows and write chunked CSVs.

    Every ID between the smallest and largest entry of ``all_ticket_ids`` that
    is not present gets a placeholder row with fixed values. All input files
    are combined with those rows, sorted by Ticket_ID, and written to
    ``output_directory`` as '<first input name>_part<N>.csv'.

    Args:
        csv_files: File names (relative to the input folder) to combine.
        all_ticket_ids: Sorted list of existing IDs ('TTB' + zero-padded digits).
        file_suffix: Label used only in the progress message.
        delimiter: Field separator for reading and writing.
        output_directory: Existing folder that receives the output files.
        input_dir: Folder containing ``csv_files``; defaults to the
            notebook-level ``input_directory`` variable.
        chunk_size: Maximum number of data rows per output file.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    if not csv_files or not all_ticket_ids:
        print(f"No input found for suffix '{file_suffix}'; nothing to process.")
        return

    source_dir = input_directory if input_dir is None else input_dir

    # Full ID range, padded to the same digit count as the existing IDs
    id_width = len(all_ticket_ids[0]) - 3
    min_id = int(all_ticket_ids[0][3:])
    max_id = int(all_ticket_ids[-1][3:])
    existing_ids = set(all_ticket_ids)

    # Fixed values for the gap rows
    fixed_values = {
        'Call_Type_ID': 1000,
        'Create_Date': '1/1/1999',
        'status': 'Cancelled',
        'TanggalClosed': '1/1/1999',
        'record_type': 'Case Migration'
    }

    missing_rows = []
    for number in range(min_id, max_id + 1):
        ticket_id = f'TTB{number:0{id_width}d}'
        if ticket_id not in existing_ids:
            missing_rows.append({**fixed_values, 'Ticket_ID': ticket_id, 'Legacy_ticket_id': ticket_id})

    # Read everything first and concatenate once (concat in a loop is quadratic)
    frames = [pd.read_csv(os.path.join(source_dir, file), delimiter=delimiter) for file in csv_files]
    if missing_rows:
        frames.append(pd.DataFrame(missing_rows))
    combined_df = pd.concat(frames, ignore_index=True).sort_values(by='Ticket_ID')

    # Ensure all Legacy_ticket_id values match Ticket_ID
    combined_df['Legacy_ticket_id'] = combined_df['Ticket_ID']

    # Integer columns turn into floats once placeholder rows add NaN; write
    # them back without '.0'. The 'nominal' column is deliberately left as is.
    for col in combined_df.columns:
        if col != 'nominal':  # Replace 'nominal' with the actual name of your nominal column
            combined_df[col] = combined_df[col].map(_format_float_cell)

    # Replace remaining NaN with empty strings
    combined_df = combined_df.fillna('').replace('nan', '')

    # Ceiling division: an exact multiple of chunk_size must not yield an empty extra file
    num_chunks = max(1, (len(combined_df) + chunk_size - 1) // chunk_size)
    base_name = os.path.basename(csv_files[0]).split(".csv")[0]

    for i in range(num_chunks):
        chunk_df = combined_df.iloc[i * chunk_size: (i + 1) * chunk_size]
        output_filename = os.path.join(output_directory, f'{base_name}_part{i + 1}.csv')
        chunk_df.to_csv(output_filename, index=False, sep=delimiter)

    print(f"Processing for files with suffix '{file_suffix}' completed.")

# Process the files for suffix '_79'
process_files(csv_files_79, all_ticket_ids_79, '79', delimiter, output_directory)


def _part_file_sort_key(filename):
    """Return (second '_'-separated field, part number) for '<name>_part<N>.csv', else None."""
    if not filename.endswith('.csv'):
        return None
    _, marker, part = filename[:-len('.csv')].rpartition('_part')
    if not marker or not part.isdecimal():
        return None
    return (filename.split('_')[1], int(part))


# Read the generated files back in order. Only names of the form
# '<name>_part<N>.csv' are considered, so unrelated or stale files in the
# output folder no longer break the sort key.
output_files = sorted(
    (name for name in os.listdir(output_directory) if _part_file_sort_key(name) is not None),
    key=_part_file_sort_key,
)

for file in output_files:
    df = pd.read_csv(os.path.join(output_directory, file), delimiter=delimiter)
    print(f"Reading file: {file}")
    # Process the DataFrame as needed


Processing for files with suffix '79' completed.
Reading file: bricare_20230101_20230101_79_this_part1.csv


# Case 1

In [None]:
import os
import pandas as pd

# Source folder holding the exported CSV files, and the folder that receives
# the gap-filled output (a sub-folder of the source folder).
# NOTE(review): both are absolute paths on one user's machine — adjust before running elsewhere.
input_directory = r"C:\Users\maste\Downloads\dataloader_v60.0.2\output\test_casenumber - Copy"
output_directory = r"C:\Users\maste\Downloads\dataloader_v60.0.2\output\test_casenumber - Copy\output1"

# Field separator passed to every read_csv / to_csv call in this cell
delimiter = ';'  # Semicolon here; change it if the files use another separator (e.g., ',')

# Create the output directory up front; exist_ok=True makes re-running the cell harmless
os.makedirs(output_directory, exist_ok=True)

# Build the legacy ticket ID that corresponds to a ticket ID
def generate_legacy_ticket_id(ticket_id):
    """Return 'TTB' plus the integer after the 3-character prefix, zero-padded to its original width.

    Raises ValueError if the text after the prefix is not an integer.
    """
    suffix_value = int(ticket_id[3:])
    suffix_width = len(ticket_id) - 3
    return 'TTB' + '%0*d' % (suffix_width, suffix_value)

# Collect all Ticket_IDs from existing files
def collect_ticket_ids(input_directory, file_suffix, delimiter):
    """Collect the distinct Ticket_IDs found in the matching CSV files.

    A file matches when its name contains ``file_suffix`` and ends with '.csv'.
    NOTE(review): the suffix test is a plain substring match, so '_27' also
    matches a name such as 'x_2700.csv' — confirm against the naming scheme.

    Args:
        input_directory: Folder to scan (not recursive).
        file_suffix: Substring that selects the files, e.g. '_27'.
        delimiter: Field separator of the CSV files.

    Returns:
        (sorted list of Ticket_IDs, list of matching file names). File names
        are returned in sorted order so that callers relying on
        ``csv_files[0]`` get the same result on every run.
    """
    ticket_ids = set()
    csv_files = []

    # os.listdir returns entries in arbitrary order; sort for reproducibility
    for file in sorted(os.listdir(input_directory)):
        if file_suffix in file and file.endswith('.csv'):
            csv_files.append(file)
            df = pd.read_csv(os.path.join(input_directory, file), usecols=['Ticket_ID'], delimiter=delimiter)
            # Blank Ticket_ID cells are read as NaN (a float), which cannot be
            # sorted together with strings — drop them before collecting
            ticket_ids.update(df['Ticket_ID'].dropna().unique())

    return sorted(ticket_ids), csv_files

# Gather the known Ticket_IDs and the matching file names for each suffix
all_ticket_ids_27, csv_files_27 = collect_ticket_ids(
    input_directory=input_directory, file_suffix='_27', delimiter=delimiter
)
all_ticket_ids_79, csv_files_79 = collect_ticket_ids(
    input_directory=input_directory, file_suffix='_79', delimiter=delimiter
)

def _format_float_cell(value):
    """Render a float without a spurious '.0'; return any other value unchanged."""
    if not isinstance(value, float):
        return value
    number = float(value)
    if number != number:  # NaN marks a missing cell
        return ''
    # Only whole numbers lose their fraction; 10.05 must stay '10.05'
    return str(int(number)) if number.is_integer() else str(number)


def process_files(csv_files, all_ticket_ids, file_suffix, delimiter, output_directory,
                  input_dir=None, chunk_size=300000):
    """Fill every Ticket_ID gap from number 1 up to the current maximum and write chunked CSVs.

    Every ID from 1 to the largest entry of ``all_ticket_ids`` that is not
    present gets a placeholder row with fixed values. All input files are
    combined with those rows, sorted by Ticket_ID, and written to
    ``output_directory`` as '<first input name>_part<N>.csv'.

    Args:
        csv_files: File names (relative to the input folder) to combine.
        all_ticket_ids: Sorted list of existing IDs ('TTB' + zero-padded digits).
        file_suffix: Label used only in the progress message.
        delimiter: Field separator for reading and writing.
        output_directory: Existing folder that receives the output files.
        input_dir: Folder containing ``csv_files``; defaults to the
            notebook-level ``input_directory`` variable.
        chunk_size: Maximum number of data rows per output file.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    if not csv_files or not all_ticket_ids:
        print(f"No input found for suffix '{file_suffix}'; nothing to process.")
        return

    source_dir = input_directory if input_dir is None else input_dir

    # Pad generated IDs to the digit count of the existing IDs (12 digits for
    # IDs like TTB000000000001); a fixed width would mark every ID as missing
    # whenever the data uses a different width.
    id_width = len(all_ticket_ids[0]) - 3
    max_id = int(all_ticket_ids[-1][3:])
    existing_ids = set(all_ticket_ids)

    # Fixed values for the gap rows
    fixed_values = {
        'Call_Type_ID': 1000,
        'Create_Date': '1/1/1999',
        'status': 'Cancelled',
        'TanggalClosed': '1/1/1999',
        'record_type': 'Case Migration'
    }

    missing_rows = []
    for number in range(1, max_id + 1):
        ticket_id = f'TTB{number:0{id_width}d}'
        if ticket_id not in existing_ids:
            missing_rows.append({**fixed_values, 'Ticket_ID': ticket_id, 'Legacy_ticket_id': ticket_id})

    # Read everything first and concatenate once (concat in a loop is quadratic)
    frames = [pd.read_csv(os.path.join(source_dir, file), delimiter=delimiter) for file in csv_files]
    if missing_rows:
        frames.append(pd.DataFrame(missing_rows))
    combined_df = pd.concat(frames, ignore_index=True).sort_values(by='Ticket_ID')

    # Ensure all Legacy_ticket_id values match Ticket_ID
    combined_df['Legacy_ticket_id'] = combined_df['Ticket_ID']

    # Integer columns turn into floats once placeholder rows add NaN; write
    # them back without '.0'. The 'nominal' column is deliberately left as is.
    for col in combined_df.columns:
        if col != 'nominal':  # Replace 'nominal' with the actual name of your nominal column
            combined_df[col] = combined_df[col].map(_format_float_cell)

    # Replace remaining NaN with empty strings
    combined_df = combined_df.fillna('').replace('nan', '')

    # Ceiling division: an exact multiple of chunk_size must not yield an empty extra file
    num_chunks = max(1, (len(combined_df) + chunk_size - 1) // chunk_size)
    base_name = os.path.basename(csv_files[0]).split(".csv")[0]

    for i in range(num_chunks):
        chunk_df = combined_df.iloc[i * chunk_size: (i + 1) * chunk_size]
        output_filename = os.path.join(output_directory, f'{base_name}_part{i + 1}.csv')
        chunk_df.to_csv(output_filename, index=False, sep=delimiter)

    print(f"Processing for files with suffix '{file_suffix}' completed.")

# Process the files for both suffixes
process_files(csv_files_27, all_ticket_ids_27, '27', delimiter, output_directory)
process_files(csv_files_79, all_ticket_ids_79, '79', delimiter, output_directory)


def _part_file_sort_key(filename):
    """Return (second '_'-separated field, part number) for '<name>_part<N>.csv', else None."""
    if not filename.endswith('.csv'):
        return None
    _, marker, part = filename[:-len('.csv')].rpartition('_part')
    if not marker or not part.isdecimal():
        return None
    return (filename.split('_')[1], int(part))


# Read the generated files back in order. Only names of the form
# '<name>_part<N>.csv' are considered, so unrelated or stale files in the
# output folder no longer break the sort key.
output_files = sorted(
    (name for name in os.listdir(output_directory) if _part_file_sort_key(name) is not None),
    key=_part_file_sort_key,
)

for file in output_files:
    df = pd.read_csv(os.path.join(output_directory, file), delimiter=delimiter)
    print(f"Reading file: {file}")
    # Process the DataFrame as needed


# Case 2

In [None]:
import os
import pandas as pd

def clean_csv_files(input_folder, output_folder, delimiter=','):
    """Write a cleaned copy of every CSV file in ``input_folder`` to ``output_folder``.

    For each file: an existing 'Legacy_Ticket_ID' column is dropped,
    'Legacy_ticket_id' is renamed to 'Legacy_Ticket_ID', missing values become
    empty strings, and whole-number floats are written without '.0'.

    Args:
        input_folder: Folder scanned (not recursively) for '*.csv' files.
        output_folder: Folder receiving files of the same name; created if missing.
        delimiter: Field separator used for reading and writing (default ',').
    """
    os.makedirs(output_folder, exist_ok=True)

    def _format_cell(value):
        """Return the cell as text: '' for missing, no '.0' on whole-number floats."""
        if pd.isna(value):
            return ''
        if isinstance(value, float):
            number = float(value)
            # Only whole numbers lose their fraction; 10.05 must stay '10.05'
            return str(int(number)) if number.is_integer() else str(number)
        return str(value)

    # Sorted so the processing order (and the printed log) is reproducible
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(input_folder, filename)
        data = pd.read_csv(file_path, delimiter=delimiter)

        # Remove the 'Legacy_Ticket_ID' column if it exists
        if 'Legacy_Ticket_ID' in data.columns:
            data = data.drop(columns=['Legacy_Ticket_ID'])

        # Rename the 'Legacy_ticket_id' column to 'Legacy_Ticket_ID' if it exists
        if 'Legacy_ticket_id' in data.columns:
            data = data.rename(columns={'Legacy_ticket_id': 'Legacy_Ticket_ID'})

        # Series.map is used per column because DataFrame.applymap is deprecated
        for column in data.columns:
            data[column] = data[column].map(_format_cell)

        # Save the cleaned data to the output folder
        cleaned_file_path = os.path.join(output_folder, filename)
        data.to_csv(cleaned_file_path, index=False, sep=delimiter)
        print(f"Processed and saved: {cleaned_file_path}")

# Folder written by the gap-filling step, and the folder that receives the cleaned copies
input_folder = r"C:\Users\maste\Downloads\dataloader_v60.0.2\output\test_casenumber - Copy\output1"
output_folder = r"D:\cleaned"

# Clean every CSV file found in the input folder
clean_csv_files(input_folder=input_folder, output_folder=output_folder)

In [None]:
# to check number of rows
import os
import pandas as pd

# Folder whose CSV files are counted
folder_path = 'path/to/your/folder'

# One record per CSV file: its name and its number of data rows
# (len() of the parsed frame, so the header line is not counted)
file_details = []
for filename in os.listdir(folder_path):
    if not filename.endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join(folder_path, filename))
    file_details.append({'Filename': filename, 'Number of Rows': len(df)})

# Tabulate the records and store them as an Excel workbook
file_details_df = pd.DataFrame(file_details)
output_file_path = 'file_details.xlsx'
file_details_df.to_excel(output_file_path, index=False)

print(f'File details saved to {output_file_path}')


## Account

In [None]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('yourfile.csv', delimiter=';')

# Check for duplicate 'cifno'
duplicate_cifno = df.duplicated('cifno')
if duplicate_cifno.any():
    print("There are duplicate 'cifno' values.")
else:
    print("All 'cifno' values are unique.")

# Fill empty 'Nama' fields with "No Name".
# Assign the result back: calling fillna/replace with inplace=True on
# df['Nama'] acts on a temporary object under pandas Copy-on-Write and
# would leave df unchanged.
df['Nama'] = df['Nama'].fillna('No Name').replace('', 'No Name')

# Save the updated DataFrame to a new CSV file
df.to_csv('updated_yourfile.csv', index=False, sep=';')

print("Processing complete. The updated file has been saved as 'updated_yourfile.csv'.")


# Closed Date

In [None]:
import os
import pandas as pd

def _excel_sheet_name(filename, used_names):
    """Return a sheet name derived from ``filename`` that Excel accepts and that is not yet used.

    Characters Excel forbids in sheet names are replaced by '_', the name is cut
    to 31 characters, and a '~N' suffix is added on (case-insensitive) collision.
    ``used_names`` (a set of lower-cased names) is updated in place.
    """
    cleaned = ''.join('_' if ch in '[]:*?/\\' else ch for ch in filename)
    candidate = cleaned[:31]
    counter = 2
    while candidate.lower() in used_names:
        suffix = f'~{counter}'
        candidate = cleaned[:31 - len(suffix)] + suffix
        counter += 1
    used_names.add(candidate.lower())
    return candidate


def check_dates_in_files_and_generate_excel(folder_path, output_excel_file, delimiter=','):
    """Check that Create_Date is not after TanggalClosed in every CSV file and report to Excel.

    The workbook gets a 'Summary' sheet (one line per file) plus one detail
    sheet per file that contains offending rows. A row is offending when
    Create_Date is later than TanggalClosed, or when either date is missing or
    cannot be parsed (such values compare as not valid).

    Args:
        folder_path: Folder scanned (not recursively) for '*.csv' files.
        output_excel_file: Path of the workbook to write.
        delimiter: Field separator of the CSV files (default ',').
    """
    results = []
    detail_frames = {}            # sheet name -> offending rows of one file
    used_sheet_names = {'summary'}

    # Sorted so the report order is reproducible
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):  # assuming the files are in CSV format
            continue

        df = pd.read_csv(os.path.join(folder_path, filename), delimiter=delimiter)

        if 'Create_Date' not in df.columns or 'TanggalClosed' not in df.columns:
            results.append({
                'Filename': filename,
                'All_Dates_Correct': False,
                'Incorrect_Rows': 'Missing required columns',
                'Detail_Sheet': ''
            })
            continue

        # Unparseable values become NaT instead of raising
        df['Create_Date'] = pd.to_datetime(df['Create_Date'], errors='coerce')
        df['TanggalClosed'] = pd.to_datetime(df['TanggalClosed'], errors='coerce')

        # A ticket is valid when it was created on or before its closing date
        is_valid = df['Create_Date'] <= df['TanggalClosed']
        # Keep the offending rows of THIS file; they are written after the loop
        incorrect_df = df[~is_valid]

        detail_sheet = ''
        if not incorrect_df.empty:
            detail_sheet = _excel_sheet_name(filename, used_sheet_names)
            detail_frames[detail_sheet] = incorrect_df

        results.append({
            'Filename': filename,
            # Plain bool, not numpy.bool_, so the value behaves like True/False everywhere
            'All_Dates_Correct': bool(is_valid.all()),
            # Row labels as text: a list cannot be stored in an Excel cell
            'Incorrect_Rows': ', '.join(str(label) for label in incorrect_df.index),
            'Detail_Sheet': detail_sheet
        })

    results_df = pd.DataFrame(
        results, columns=['Filename', 'All_Dates_Correct', 'Incorrect_Rows', 'Detail_Sheet']
    )

    # Save the summary and the per-file details to one workbook
    with pd.ExcelWriter(output_excel_file) as writer:
        results_df.to_excel(writer, index=False, sheet_name='Summary')
        for sheet_name, incorrect_df in detail_frames.items():
            incorrect_df.to_excel(writer, sheet_name=sheet_name, index=True)



# Folder with the cleaned CSV files, and the workbook that receives the report
folder_path = r"D:\cleaned"
output_excel_file = 'date_check_results.xlsx'

check_dates_in_files_and_generate_excel(
    folder_path=folder_path,
    output_excel_file=output_excel_file,
)

print(f"Results have been saved to {output_excel_file}")


In [None]:
import os
import shutil
from datetime import datetime

def _unique_destination(folder, filename):
    """Return a path inside ``folder`` for ``filename`` that does not exist yet ('name_1.ext', ...)."""
    candidate = os.path.join(folder, filename)
    stem, extension = os.path.splitext(filename)
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(folder, f'{stem}_{counter}{extension}')
        counter += 1
    return candidate


def move_files_by_date(source_folder, destination_folder, start_date):
    """Move every file under ``source_folder`` whose ctime is at or after ``start_date``.

    Files from all sub-folders end up directly in ``destination_folder``
    (created if missing). The timestamp comes from os.path.getctime, which is
    the creation time on Windows but the last metadata change on Unix.

    Args:
        source_folder: Root of the tree to scan.
        destination_folder: Folder receiving the moved files.
        start_date: Naive datetime; timestamps are compared at whole-second precision.
    """
    os.makedirs(destination_folder, exist_ok=True)

    def _canonical(path):
        return os.path.normcase(os.path.abspath(path))

    destination_key = _canonical(destination_folder)

    for root, dirs, files in os.walk(source_folder):
        # Never descend into the destination: files already moved there would
        # be visited again and shutil.move would fail on them
        dirs[:] = [name for name in dirs if _canonical(os.path.join(root, name)) != destination_key]
        if _canonical(root) == destination_key:
            continue

        for file in files:
            file_path = os.path.join(root, file)
            # Whole-second precision, as the original format/parse round-trip produced
            file_create_date = datetime.fromtimestamp(os.path.getctime(file_path)).replace(microsecond=0)

            if file_create_date >= start_date:
                # Files from different sub-folders may share a name; pick a free
                # target name instead of aborting halfway through the tree
                shutil.move(file_path, _unique_destination(destination_folder, file))
                print(f'Moved: {file_path} to {destination_folder}')

# Folder tree to scan, and the folder that receives the moved files
source_folder = r'path\to\source\folder'
destination_folder = r'path\to\destination\folder'

# Files with a timestamp at or after this moment are moved
start_date = datetime(2022, 1, 1, 0, 0, 0)

move_files_by_date(
    source_folder=source_folder,
    destination_folder=destination_folder,
    start_date=start_date,
)
