In [None]:
import pandas as pd

# Paths of the two CSV inputs, relative to the notebook's working directory.
INVOICING_CSV = "invoicing_rev.06.xlsx - invoicing.csv"
COLLECTION_CSV = "collection.swd.rev.16.xlsx - Sheet1.csv"

# Load each file into its own DataFrame.
invoicing_df = pd.read_csv(INVOICING_CSV)
collection_df = pd.read_csv(COLLECTION_CSV)

# Show the first 5 rows of each DataFrame as a markdown table.
print(invoicing_df.head().to_markdown(index=False, numalign="left", stralign="left"))
print(collection_df.head().to_markdown(index=False, numalign="left", stralign="left"))

# Show column names, non-null counts and dtypes for each DataFrame.
# `DataFrame.info()` writes its summary to stdout itself and returns None, so
# it is called directly: wrapping it in `print()` would add a stray "None"
# line after each summary.
invoicing_df.info()
collection_df.info()

In [None]:
# Build one combined table of invoicing and collection rows, each tagged with
# its origin (`Source`) and a readable `Project_ID`.

# Drop rows that lack the key columns. The frames are re-bound instead of
# modified with `inplace=True`; every step here is idempotent, so re-running
# the cell produces the same frames.
invoicing_df = invoicing_df.dropna(subset=['Project NUM', 'Cost Center'])
collection_df = collection_df.dropna(subset=['Cost Center'])

# `Project NUM` becomes stripped text; `Cost Center` becomes numeric
# (`pd.to_numeric` raises if an entry cannot be parsed as a number).
invoicing_df = invoicing_df.assign(**{
    'Project NUM': invoicing_df['Project NUM'].astype(str).str.strip(),
    'Cost Center': pd.to_numeric(invoicing_df['Cost Center']),
})
collection_df = collection_df.assign(**{
    'Cost Center': pd.to_numeric(collection_df['Cost Center']),
})

# Tag every row with its origin *before* concatenating. Inferring the origin
# afterwards from `Project Name` being non-null would label a collection row
# with a missing name as 'Invoicing'. `ignore_index=True` gives the combined
# frame unique row labels instead of repeating each input's labels.
projects_df = pd.concat(
    [
        invoicing_df.assign(Source='Invoicing'),
        collection_df.assign(Source='Collections'),
    ],
    ignore_index=True,
)


def _project_id(row):
    """Return '<key> - <name>' for one row of the combined frame.

    Invoicing rows use `Project NUM` and `project name`; collection rows use
    `Cost Center` (rendered as an integer) and `Project Name`. A missing name
    is rendered as an empty string rather than raising on `str + NaN`.
    """
    if row['Source'] == 'Invoicing':
        key, name = row['Project NUM'], row['project name']
    else:
        key, name = str(int(row['Cost Center'])), row['Project Name']
    label = '' if pd.isna(name) else name
    return f'{key} - {label}'


projects_df['Project_ID'] = projects_df.apply(_project_id, axis=1)

# Keep only the columns of interest, identifiers first.
projects_df = projects_df[[
    'Project_ID',
    'Source',
    'Project NUM',
    'project name',
    'Cost Center',
    'Project Name',
    'Rec No',
    'Customer/ Activity',
    'Customer Name',
    'Invoice Date',
    'Amount included VAT',
    'Receipt Date',
    'Amount',
]]

# Display the first 10 rows of the combined dataframe
print(projects_df.head(10).to_markdown(index=False, numalign="left", stralign="left"))

In [None]:
import re

# Build the combined invoicing/collection table, tolerating text in
# `Cost Center` by keeping only its digits before the numeric conversion.


def _clean_cost_center(values):
    """Return a cost-center column as numbers.

    Text entries keep only their digits. Two cases are handled so values are
    not corrupted:
    * an already-numeric column is returned unchanged, which also makes
      re-running the cell safe;
    * a zero-only fraction is removed first, because stripping the '.' from
      '123.0' would otherwise produce 1230.
    Entries without any digit become NaN.
    """
    if pd.api.types.is_numeric_dtype(values):
        return values
    text = values.astype(str).str.strip()
    text = text.str.replace(r'\.0+$', '', regex=True)
    digits = text.str.replace(r'[^0-9]', '', regex=True)
    return pd.to_numeric(digits, errors='coerce')


# Clean `Cost Center` first and drop nulls afterwards, so rows whose cost
# center contains no digits are removed on the first run as well as on a
# re-run. Frames are re-bound rather than modified with `inplace=True`.
invoicing_df = (
    invoicing_df
    .assign(**{'Cost Center': _clean_cost_center(invoicing_df['Cost Center'])})
    .dropna(subset=['Project NUM', 'Cost Center'])
)
collection_df = (
    collection_df
    .assign(**{'Cost Center': _clean_cost_center(collection_df['Cost Center'])})
    .dropna(subset=['Cost Center'])
)

# `Project NUM` as text without leading/trailing whitespace.
invoicing_df = invoicing_df.assign(**{
    'Project NUM': invoicing_df['Project NUM'].astype(str).str.strip(),
})

# Tag every row with its origin *before* concatenating. Inferring the origin
# afterwards from `Project Name` being non-null would label a collection row
# with a missing name as 'Invoicing'. `ignore_index=True` gives the combined
# frame unique row labels.
projects_df = pd.concat(
    [
        invoicing_df.assign(Source='Invoicing'),
        collection_df.assign(Source='Collections'),
    ],
    ignore_index=True,
)


def _project_id(row):
    """Return '<key> - <name>' for one row of the combined frame.

    Invoicing rows use `Project NUM` and `project name`; collection rows use
    `Cost Center` (rendered as an integer) and `Project Name`. A missing name
    is rendered as an empty string rather than raising on `str + NaN`.
    """
    if row['Source'] == 'Invoicing':
        key, name = row['Project NUM'], row['project name']
    else:
        key, name = str(int(row['Cost Center'])), row['Project Name']
    label = '' if pd.isna(name) else name
    return f'{key} - {label}'


projects_df['Project_ID'] = projects_df.apply(_project_id, axis=1)

# Keep only the columns of interest, identifiers first.
projects_df = projects_df[[
    'Project_ID',
    'Source',
    'Project NUM',
    'project name',
    'Cost Center',
    'Project Name',
    'Rec No',
    'Customer/ Activity',
    'Customer Name',
    'Invoice Date',
    'Amount included VAT',
    'Receipt Date',
    'Amount',
]]

# Display the first 10 rows of the combined dataframe
print(projects_df.head(10).to_markdown(index=False, numalign="left", stralign="left"))

In [None]:
import re

# Build the combined invoicing/collection table: clean the key columns, tag
# each row with its origin, derive `Project_ID`, and keep the report columns.


def _clean_cost_center(values):
    """Return a cost-center column as numbers.

    Text entries keep only their digits. Two cases are handled so values are
    not corrupted:
    * an already-numeric column is returned unchanged, which also makes
      re-running the cell safe;
    * a zero-only fraction is removed first, because stripping the '.' from
      '123.0' would otherwise produce 1230.
    Entries without any digit become NaN.
    """
    if pd.api.types.is_numeric_dtype(values):
        return values
    text = values.astype(str).str.strip()
    text = text.str.replace(r'\.0+$', '', regex=True)
    digits = text.str.replace(r'[^0-9]', '', regex=True)
    return pd.to_numeric(digits, errors='coerce')


def _project_id(row):
    """Return '<key> - <name>' for one row of the combined frame.

    Invoicing rows use `Project NUM` and `project name`; collection rows use
    `Cost Center` and `Project Name`. A whole-number cost center is rendered
    without a trailing '.0' so the ID does not depend on whether the column
    happens to be stored as float. A missing name is rendered as an empty
    string rather than raising on `str + NaN`.
    """
    if row['Source'] == 'Invoicing':
        key, name = row['Project NUM'], row['project name']
    else:
        cost_center = row['Cost Center']
        whole = float(cost_center).is_integer()
        key = str(int(cost_center)) if whole else str(cost_center)
        name = row['Project Name']
    label = '' if pd.isna(name) else name
    return f'{key} - {label}'


# Clean `Cost Center` first and drop nulls afterwards, so rows whose cost
# center contains no digits are removed on the first run as well as on a
# re-run. Frames are re-bound rather than modified with `inplace=True`.
invoicing_df = (
    invoicing_df
    .assign(**{'Cost Center': _clean_cost_center(invoicing_df['Cost Center'])})
    .dropna(subset=['Project NUM', 'Cost Center'])
)
collection_df = (
    collection_df
    .assign(**{'Cost Center': _clean_cost_center(collection_df['Cost Center'])})
    .dropna(subset=['Cost Center'])
)

# `Project NUM` as text without leading/trailing whitespace.
invoicing_df = invoicing_df.assign(**{
    'Project NUM': invoicing_df['Project NUM'].astype(str).str.strip(),
})

# Tag every row with its origin *before* concatenating. Inferring the origin
# afterwards from `Project Name` being non-null would label a collection row
# with a missing name as 'Invoicing'. `ignore_index=True` gives the combined
# frame unique row labels.
projects_df = pd.concat(
    [
        invoicing_df.assign(Source='Invoicing'),
        collection_df.assign(Source='Collections'),
    ],
    ignore_index=True,
)

projects_df['Project_ID'] = projects_df.apply(_project_id, axis=1)

# Keep only the columns of interest, identifiers first.
projects_df = projects_df[[
    'Project_ID',
    'Source',
    'Project NUM',
    'project name',
    'Cost Center',
    'Project Name',
    'Rec No',
    'Customer/ Activity',
    'Customer Name',
    'Invoice Date',
    'Amount included VAT',
    'Receipt Date',
    'Amount',
]]

# Display the first 10 rows of the combined dataframe
print(projects_df.head(10).to_markdown(index=False, numalign="left", stralign="left"))

In [None]:
# Order the combined rows by `Project_ID`, then by `Source` within each ID.
# The sort is applied to `projects_df` itself.
sort_keys = ['Project_ID', 'Source']
projects_df.sort_values(by=sort_keys, inplace=True)

# Preview the top 10 rows of the sorted frame as a markdown table.
preview_md = projects_df.head(10).to_markdown(
    index=False,
    numalign="left",
    stralign="left",
)
print(preview_md)