In [1]:
import pandas as pd

# Load the uploaded files into dataframes - csv files downloaded from QuickBooks
file_path1 = "ManorExpenses.csv"
file_path2 = "manorexpenses2.csv"

df1 = pd.read_csv(file_path1)
df2 = pd.read_csv(file_path2)

# Display the first few rows of both dataframes to understand their structures
df1_head = df1.head()
df2_head = df2.head()

df1_head, df2_head


(            Natural Electric LLC                     Unnamed: 1  Unnamed: 2  \
 0             Transaction Report                            NaN         NaN   
 1                      All Dates                            NaN         NaN   
 2                            NaN                        Account        Date   
 3  51400 Job Materials Purchased                            NaN         NaN   
 4                            NaN  51400 Job Materials Purchased  03/28/2019   
 
          Unnamed: 3 Unnamed: 4  
 0               NaN        NaN  
 1               NaN        NaN  
 2  Transaction type     Amount  
 3               NaN        NaN  
 4              Bill  $8,052.47  ,
   Natural Electric LLC Unnamed: 1  Unnamed: 2        Unnamed: 3 Unnamed: 4
 0   Transaction Report        NaN         NaN               NaN        NaN
 1            All Dates        NaN         NaN               NaN        NaN
 2                  NaN    Account        Date  Transaction type     Amount
 3       

In [2]:
# Reloading the datasets, skipping the irrelevant metadata rows
df1_clean = pd.read_csv(file_path1, skiprows=4)  # Start reading from row 4
df2_clean = pd.read_csv(file_path2, skiprows=4)  # Start reading from row 4

# Check if the columns are identical before merging
columns_match = df1_clean.columns == df2_clean.columns

# If columns match, concatenate the dataframes
if columns_match.all():
    combined_df = pd.concat([df1_clean, df2_clean], ignore_index=True)
else:
    combined_df = None  # If columns don't match, we'll handle it differently

# Display the combined dataset or column mismatch message
combined_df_head = combined_df.head() if combined_df is not None else "Column mismatch detected."
combined_df_head


Unnamed: 0.1,Unnamed: 0,Account,Date,Transaction type,Amount
0,51400 Job Materials Purchased,,,,
1,,51400 Job Materials Purchased,03/28/2019,Bill,"$8,052.47"
2,,51400 Job Materials Purchased,04/01/2019,Bill,$924.55
3,,51400 Job Materials Purchased,04/01/2019,Bill,"$1,306.59"
4,,51400 Job Materials Purchased,04/01/2019,Bill,$700.32


In [4]:
# Remove the first column
cleaned_df_without_first_column = combined_df.iloc[:, 1:]

# Display the first few rows of the updated dataframe
cleaned_df_without_first_column_head = cleaned_df_without_first_column.head()
cleaned_df_without_first_column_head


Unnamed: 0,Account,Date,Transaction type,Amount
0,,,,
1,51400 Job Materials Purchased,03/28/2019,Bill,"$8,052.47"
2,51400 Job Materials Purchased,04/01/2019,Bill,$924.55
3,51400 Job Materials Purchased,04/01/2019,Bill,"$1,306.59"
4,51400 Job Materials Purchased,04/01/2019,Bill,$700.32


In [5]:
# Report the number of NaN and missing values in each column
missing_values_report = cleaned_df_without_first_column.isna().sum()

missing_values_report


Account             32
Date                32
Transaction type    32
Amount              16
dtype: int64

In [6]:
# Drop all rows with any NaN values
cleaned_df_no_na = cleaned_df_without_first_column.dropna()

# Identify rows with any missing values
rows_with_missing_values = cleaned_df_no_na[cleaned_df_no_na.isna().any(axis=1)]

# Display the rows with missing values (if any)
if not rows_with_missing_values.empty:
    print("Rows with missing values:")
    print(rows_with_missing_values)
else:
    print("No rows with missing values found.")


No rows with missing values found.


In [7]:
# Display the data types of each column in the cleaned dataframe
data_types = cleaned_df_no_na.dtypes
data_types

Account             object
Date                object
Transaction type    object
Amount              object
dtype: object

In [10]:
# Modify 'Amount' and 'Date' columns in cleaned_df_no_na
cleaned_df_no_na.loc[:, 'Amount'] = (
    cleaned_df_no_na['Amount']
    .replace(r'[\$,]', '', regex=True)
    .astype(float)
)

cleaned_df_no_na.loc[:, 'Date'] = pd.to_datetime(
    cleaned_df_no_na['Date'], errors='coerce'
)

# Display data types
cleaned_df_no_na.dtypes

Account                     object
Date                datetime64[ns]
Transaction type            object
Amount                     float64
dtype: object

In [11]:
# Display all unique values in the 'Account' column
unique_account_types = cleaned_df_no_na['Account'].unique()

unique_account_types


array(['51400 Job Materials Purchased', '50800 Equipment Rental for Jobs',
       '53600 Subcontractors Expense', 'PT Casual Labor', 'Fuel',
       '64300 Meals and Entertainment', '60100 Auto and Truck Expenses',
       '63300 Insurance Expense', '60400 Bank Service Charges',
       'Employee Education and Resources', 'Employee Bonus', 'Wages',
       'Taxes'], dtype=object)

In [13]:
# Remove leading numbers and spaces from the 'Account' column using regex
cleaned_df_no_na.loc[:, 'Account'] = cleaned_df_no_na['Account'].str.replace(r'^\d+\s+', '', regex=True)

# Display the unique account types after cleaning
unique_account_types_cleaned = cleaned_df_no_na['Account'].unique()

print(unique_account_types_cleaned)


['Job Materials Purchased' 'Equipment Rental for Jobs'
 'Subcontractors Expense' 'PT Casual Labor' 'Fuel'
 'Meals and Entertainment' 'Auto and Truck Expenses' 'Insurance Expense'
 'Bank Service Charges' 'Employee Education and Resources'
 'Employee Bonus' 'Wages' 'Taxes']


In [14]:
# Remove rows where 'Account' is 'Employee Bonus'
cleaned_df_no_na = cleaned_df_no_na[cleaned_df_no_na['Account'] != 'Employee Bonus']

#Display unique account types to confirm removal

unique_account_types_cleaned = cleaned_df_no_na['Account'].unique()

print(unique_account_types_cleaned)


['Job Materials Purchased' 'Equipment Rental for Jobs'
 'Subcontractors Expense' 'PT Casual Labor' 'Fuel'
 'Meals and Entertainment' 'Auto and Truck Expenses' 'Insurance Expense'
 'Bank Service Charges' 'Employee Education and Resources' 'Wages' 'Taxes']


In [15]:
# Replace specified account names with 'Labor'
cleaned_df_no_na['Account'] = cleaned_df_no_na['Account'].replace(
    ['PT Casual Labor', 'Wages', 'Taxes'], 'Labor'
)

# Display the unique account types to confirm the change
unique_account_types_updated = cleaned_df_no_na['Account'].unique()

unique_account_types_updated


array(['Job Materials Purchased', 'Equipment Rental for Jobs',
       'Subcontractors Expense', 'Labor', 'Fuel',
       'Meals and Entertainment', 'Auto and Truck Expenses',
       'Insurance Expense', 'Bank Service Charges',
       'Employee Education and Resources'], dtype=object)

In [16]:
# Replace specified account names with 'Other'
cleaned_df_no_na['Account'] = cleaned_df_no_na['Account'].replace(
    ['Meals and Entertainment', 'Auto and Truck Expenses', 
     'Insurance Expense', 'Bank Service Charges'], 'Other'
)

# Display the unique account types to confirm the change
unique_account_types_final = cleaned_df_no_na['Account'].unique()

unique_account_types_final


array(['Job Materials Purchased', 'Equipment Rental for Jobs',
       'Subcontractors Expense', 'Labor', 'Fuel', 'Other',
       'Employee Education and Resources'], dtype=object)

In [17]:
# Remove the row with 'Employee Education and Resources'
cleaned_df_no_na = cleaned_df_no_na[cleaned_df_no_na['Account'] != 'Employee Education and Resources']

# Display the unique account types to confirm the change
unique_account_types_final = cleaned_df_no_na['Account'].unique()

unique_account_types_final


array(['Job Materials Purchased', 'Equipment Rental for Jobs',
       'Subcontractors Expense', 'Labor', 'Fuel', 'Other'], dtype=object)

In [18]:
#Shift the labor dates one week prior to better reflect dates of actual labor - paycheck one week prior to payperiod end
# Subtract one week from the 'Date' column for all 'Labor' entries

cleaned_df_no_na.loc[cleaned_df_no_na['Account'] == 'Labor', 'Date'] = (
    cleaned_df_no_na.loc[cleaned_df_no_na['Account'] == 'Labor', 'Date'] - pd.Timedelta(weeks=1)
)

# Display the updated 'Labor' entries to confirm the change
labor_entries_updated = cleaned_df_no_na[cleaned_df_no_na['Account'] == 'Labor']

labor_entries_updated

Unnamed: 0,Account,Date,Transaction type,Amount
410,Labor,2020-06-22,Expense,333.00
411,Labor,2020-06-24,Expense,307.40
641,Labor,2019-03-15,Payroll Check,999.26
642,Labor,2019-03-15,Payroll Check,130.67
643,Labor,2019-03-15,Payroll Check,1121.31
...,...,...,...,...
1000,Labor,2020-10-23,Payroll Check,14.82
1001,Labor,2020-11-06,Payroll Check,17.04
1002,Labor,2021-01-15,Payroll Check,3.43
1003,Labor,2021-01-15,Payroll Check,0.79


In [19]:
# Rename the dataframe to 'manor_expense_df'
manor_expense_df = cleaned_df_no_na

# Save the dataframe to a CSV file with the name 'ManorExpensesFinal.csv'
csv_path = "ManorExpensesFinal.csv"
manor_expense_df.to_csv(csv_path, index=False)

manor_expense_df.head

<bound method NDFrame.head of                       Account       Date Transaction type   Amount
1     Job Materials Purchased 2019-03-28             Bill  8052.47
2     Job Materials Purchased 2019-04-01             Bill   924.55
3     Job Materials Purchased 2019-04-01             Bill  1306.59
4     Job Materials Purchased 2019-04-01             Bill   700.32
5     Job Materials Purchased 2019-04-01             Bill   424.47
...                       ...        ...              ...      ...
1000                    Labor 2020-10-23    Payroll Check    14.82
1001                    Labor 2020-11-06    Payroll Check    17.04
1002                    Labor 2021-01-15    Payroll Check     3.43
1003                    Labor 2021-01-15    Payroll Check     0.79
1004                    Labor 2021-02-12    Payroll Check     4.94

[975 rows x 4 columns]>

In [20]:
# Create a new dataframe 'NE_jobs_df' with the specified columns
columns = [
    "Job Name", "Materials", "Labor", "Equipment Rental", 
    "Subcontractor Expense", "Fuel", "Other", 
    "Total Cost", "Total Income", "Profit Margin", "Vendor Credits"
]

# Initialize the new dataframe with the specified columns
NE_jobs_df = pd.DataFrame(columns=columns)

# Display the new dataframe to confirm its creation
NE_jobs_df


Unnamed: 0,Job Name,Materials,Labor,Equipment Rental,Subcontractor Expense,Fuel,Other,Total Cost,Total Income,Profit Margin,Vendor Credits


In [23]:
# Calculate the necessary values from 'manor_expense_df'
materials_sum = manor_expense_df[manor_expense_df['Account'] == 'Job Materials Purchased']['Amount'].sum()
labor_sum = manor_expense_df[manor_expense_df['Account'] == 'Labor']['Amount'].sum()
equipment_rental_sum = manor_expense_df[manor_expense_df['Account'] == 'Equipment Rental for Jobs']['Amount'].sum()
subcontractor_expense_sum = manor_expense_df[manor_expense_df['Account'] == 'Subcontractors Expense']['Amount'].sum()
fuel_sum = manor_expense_df[manor_expense_df['Account'] == 'Fuel']['Amount'].sum()
other_sum = manor_expense_df[manor_expense_df['Account'] == 'Other']['Amount'].sum()

# Total Cost is the sum of all expenses across all account types
total_cost = manor_expense_df['Amount'].sum()

# Total Income provided as float
total_income = 828476.88

# Calculate Profit Margin: (Total Income - Total Cost) / Total Income
profit_margin = (total_income - total_cost) / total_income

# Sum of Vendor Credits (as negative numbers)
vendor_credits_sum = manor_expense_df[
    manor_expense_df['Transaction type'] == 'Vendor Credit']['Amount'].sum()

# Create the first row as a dictionary
first_row = {
    "Job Name": "The Manor",
    "Materials": materials_sum,
    "Labor": labor_sum,
    "Equipment Rental": equipment_rental_sum,
    "Subcontractor Expense": subcontractor_expense_sum,
    "Fuel": fuel_sum,
    "Other": other_sum,
    "Total Cost": total_cost,
    "Total Income": total_income,
    "Profit Margin": profit_margin,
    "Vendor Credits": vendor_credits_sum
}

# Create a DataFrame from the first row
first_row_df = pd.DataFrame([first_row])

# === Insert the following section here ===
# Drop all-NaN columns from both DataFrames to prevent future warnings
first_row_df = first_row_df.dropna(axis=1, how='all')
NE_jobs_df = NE_jobs_df.dropna(axis=1, how='all')

# Use pd.concat to add the row to NE_jobs_df
NE_jobs_df = pd.concat([NE_jobs_df, first_row_df], ignore_index=True)

# Round the numeric values to two decimal places
NE_jobs_df = NE_jobs_df.round(2)

# Display the final NE_jobs_df
NE_jobs_df


Unnamed: 0,Job Name,Materials,Labor,Equipment Rental,Subcontractor Expense,Fuel,Other,Total Cost,Total Income,Profit Margin,Vendor Credits
0,The Manor,246049.05,199060.77,4181.55,47900.54,6010.92,26252.18,529455.01,828476.88,0.36,-30620.53
1,The Manor,246049.05,199060.77,4181.55,47900.54,6010.92,26252.18,529455.01,828476.88,0.36,-30620.53


In [24]:
# Remove duplicate rows from NE_jobs_df
NE_jobs_df = NE_jobs_df.drop_duplicates(ignore_index=True)

# Display the dataframe to confirm the removal of duplicates
NE_jobs_df


Unnamed: 0,Job Name,Materials,Labor,Equipment Rental,Subcontractor Expense,Fuel,Other,Total Cost,Total Income,Profit Margin,Vendor Credits
0,The Manor,246049.05,199060.77,4181.55,47900.54,6010.92,26252.18,529455.01,828476.88,0.36,-30620.53


In [25]:
# Save the NE_jobs_df to a CSV file named 'NEjobs.csv'
csv_path = "NEjobs.csv"
NE_jobs_df.to_csv(csv_path, index=False)

# Confirm the operation
csv_path


'NEjobs.csv'

In [27]:
manor_expense_df.to_excel('ManorExpensesFinal.xlsx', index=False)