In [None]:
# Load all sheets from the Excel file to examine and clean them.
# NOTE(review): file_path is not assigned in any earlier visible cell (a later
# cell sets it) — confirm the intended execution order.
all_sheets = pd.read_excel(file_path, sheet_name=None)  # None loads all sheets

# Dictionary to store cleaned data from all sheets, filled by the processing loops below
cleaned_sheets = {}

# Function to clean each sheet similar to previous methods
def clean_sheet(data):
    """Collapse multi-row sheet entries into one record per entry.

    A row with a non-missing 'State' starts a new entry. Following rows
    with a missing 'State' are continuation rows: their 'Location' and
    'Description of Operation' text is joined onto the current entry with
    single spaces.

    Args:
        data: DataFrame with the columns 'State', 'DC', 'Location',
            'Square Feet', 'Year' and 'Description of Operation'.

    Returns:
        DataFrame with the columns 'State', 'DC Number', 'Location',
        'Square Feet', 'Year Opened' and 'Description of Operation',
        one row per entry.

    Raises:
        KeyError: If one of the expected input columns is missing.
    """
    columns = ['State', 'DC Number', 'Location', 'Square Feet',
               'Year Opened', 'Description of Operation']
    text_fields = ('Location', 'Description of Operation')
    entries = []
    current = None

    for _, row in data.iterrows():
        if pd.notna(row['State']):  # Start of a new entry
            # Keep every field of an entry in one record so the output
            # columns always have equal length, even when 'Location' is empty.
            current = {
                'State': row['State'],
                'DC Number': row['DC'],
                'Location': [],
                'Square Feet': row['Square Feet'] if pd.notna(row['Square Feet']) else None,
                'Year Opened': row['Year'] if pd.notna(row['Year']) else None,
                'Description of Operation': [],
            }
            entries.append(current)
        elif current is None:
            # Continuation row before any entry has started: nothing to attach it to.
            continue

        # Collect text fragments from the starting row and from continuation rows alike.
        for field in text_fields:
            if pd.notna(row[field]):
                current[field].append(str(row[field]))

    return pd.DataFrame({
        column: [
            ' '.join(entry[column]) if column in text_fields else entry[column]
            for entry in entries
        ]
        for column in columns
    })

# Process each sheet: run clean_sheet on every raw sheet and store the result
# under the same sheet name.
# NOTE(review): the sheets are passed without renaming their columns; clean_sheet
# reads row['DC'] and row['Year'], so a sheet using other header names would raise
# KeyError here — verify against the actual headers.
for sheet_name, data in all_sheets.items():
    cleaned_sheets[sheet_name] = clean_sheet(data)

# Now, cleaned_sheets contains all the cleaned data indexed by sheet names
len(cleaned_sheets)  # Display the count of cleaned sheets to ensure all are processed


In [None]:
# Collect every sheet's header list so naming differences between sheets are visible
column_headers = dict(
    zip(all_sheets.keys(), (frame.columns.tolist() for frame in all_sheets.values()))
)
column_headers


In [None]:
# Standardize column names and proceed with data cleaning for each sheet
def standardize_columns(data):
    """Return a new DataFrame whose variant column headers use the common names.

    Args:
        data: DataFrame for one sheet.

    Returns:
        DataFrame with 'DC Number' renamed to 'DC', 'Year Opened' renamed to
        'Year' and 'Unnamed: 6' renamed to 'Extra'; other columns are unchanged.
    """
    header_aliases = {
        'DC Number': 'DC',
        'Year Opened': 'Year',
        'Unnamed: 6': 'Extra',  # extra unnamed column that appears in some sheets
    }
    return data.rename(columns=header_aliases)

# Re-process each sheet with standardized columns: headers are renamed first so
# clean_sheet finds the 'DC' and 'Year' columns it reads, then the result
# overwrites this sheet's entry in cleaned_sheets.
for sheet_name, data in all_sheets.items():
    standardized_data = standardize_columns(data)
    cleaned_sheets[sheet_name] = clean_sheet(standardized_data)

# Now, cleaned_sheets contains all the cleaned data indexed by sheet names
len(cleaned_sheets)  # Display the count of cleaned sheets to ensure all are processed


In [None]:
# Refine the cleaning process to handle incomplete or improperly accumulated data

def refined_clean_sheet(data):
    """Collapse multi-row sheet entries into one record per entry.

    A row with a non-missing 'State' starts a new entry. Following rows
    with a missing 'State' are continuation rows: their 'Location' and
    'Description of Operation' text is joined onto the current entry with
    single spaces.

    Args:
        data: DataFrame with the columns 'State', 'DC', 'Location',
            'Square Feet', 'Year' and 'Description of Operation'.

    Returns:
        DataFrame with those same six columns, one row per entry.

    Raises:
        KeyError: If one of the expected input columns is missing.
    """
    columns = ['State', 'DC', 'Location', 'Square Feet', 'Year',
               'Description of Operation']
    text_fields = ('Location', 'Description of Operation')
    entries = []
    current = None

    for _, row in data.iterrows():
        if pd.notna(row['State']):  # Start of a new entry
            # All fields of an entry are stored together so that an entry
            # with an empty 'Location' cannot leave the columns with
            # different lengths.
            current = {
                'State': row['State'],
                'DC': row['DC'],
                'Location': [],
                'Square Feet': row['Square Feet'] if pd.notna(row['Square Feet']) else None,
                'Year': row['Year'] if pd.notna(row['Year']) else None,
                'Description of Operation': [],
            }
            entries.append(current)
        elif current is None:
            # Continuation row before any entry has started: nothing to attach it to.
            continue

        # Collect text fragments from the starting row and from continuation rows alike.
        for field in text_fields:
            if pd.notna(row[field]):
                current[field].append(str(row[field]))

    return pd.DataFrame({
        column: [
            ' '.join(entry[column]) if column in text_fields else entry[column]
            for entry in entries
        ]
        for column in columns
    })

# Re-process each sheet with refined cleaning process: rename headers, clean,
# and overwrite this sheet's entry in cleaned_sheets.
for sheet_name, data in all_sheets.items():
    standardized_data = standardize_columns(data)
    cleaned_sheets[sheet_name] = refined_clean_sheet(standardized_data)

# Check the count of cleaned sheets to ensure all are processed
len(cleaned_sheets)


In [None]:
# Implementing a more robust check to ensure all fields are populated in each row

def fully_refined_clean_sheet(data):
    """Collapse multi-row sheet entries into one record per entry.

    Every row with a non-missing 'State' starts a new entry, including
    when it directly follows another such row. Rows with a missing
    'State' are continuation rows: their 'Location' and 'Description of
    Operation' text is joined onto the current entry with single spaces,
    and a non-missing 'Year' or 'Square Feet' replaces the entry's value.

    Args:
        data: DataFrame with the columns 'State', 'DC', 'Location',
            'Square Feet', 'Year' and 'Description of Operation'.

    Returns:
        DataFrame with those same six columns, one row per entry.

    Raises:
        KeyError: If one of the expected input columns is missing.
    """
    columns = ['State', 'DC', 'Location', 'Square Feet', 'Year',
               'Description of Operation']
    text_fields = ('Location', 'Description of Operation')
    entries = []
    current = None

    for _, row in data.iterrows():
        if pd.notna(row['State']):  # Start of a new entry
            current = {
                'State': row['State'],
                'DC': row['DC'],
                'Location': [],
                'Square Feet': row['Square Feet'] if pd.notna(row['Square Feet']) else None,
                'Year': row['Year'] if pd.notna(row['Year']) else None,
                'Description of Operation': [],
            }
            entries.append(current)
        elif current is None:
            # Continuation row before any entry has started: nothing to attach it to.
            continue
        else:
            # Continuation rows may carry the year / area that the starting row lacked.
            for field in ('Year', 'Square Feet'):
                if pd.notna(row[field]):
                    current[field] = row[field]

        # Collect text fragments from the starting row and from continuation rows alike.
        for field in text_fields:
            if pd.notna(row[field]):
                current[field].append(str(row[field]))

    return pd.DataFrame({
        column: [
            ' '.join(entry[column]) if column in text_fields else entry[column]
            for entry in entries
        ]
        for column in columns
    })

# Re-process each sheet with fully refined cleaning process: rename headers,
# clean, and overwrite this sheet's entry in cleaned_sheets.
for sheet_name, data in all_sheets.items():
    standardized_data = standardize_columns(data)
    cleaned_sheets[sheet_name] = fully_refined_clean_sheet(standardized_data)

# Check the count of cleaned sheets to ensure all are processed
len(cleaned_sheets)


In [None]:
# Simplified cleaning approach to extract only necessary data for summary statistics
def simplified_clean_sheet(data):
    """Extract the year and square footage of every fully populated row.

    A row is kept only when 'State', 'Square Feet' and 'Year' are all
    present. The year is the text after the last comma of the 'Year'
    cell, with surrounding whitespace removed.

    Args:
        data: DataFrame with 'State', 'Square Feet' and 'Year' columns.

    Returns:
        DataFrame with the columns 'Year' (strings) and 'Square Feet'.
    """
    years = []
    areas = []

    for _, record in data.iterrows():
        # Skip rows that lack any of the three essential values.
        if pd.isna(record['State']) or pd.isna(record['Square Feet']) or pd.isna(record['Year']):
            continue
        *_, last_piece = str(record['Year']).split(",")
        years.append(last_piece.strip())
        areas.append(record['Square Feet'])

    return pd.DataFrame({'Year': years, 'Square Feet': areas})

# Applying the simplified cleaning method: one two-column DataFrame
# ('Year', 'Square Feet') per sheet, keyed by sheet name.
simplified_cleaned_data = {}
for sheet_name, data in all_sheets.items():
    standardized_data = standardize_columns(data)
    simplified_cleaned_data[sheet_name] = simplified_clean_sheet(standardized_data)

# Check the number of entries processed to ensure data was collected
# (maps each sheet name to the number of rows that passed the completeness check).
{sheet: len(df) for sheet, df in simplified_cleaned_data.items()}


## Second GPT

In [None]:
import pandas as pd

# Load all sheets from the uploaded Excel file.
# sheet_name=None loads every sheet; the result is iterated below with .items()
# as (sheet name, DataFrame) pairs.
file_path = '/mnt/data/Distribution Centers.xlsx'
all_sheets = pd.read_excel(file_path, sheet_name=None)

def clean_sheet(data):
    """Merge continuation rows into one record per entry.

    A row whose 'State' cell is present starts a new entry. Rows whose
    'State' cell is missing are continuation rows: their 'Location' and
    'Description of Operation' text is joined onto the current entry with
    single spaces. Missing text cells are skipped, so neither the literal
    'nan' nor stray spaces end up in the output.

    Args:
        data: DataFrame with a 'State' column. The remaining output
            columns are read with ``row.get`` and may be absent.

    Returns:
        DataFrame with the columns 'State', 'DC Number', 'Location',
        'Square Feet', 'Year Opened' and 'Description of Operation',
        one row per entry.
    """
    columns = ['State', 'DC Number', 'Location', 'Square Feet',
               'Year Opened', 'Description of Operation']
    text_fields = ('Location', 'Description of Operation')
    entries = []
    current = None

    for _, row in data.iterrows():
        if pd.notna(row['State']):  # Indicates the start of a new entry
            current = {
                'State': row.get('State', ''),
                'DC Number': row.get('DC Number', ''),
                'Location': [],
                'Square Feet': row.get('Square Feet', None),
                'Year Opened': row.get('Year Opened', None),
                'Description of Operation': [],
            }
            entries.append(current)
        elif current is None:
            # Continuation row before any entry has started: nothing to attach it to.
            continue

        # Text fragments are gathered as a list and joined once at the end;
        # this also avoids adding a float NaN to a string.
        for field in text_fields:
            fragment = row.get(field)
            if pd.notna(fragment):
                current[field].append(str(fragment))

    return pd.DataFrame({
        column: [
            ' '.join(entry[column]) if column in text_fields else entry[column]
            for entry in entries
        ]
        for column in columns
    })

def standardize_columns(data):
    """Return a new DataFrame with known header variants mapped to standard names.

    Args:
        data: DataFrame for one sheet.

    Returns:
        DataFrame in which 'DC' is renamed to 'DC Number' and 'Year' to
        'Year Opened'; every other column keeps its name.
    """
    standard_names = {
        'DC': 'DC Number',
        'Year': 'Year Opened',
        # Add all known variations here
    }

    def canonical(header):
        # Headers without a known variant map to themselves.
        return standard_names.get(header, header)

    return data.rename(columns=canonical)

# Cleaning all sheets: start from an empty dict so results from earlier runs are discarded.
cleaned_sheets = {}
for sheet_name, data in all_sheets.items():
    data = standardize_columns(data)  # Standardize columns first
    cleaned_sheets[sheet_name] = clean_sheet(data)  # Clean data

# Display the count of cleaned sheets and a preview to ensure all are processed
print("Total sheets cleaned:", len(cleaned_sheets))
for name, sheet in cleaned_sheets.items():
    print(f"Preview of {name}:")
    print(sheet.head())  # first rows of the cleaned sheet


In [None]:
# Updated and corrected version of the script to handle the mismatch in column names and prevent KeyError

def clean_sheet(data):
    """Merge continuation rows into one record per entry.

    A row whose 'State' cell is present starts a new entry. Rows whose
    'State' cell is missing are continuation rows: their 'Location' and
    'Description of Operation' text is joined onto the current entry with
    single spaces. Missing text cells are skipped, so neither the literal
    'nan' nor stray spaces end up in the output.

    Args:
        data: DataFrame with a 'State' column. The remaining output
            columns are read with ``row.get`` and may be absent.

    Returns:
        DataFrame with the columns 'State', 'DC Number', 'Location',
        'Square Feet', 'Year Opened' and 'Description of Operation',
        one row per entry.
    """
    columns = ['State', 'DC Number', 'Location', 'Square Feet',
               'Year Opened', 'Description of Operation']
    text_fields = ('Location', 'Description of Operation')
    entries = []
    current = None

    for _, row in data.iterrows():
        if pd.notna(row['State']):  # Indicates the start of a new entry
            current = {
                'State': row.get('State', ''),
                'DC Number': row.get('DC Number', ''),
                'Location': [],
                'Square Feet': row.get('Square Feet', None),
                'Year Opened': row.get('Year Opened', None),
                'Description of Operation': [],
            }
            entries.append(current)
        elif current is None:
            # Continuation row before any entry has started: nothing to attach it to.
            continue

        # Text fragments are gathered as a list and joined once at the end;
        # this also avoids adding a float NaN to a string.
        for field in text_fields:
            fragment = row.get(field)
            if pd.notna(fragment):
                current[field].append(str(fragment))

    return pd.DataFrame({
        column: [
            ' '.join(entry[column]) if column in text_fields else entry[column]
            for entry in entries
        ]
        for column in columns
    })

def standardize_columns(data):
    """Return a new DataFrame with known header variants mapped to standard names.

    Args:
        data: DataFrame for one sheet.

    Returns:
        DataFrame in which 'DC' is renamed to 'DC Number' and 'Year' to
        'Year Opened'; every other column keeps its name.
    """
    standard_names = {
        'DC': 'DC Number',
        'Year': 'Year Opened',
        # Ensure all variations are covered
    }

    def canonical(header):
        # Headers without a known variant map to themselves.
        return standard_names.get(header, header)

    return data.rename(columns=canonical)

# Cleaning all sheets: start from an empty dict so results from earlier runs are discarded.
cleaned_sheets = {}
for sheet_name, data in all_sheets.items():
    data = standardize_columns(data)  # Standardize columns first
    cleaned_sheets[sheet_name] = clean_sheet(data)  # Clean data

# Display the count of cleaned sheets and a preview to ensure all are processed
# (tuple of the sheet count and a dict mapping each sheet name to its first rows).
len(cleaned_sheets), {sheet_name: sheet_data.head() for sheet_name, sheet_data in cleaned_sheets.items()}


In [None]:
# Continuation-row update that appends an empty string instead of the text 'nan'
# when the cell is missing.
# NOTE(review): temp_storage and row are only assigned inside clean_sheet's loop in
# the visible cells, so this looks like a fragment meant for that loop rather than
# a cell to run on its own — confirm.
temp_storage['Location'] = temp_storage.get('Location', '') + ' ' + str(row.get('Location', '') if pd.notna(row['Location']) else '')
temp_storage['Description of Operation'] = temp_storage.get('Description of Operation', '') + ' ' + str(row.get('Description of Operation', '') if pd.notna(row['Description of Operation']) else '')


In [None]:
# Updated version of the script to handle concatenation of strings and floats correctly

def clean_sheet(data):
    """Merge continuation rows into one record per entry.

    A row whose 'State' cell is present starts a new entry. Rows whose
    'State' cell is missing are continuation rows: their 'Location' and
    'Description of Operation' text is joined onto the current entry with
    single spaces. Missing text cells are skipped, so neither the literal
    'nan' nor stray spaces end up in the output.

    Args:
        data: DataFrame with a 'State' column. The remaining output
            columns are read with ``row.get`` and may be absent.

    Returns:
        DataFrame with the columns 'State', 'DC Number', 'Location',
        'Square Feet', 'Year Opened' and 'Description of Operation',
        one row per entry.
    """
    columns = ['State', 'DC Number', 'Location', 'Square Feet',
               'Year Opened', 'Description of Operation']
    text_fields = ('Location', 'Description of Operation')
    entries = []
    current = None

    for _, row in data.iterrows():
        if pd.notna(row['State']):  # Indicates the start of a new entry
            current = {
                'State': row.get('State', ''),
                'DC Number': row.get('DC Number', ''),
                'Location': [],
                'Square Feet': row.get('Square Feet', None),
                'Year Opened': row.get('Year Opened', None),
                'Description of Operation': [],
            }
            entries.append(current)
        elif current is None:
            # Continuation row before any entry has started: nothing to attach it to.
            continue

        # Text fragments are gathered as a list and joined once at the end, so a
        # missing starting cell (a float NaN) is never concatenated with a string.
        for field in text_fields:
            fragment = row.get(field)
            if pd.notna(fragment):
                current[field].append(str(fragment))

    return pd.DataFrame({
        column: [
            ' '.join(entry[column]) if column in text_fields else entry[column]
            for entry in entries
        ]
        for column in columns
    })

# Cleaning all sheets: start from an empty dict so results from earlier runs are discarded.
# standardize_columns is not redefined in this cell; the definition from an earlier cell is used.
cleaned_sheets = {}
for sheet_name, data in all_sheets.items():
    data = standardize_columns(data)  # Standardize columns first
    cleaned_sheets[sheet_name] = clean_sheet(data)  # Clean data

# Display the count of cleaned sheets and a preview to ensure all are processed
# (tuple of the sheet count and a dict mapping each sheet name to its first rows).
len(cleaned_sheets), {sheet_name: sheet_data.head() for sheet_name, sheet_data in cleaned_sheets.items()}


In [None]:
 temp_storage['Location'] = temp_storage.get('Location', '') + ' ' + (str(row['Location']) if pd.notna(row['Location']) else '')
temp_storage['Description of Operation'] = temp_storage.get('Description of Operation', '') + ' ' + (str(row['Description of Operation']) if pd.notna(row['Description of Operation']) else '')


In [None]:
def clean_sheet(data):
    """Merge continuation rows into one record per entry.

    A row whose 'State' cell is present starts a new entry. Rows whose
    'State' cell is missing are continuation rows: their 'Location' and
    'Description of Operation' text is joined onto the current entry with
    single spaces. Missing cells contribute nothing, and continuation
    rows that appear before the first entry are ignored.

    Args:
        data: DataFrame with the columns 'State', 'DC Number', 'Location',
            'Square Feet', 'Year Opened' and 'Description of Operation'.

    Returns:
        DataFrame with those same six columns, one row per entry. 'State'
        and 'DC Number' are strings ('' for a missing DC number);
        'Square Feet' and 'Year Opened' keep their cell values or None.

    Raises:
        KeyError: If one of the expected input columns is missing.
    """
    columns = ['State', 'DC Number', 'Location', 'Square Feet',
               'Year Opened', 'Description of Operation']
    text_fields = ('Location', 'Description of Operation')
    entries = []
    current = None

    for _, row in data.iterrows():
        if pd.notna(row['State']):  # Indicates the start of a new entry
            dc_number = row['DC Number']
            current = {
                'State': str(row['State']),
                # A missing DC number becomes '' rather than the text 'nan'.
                'DC Number': str(dc_number) if pd.notna(dc_number) else '',
                'Location': [],
                'Square Feet': row['Square Feet'] if pd.notna(row['Square Feet']) else None,
                'Year Opened': row['Year Opened'] if pd.notna(row['Year Opened']) else None,
                'Description of Operation': [],
            }
            entries.append(current)
        elif current is None:
            # Continuation row before any entry has started: nothing to attach it to.
            continue

        # Text fragments are gathered as a list and joined once at the end,
        # so blank continuation cells leave no trailing spaces.
        for field in text_fields:
            if pd.notna(row[field]):
                current[field].append(str(row[field]))

    return pd.DataFrame({
        column: [
            ' '.join(entry[column]) if column in text_fields else entry[column]
            for entry in entries
        ]
        for column in columns
    })


In [None]:
# Implement the corrected clean_sheet function that handles data concatenation safely

def clean_sheet(data):
    """Merge continuation rows into one record per entry.

    A row whose 'State' cell is present starts a new entry. Rows whose
    'State' cell is missing are continuation rows: their 'Location' and
    'Description of Operation' text is joined onto the current entry with
    single spaces. Missing cells contribute nothing, and continuation
    rows that appear before the first entry are ignored.

    Args:
        data: DataFrame with the columns 'State', 'DC Number', 'Location',
            'Square Feet', 'Year Opened' and 'Description of Operation'.

    Returns:
        DataFrame with those same six columns, one row per entry. 'State'
        and 'DC Number' are strings ('' for a missing DC number);
        'Square Feet' and 'Year Opened' keep their cell values or None.

    Raises:
        KeyError: If one of the expected input columns is missing.
    """
    columns = ['State', 'DC Number', 'Location', 'Square Feet',
               'Year Opened', 'Description of Operation']
    text_fields = ('Location', 'Description of Operation')
    entries = []
    current = None

    for _, row in data.iterrows():
        if pd.notna(row['State']):  # Indicates the start of a new entry
            dc_number = row['DC Number']
            current = {
                'State': str(row['State']),
                # A missing DC number becomes '' rather than the text 'nan'.
                'DC Number': str(dc_number) if pd.notna(dc_number) else '',
                'Location': [],
                'Square Feet': row['Square Feet'] if pd.notna(row['Square Feet']) else None,
                'Year Opened': row['Year Opened'] if pd.notna(row['Year Opened']) else None,
                'Description of Operation': [],
            }
            entries.append(current)
        elif current is None:
            # Continuation row before any entry has started: nothing to attach it to.
            continue

        # Text fragments are gathered as a list and joined once at the end,
        # so blank continuation cells leave no trailing spaces.
        for field in text_fields:
            if pd.notna(row[field]):
                current[field].append(str(row[field]))

    return pd.DataFrame({
        column: [
            ' '.join(entry[column]) if column in text_fields else entry[column]
            for entry in entries
        ]
        for column in columns
    })

# Cleaning all sheets: start from an empty dict so results from earlier runs are discarded.
# standardize_columns is not redefined in this cell; the definition from an earlier cell is used.
cleaned_sheets = {}
for sheet_name, data in all_sheets.items():
    data = standardize_columns(data)  # Standardize columns first
    cleaned_sheets[sheet_name] = clean_sheet(data)  # Clean data

# Display the count of cleaned sheets and a preview to ensure all are processed
# (tuple of the sheet count and a dict mapping each sheet name to its first rows).
len(cleaned_sheets), {sheet_name: sheet_data.head() for sheet_name, sheet_data in cleaned_sheets.items()}


In [None]:
def clean_sheet(data):
    """Merge continuation rows into one record per entry.

    A row whose 'State' cell is present starts a new entry. Rows whose
    'State' cell is missing are continuation rows: their 'Location' and
    'Description of Operation' text is joined onto the current entry with
    single spaces. Missing cells contribute nothing, and continuation
    rows that appear before the first entry are ignored instead of
    producing a record with an empty 'State'.

    Args:
        data: DataFrame with the columns 'State', 'DC Number', 'Location',
            'Square Feet', 'Year Opened' and 'Description of Operation'.

    Returns:
        DataFrame with those same six columns, one row per entry. 'State'
        and 'DC Number' are strings ('' for a missing DC number);
        'Square Feet' and 'Year Opened' keep their cell values or None.

    Raises:
        KeyError: If one of the expected input columns is missing.
    """
    columns = ['State', 'DC Number', 'Location', 'Square Feet',
               'Year Opened', 'Description of Operation']
    text_fields = ('Location', 'Description of Operation')
    entries = []
    current = None

    for _, row in data.iterrows():
        if pd.notna(row['State']):  # Indicates the start of a new entry
            dc_number = row['DC Number']
            current = {
                'State': str(row['State']),
                # A missing DC number becomes '' rather than the text 'nan'.
                'DC Number': str(dc_number) if pd.notna(dc_number) else '',
                'Location': [],
                'Square Feet': row['Square Feet'] if pd.notna(row['Square Feet']) else None,
                'Year Opened': row['Year Opened'] if pd.notna(row['Year Opened']) else None,
                'Description of Operation': [],
            }
            entries.append(current)
        elif current is None:
            # Continuation row before any entry has started: nothing to attach it to.
            continue

        # Text fragments are gathered as a list and joined once at the end,
        # so blank continuation cells leave no trailing spaces.
        for field in text_fields:
            if pd.notna(row[field]):
                current[field].append(str(row[field]))

    return pd.DataFrame({
        column: [
            ' '.join(entry[column]) if column in text_fields else entry[column]
            for entry in entries
        ]
        for column in columns
    })


In [None]:
# Revised clean_sheet function with proper initialization of temp_storage to prevent KeyError

def clean_sheet(data):
    """Merge continuation rows into one record per entry.

    A row whose 'State' cell is present starts a new entry. Rows whose
    'State' cell is missing are continuation rows: their 'Location' and
    'Description of Operation' text is joined onto the current entry with
    single spaces. Missing cells contribute nothing, and continuation
    rows that appear before the first entry are ignored instead of
    producing a record with an empty 'State'.

    Args:
        data: DataFrame with the columns 'State', 'DC Number', 'Location',
            'Square Feet', 'Year Opened' and 'Description of Operation'.

    Returns:
        DataFrame with those same six columns, one row per entry. 'State'
        and 'DC Number' are strings ('' for a missing DC number);
        'Square Feet' and 'Year Opened' keep their cell values or None.

    Raises:
        KeyError: If one of the expected input columns is missing.
    """
    columns = ['State', 'DC Number', 'Location', 'Square Feet',
               'Year Opened', 'Description of Operation']
    text_fields = ('Location', 'Description of Operation')
    entries = []
    current = None

    for _, row in data.iterrows():
        if pd.notna(row['State']):  # Indicates the start of a new entry
            dc_number = row['DC Number']
            current = {
                'State': str(row['State']),
                # A missing DC number becomes '' rather than the text 'nan'.
                'DC Number': str(dc_number) if pd.notna(dc_number) else '',
                'Location': [],
                'Square Feet': row['Square Feet'] if pd.notna(row['Square Feet']) else None,
                'Year Opened': row['Year Opened'] if pd.notna(row['Year Opened']) else None,
                'Description of Operation': [],
            }
            entries.append(current)
        elif current is None:
            # Continuation row before any entry has started: nothing to attach it to.
            continue

        # Text fragments are gathered as a list and joined once at the end,
        # so blank continuation cells leave no trailing spaces.
        for field in text_fields:
            if pd.notna(row[field]):
                current[field].append(str(row[field]))

    return pd.DataFrame({
        column: [
            ' '.join(entry[column]) if column in text_fields else entry[column]
            for entry in entries
        ]
        for column in columns
    })

# Cleaning all sheets again with the revised function; the dict is rebuilt from
# scratch so results from earlier runs are discarded.
cleaned_sheets = {}
for sheet_name, data in all_sheets.items():
    data = standardize_columns(data)  # Standardize columns first
    cleaned_sheets[sheet_name] = clean_sheet(data)  # Clean data

# Display the count of cleaned sheets and a preview to ensure all are processed
# (tuple of the sheet count and a dict mapping each sheet name to its first rows).
len(cleaned_sheets), {sheet_name: sheet_data.head() for sheet_name, sheet_data in cleaned_sheets.items()}


## Summary Stats

In [None]:
import re

# Function to extract year from various date formats in the 'Year Opened' field
def extract_year(date_str):
    """Return the first four-digit year (19xx or 20xx) found in a cell value.

    Args:
        date_str: Raw 'Year Opened' value; it is converted with ``str``
            before searching.

    Returns:
        The year as an int, or None when the value is missing or
        contains no standalone 19xx/20xx number.
    """
    if not pd.notna(date_str):
        return None

    year_pattern = re.compile(r'\b(19|20)\d{2}\b')
    hit = year_pattern.search(str(date_str))
    if hit is None:
        return None
    return int(hit.group())

# Function to derive type from 'Description of Operation'
def extract_type(description):
    """Classify an operation description by the first matching keyword.

    Keywords are checked in the order "Crossdock", "Import",
    "Distribution Center"; matching is case-sensitive.

    Args:
        description: 'Description of Operation' text. Non-string values
            (for example None or NaN from an empty cell) are accepted.

    Returns:
        The matching keyword, or "Other" when none matches or the value
        is not a string.
    """
    # A missing cell arrives as None/NaN; the substring test below would raise on it.
    if not isinstance(description, str):
        return "Other"

    for label in ("Crossdock", "Import", "Distribution Center"):
        if label in description:
            return label
    return "Other"

# Prepare data for aggregation: four parallel lists, one element per kept row.
aggregated_data = {
    'Year': [],
    'Type': [],
    'Square Feet': [],
    'Count': []  # This will count the number of entries for DCs
}

# Walk every row of every cleaned sheet; rows without an extractable year are skipped.
for sheet_name, data in cleaned_sheets.items():
    for index, row in data.iterrows():
        year = extract_year(row['Year Opened'])
        if year:  # Only process entries with a valid year
            type_ = extract_type(row['Description of Operation'])
            aggregated_data['Year'].append(year)
            aggregated_data['Type'].append(type_)
            # Missing square footage contributes 0; other values are stored as-is.
            aggregated_data['Square Feet'].append(row['Square Feet'] if pd.notna(row['Square Feet']) else 0)
            aggregated_data['Count'].append(1)  # Each row counts as one DC

# Convert aggregated data to DataFrame
aggregated_df = pd.DataFrame(aggregated_data)

# Summarize data: Total square feet and count of DCs by year and type
summary_stats = aggregated_df.groupby(['Year', 'Type']).agg({'Square Feet': 'sum', 'Count': 'sum'}).reset_index()
summary_stats.sort_values(by='Year', inplace=True)
summary_stats


In [None]:
# Ensure 'Square Feet' is treated as floats and handle NaN values properly before aggregation

# Convert 'Square Feet' to floats and replace NaN with 0 for aggregation.
# NOTE(review): float(x) is applied to each non-missing value unchanged, so a value
# float() cannot parse (e.g. text with thousands separators) would raise ValueError.
aggregated_df['Square Feet'] = aggregated_df['Square Feet'].apply(lambda x: float(x) if pd.notna(x) else 0.0)

# Summarize data: Total square feet and count of DCs by year and type
summary_stats = aggregated_df.groupby(['Year', 'Type']).agg({'Square Feet': 'sum', 'Count': 'sum'}).reset_index()
summary_stats.sort_values(by='Year', inplace=True)
summary_stats


In [None]:
# Clean 'Square Feet' by removing commas and converting to float; missing values become 0.0.
# NOTE(review): only commas are removed, so any other non-numeric text left in a
# value would still make float() raise ValueError.
aggregated_df['Square Feet'] = aggregated_df['Square Feet'].apply(lambda x: float(str(x).replace(',', '')) if pd.notna(x) else 0.0)

# Summarize data: Total square feet and count of DCs by year and type
summary_stats = aggregated_df.groupby(['Year', 'Type']).agg({'Square Feet': 'sum', 'Count': 'sum'}).reset_index()
summary_stats.sort_values(by='Year', inplace=True)
summary_stats


In [None]:
# Display unique values in 'Square Feet' that may be causing conversion problems
# (a set, so each distinct raw value is shown once).
unique_square_feet = set(aggregated_df['Square Feet'])
unique_square_feet


In [None]:
# Function to clean and convert square feet values to floats
def clean_square_feet(value):
    """Convert a raw 'Square Feet' cell to a float.

    Commas and spaces are removed, then the first run of digits/periods
    that contains at least one digit is parsed. If that run has several
    periods, only the last one is kept as the decimal point.

    Args:
        value: Raw cell value (number, string, or missing).

    Returns:
        The parsed number, or 0.0 when the value is missing or contains
        no digits.
    """
    if pd.isna(value):
        return 0.0

    text = str(value).replace(',', '').replace(' ', '')
    # Runs made only of periods (e.g. the '.' in 'N/A.') are not numbers; skip them
    # instead of handing them to float(), which would raise ValueError.
    candidates = [
        run for run in re.findall(r'[\d\.]+', text)
        if any(ch.isdigit() for ch in run)
    ]
    if not candidates:
        return 0.0

    # '1.2.3' is not a valid float literal: drop every period except the last one.
    whole, point, fraction = candidates[0].rpartition('.')
    return float(whole.replace('.', '') + point + fraction)

# Apply the cleaning function to the 'Square Feet' column (overwrites the column in place)
aggregated_df['Square Feet'] = aggregated_df['Square Feet'].apply(clean_square_feet)

# Summarize data: Total square feet and count of DCs by year and type
summary_stats = aggregated_df.groupby(['Year', 'Type']).agg({'Square Feet': 'sum', 'Count': 'sum'}).reset_index()
summary_stats.sort_values(by='Year', inplace=True)
summary_stats


In [None]:
def clean_square_feet(value):
    """Convert a raw 'Square Feet' cell to a float.

    Commas and spaces are removed first. A value that is then a plain
    number is parsed directly. Otherwise the first run of digits and
    periods is used, keeping only its last period as the decimal point,
    so text such as '1,200,000 sq ft' still yields its number.

    Args:
        value: Raw cell value (number, string, or missing).

    Returns:
        The parsed number, or 0.0 when the value is missing or contains
        no digits (which also covers the strings 'nan' and 'inf').
    """
    if pd.isna(value):
        return 0.0

    # Normalize the string by removing commas and extra spaces
    text = str(value).replace(',', '').replace(' ', '')
    numeric_runs = re.findall(r'\d[\d.]*', text)
    if not numeric_runs:
        # No digits at all; float() would otherwise accept 'inf'/'nan' and poison the sums.
        return 0.0

    try:
        return float(text)
    except ValueError:
        pass

    # Handle cases with multiple periods: drop every period except the last one
    whole, point, fraction = numeric_runs[0].rpartition('.')
    return float(whole.replace('.', '') + point + fraction)


In [None]:
# Updated function to clean and convert square feet values to floats, handling multiple periods
def clean_square_feet(value):
    """Convert a raw 'Square Feet' cell to a float.

    Commas and spaces are removed first. A value that is then a plain
    number is parsed directly. Otherwise the first run of digits and
    periods is used, keeping only its last period as the decimal point,
    so text such as '1,200,000 sq ft' still yields its number.

    Args:
        value: Raw cell value (number, string, or missing).

    Returns:
        The parsed number, or 0.0 when the value is missing or contains
        no digits (which also covers the strings 'nan' and 'inf').
    """
    if pd.isna(value):
        return 0.0

    # Normalize the string by removing commas and extra spaces
    text = str(value).replace(',', '').replace(' ', '')
    numeric_runs = re.findall(r'\d[\d.]*', text)
    if not numeric_runs:
        # No digits at all; float() would otherwise accept 'inf'/'nan' and poison the sums.
        return 0.0

    try:
        return float(text)
    except ValueError:
        pass

    # Handle cases with multiple periods: drop every period except the last one
    whole, point, fraction = numeric_runs[0].rpartition('.')
    return float(whole.replace('.', '') + point + fraction)

# Apply the cleaning function to the 'Square Feet' column (overwrites the column in place)
aggregated_df['Square Feet'] = aggregated_df['Square Feet'].apply(clean_square_feet)

# Summarize data: Total square feet and count of DCs by year and type
summary_stats = aggregated_df.groupby(['Year', 'Type']).agg({'Square Feet': 'sum', 'Count': 'sum'}).reset_index()
summary_stats.sort_values(by='Year', inplace=True)
summary_stats


In [None]:
# Number of distribution centers represented in the summary table
total_centers_from_summary = summary_stats.loc[:, 'Count'].sum()

# Number of rows across every cleaned sheet
total_centers_from_sheets = sum(len(sheet) for sheet in cleaned_sheets.values())

total_centers_from_summary, total_centers_from_sheets
