# Group Project 4 - Team T.W. Schultz


### Team Members: Neha Lala (*Team Manager*), Gavin Armstrong, Dojun Kim, Samantha Wang, Daniela Salinas Gomez

## **1.3 [A] Unit Test:**

In [1]:
import pandas as pd

In [2]:
def read_sheets(key, sheet=None):
    """Simulated sheet reader: build a small canned DataFrame chosen by ``sheet``.

    ``key`` is accepted but never read. Each recognised ``sheet`` name
    produces a frame with one structural quirk for the cleaning code to
    handle; any other value (including the default ``None``) produces an
    empty DataFrame.
    """
    if sheet == 'MultiIndex':
        # Two-level column header.
        header = pd.MultiIndex.from_tuples([('A', 'B'), ('A', 'C')])
        return pd.DataFrame([[1, 2], [3, 4]], columns=header)

    if sheet == 'Duplicates':
        # The label 'col' appears twice.
        labels = ['col', 'col', 'col2']
        return pd.DataFrame([[1, 2, 3]], columns=labels)

    if sheet == 'Unnamed':
        # First column carries an 'Unnamed: ...' label.
        labels = ['Unnamed: 0', 'col2']
        return pd.DataFrame([[1, 2]], columns=labels)

    if sheet == 'DuplicateRows':
        # Two rows with identical values.
        records = [[1, 2], [1, 2]]
        return pd.DataFrame(records, columns=['col1', 'col2'])

    return pd.DataFrame()

def get_clean_sheet(key, sheet=None):
    """Read a sheet with ``read_sheets`` and return it with tidied labels.

    ``key`` and ``sheet`` are forwarded unchanged to ``read_sheets``.

    Cleaning steps, in order:
      1. MultiIndex column labels are flattened by joining their levels
         with ``_``; plain string labels have surrounding whitespace
         stripped. Non-string labels (e.g. integers) are left as they are.
      2. Where a column label occurs more than once, only the first
         occurrence is kept.
      3. Columns whose (string) label starts with ``'Unnamed'`` are dropped.
      4. Where an index label occurs more than once, only the first row
         carrying it is kept.
    """
    df = read_sheets(key, sheet=sheet)

    # ``.strip()`` only exists on strings: tuple labels (MultiIndex) and
    # numeric labels would raise AttributeError if stripped blindly.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = ['_'.join(map(str, col)).strip() for col in df.columns.values]
    else:
        df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns.tolist()]

    df = df.loc[:, ~df.columns.duplicated(keep='first')]

    unnamed = [col for col in df.columns if isinstance(col, str) and col.startswith('Unnamed')]
    df = df.drop(unnamed, axis=1)

    # Deduplicates on the index labels, not on the row contents.
    df = df.loc[~df.index.duplicated(), :]
    return df

# Duplicated column labels: 'col' and 'col2' must both be present, and
# exactly two columns may remain.
cleaned_df = get_clean_sheet('dummy_key', 'Duplicates')
assert all(name in cleaned_df.columns for name in ('col', 'col2')), "Error: Duplicated columns were not removed correctly"
assert cleaned_df.shape[1] == 2, "Error: Incorrect number of columns after removing duplicates"

# 'Unnamed' columns: 'col2' must stay, 'Unnamed: 0' must be gone.
cleaned_df = get_clean_sheet('dummy_key', 'Unnamed')
assert 'col2' in cleaned_df.columns, "Error: 'Unnamed' columns were not dropped"
assert 'Unnamed: 0' not in cleaned_df.columns, "Error: 'Unnamed' columns were not dropped"

print("All tests passed!")

All tests passed!


In [3]:
def clean_multiindex_df(df):
    """Return a cleaned copy of ``df``; the caller's frame is not modified.

    Cleaning steps, in order:
      1. MultiIndex column labels are flattened by joining their levels
         with ``_``; plain string labels have surrounding whitespace
         stripped. Non-string labels (e.g. integers) are left as they are.
      2. Where a column label occurs more than once, only the first
         occurrence is kept.
      3. Columns whose (string) label starts with ``'Unnamed'`` are dropped.
      4. Where an index label occurs more than once, only the first row
         carrying it is kept.
    """
    # Work on a copy: assigning to ``df.columns`` below would otherwise
    # relabel the caller's DataFrame as a side effect.
    df = df.copy()

    # If the columns are a MultiIndex, join them with an underscore,
    # otherwise just strip them. Only strings have ``.strip()``, so other
    # label types pass through untouched instead of raising.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = ['_'.join(map(str, col)).strip() for col in df.columns.values]
    else:
        df.columns = [col.strip() if isinstance(col, str) else col for col in df.columns.tolist()]

    # Remove duplicated columns
    df = df.loc[:, ~df.columns.duplicated(keep='first')]

    # Drop columns that start with 'Unnamed'
    unnamed = [col for col in df.columns if isinstance(col, str) and col.startswith('Unnamed')]
    df = df.drop(unnamed, axis=1)

    # Remove rows whose index label is duplicated (this compares index
    # labels, not row contents)
    df = df.loc[~df.index.duplicated(), :]

    return df

# MultiIndex header: the two levels must come back joined with '_'
multi_header = pd.MultiIndex.from_tuples([('A', 'B'), ('A', 'C')])
df = pd.DataFrame([[1, 2], [3, 4]], columns=multi_header)
cleaned_df = clean_multiindex_df(df)
assert list(cleaned_df.columns) == ['A_B', 'A_C'], "Error: MultiIndex columns were not handled correctly"
del multi_header

# Duplicated column labels: 'col' and 'col2' must both be present, and
# exactly two columns may remain
df = pd.DataFrame([[1, 2, 3]], columns=['col', 'col', 'col2'])
cleaned_df = clean_multiindex_df(df)
assert all(name in cleaned_df.columns for name in ('col', 'col2')), "Error: Duplicated columns were not removed correctly"
assert cleaned_df.shape[1] == 2, "Error: Incorrect number of columns after removing duplicates"

# 'Unnamed' columns: 'col2' must stay, 'Unnamed: 0' must be gone
df = pd.DataFrame([[1, 2]], columns=['Unnamed: 0', 'col2'])
cleaned_df = clean_multiindex_df(df)
assert 'col2' in cleaned_df.columns, "Error: 'Unnamed' columns were not dropped"
assert 'Unnamed: 0' not in cleaned_df.columns, "Error: 'Unnamed' columns were not dropped"

print("All tests passed!")


All tests passed!
