In [1]:
import pandas as pd
import os
import numpy as np
import warnings

warnings.simplefilter(action='ignore', category=UserWarning)
print(os.getcwd())

## CHANGE THIS TO THE PATH OF THE BUILD DOCUMENTATION FILE
src = 'data'
dst = 'clean_data'
f = os.path.join(src, 'BuildDocumentation.xlsx')

/Users/nikkivanhandel/CS6400/mfgdb/CS6400-MfgDB/0_preprocessing


In [2]:
xl = pd.ExcelFile(f)
sheets = xl.sheet_names  # see all sheet names

# Parse the sheet into a nice row 
def sheet_to_row(df,sheet):
    g = 'General Information' # columns with trait names
    # input: preparsed dataframe
    # output: pandas dataframe with traits as columns
    info = df.iloc[:, 0:2]
    info.dropna(axis=0, subset=g, inplace=True)
    row = info.set_index(g).transpose()
    row.loc[:,'Build ID:'] = sheet # Reassign BuildID 
    row.reset_index(drop=True, inplace=True)
    
    # Find where build description is (column changes )
    notecol = np.where(df.iloc[0, :].str.contains('Build Description')==True)[0][0]
    buildinfo = df.iloc[:, notecol:notecol+2].dropna().transpose()
    buildinfo.columns = buildinfo.iloc[0]
    buildinfo = buildinfo.drop(buildinfo.index[0]).reset_index(drop=True)
    if not buildinfo.empty: # If notes, add notes
        row = pd.concat([row, buildinfo], axis=1)
    return row


master = sheet_to_row(xl.parse(sheets[2]), sheets[2])

notetypes = set()

for sheet in sheets[3:]:
    df = xl.parse(sheet)
    row = sheet_to_row(df, sheet)

    master = pd.merge(master, row, how='outer')

    
master.set_index('Build ID:', inplace=True)

# Wrangle datatypes
master = master.infer_objects()

# Remove Empty Columns
for col in master.columns:
    if (~master[col].isna()).sum() < 3: # Remove columns with few observations
        print('Removed ', col)
        master.drop(col, axis=1, inplace=True, errors='ignore')


Removed  Details
Removed  Cost:
Removed  Heat Treat
Removed  Translation Discrepancy
Removed  BP Corner
Removed  Part Corner
Removed  Diff (mm)
Removed  Real (mm)
Removed  9.379999999999995
Removed  2.8424242424242414


In [3]:

def multiremove(string, replaces):
    for word in replaces:
        string = string.replace(word, '')
    return string
    
# Programmatically refactor column names 

cols = [multiremove(col, ['-',':', '?', '_', '&']) for col in master.columns]
for i, col in enumerate(cols):
    if '(' in col:
        p = col[(col.find('(')):(col.find(')')+1)]
        if len(p)==3:
            cols[i]= multiremove(col, ['(', ')']) # For XY
        else:
            cols[i]=col.replace(p, '')
cols = [c.strip().lower().replace(' ', '_') for c in cols]
master.columns=cols

master.successful = master.successful.str.lower().str.contains('yes')
master.info()

<class 'pandas.core.frame.DataFrame'>
Index: 72 entries, B001 to B93
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   nickname                        72 non-null     object        
 1   operator                        72 non-null     object        
 2   date_printed                    70 non-null     datetime64[ns]
 3   customer                        52 non-null     object        
 4   build_plate_type                72 non-null     object        
 5   build_plate_id                  37 non-null     object        
 6   material_type                   70 non-null     object        
 7   file_location                   67 non-null     object        
 8   parameter_file_name             67 non-null     object        
 9   successful                      30 non-null     object        
 10  total_part_volume               35 non-null     object        
 11  print_tim

### Observations
 - Many builds are the same part/project - linked by title
 -  <del>Operators should be comma seperated for consistency  </del>
 - Customer looks fine, but some names are common 
 - <del> BuildPlateType has redundancy. </del>
 - <del>TotalPartVolume, PrintType, GasFlowVoltage must be forced to numeric</del>
 -  <del>Some overlap with powder_type and material  </del>
 - Parameter file names are usually not given
 - Successful Needs consistency 
 -  <del>Heat treat is useless lol  </del>

In [4]:
# Fix formats
for numCol in ['total_part_volume', 'print_time', 'gas_flow_voltage']:
    master.loc[:, numCol] = pd.to_numeric(master[numCol], errors='coerce')

#Make operators comma-seperated
master.operator = master.operator.str.replace('/', ',')

br, ar = 'Before Replacement: ', 'After Replacement: '

print('BuildPlateTypes')
print(br, master.build_plate_type.unique())
BP_replace = master.build_plate_type.str.contains('Full')
master.loc[BP_replace, 'build_plate_type'] = 'Full'
print(ar, master.build_plate_type.unique())

BuildPlateTypes
Before Replacement:  ['Full Plate' 'RBV' 'Mini Assembly' 'Medium' 'Full sized' 'Full' 'Large']
After Replacement:  ['Full' 'RBV' 'Mini Assembly' 'Medium' 'Large']


In [5]:
# Inform material from Powder Type
material_df = master.loc[:, ['material_type','powder_type']]
unknown_material = material_df.material_type.isna() & ~material_df.powder_type.isna()
master.loc[unknown_material, 'material_type'] = '316L'

# Presumably EOS branded powder?
eos = master.material_type.str.contains('EOS') & master.powder_type.isna()
master.loc[eos, 'powder_type'] = 'EOS'

# Replace instances of KM with Kennametal
km = master.powder_type.str.contains('KM') | master.material_type.str.contains('KM')
master.loc[km, 'powder_type'] = 'Kennametal'

master.loc[master.powder_type.str.contains('Elem'), 'powder_type'] = 'Elementum 3D'
master.loc[master.powder_type.str.contains('Carpenter'), 'powder_type'] = 'Carpenter'

# Manually validated that sieving condition is represented properly, no change

# Make all 316L instances 316
master.loc[master.material_type.str.contains('316L'), 'material_type'] = '316L'

# Correct inst where extra info given
master.material_type.compare(master.powder_type)

Unnamed: 0_level_0,self,other
Build ID:,Unnamed: 1_level_1,Unnamed: 2_level_1
B001,316L,EOS
B002,316L,EOS
B003,316L,EOS
B004,316L,EOS
B005,316L,EOS
B006,316L,EOS
B007,316L,EOS
B008,316L,EOS
B009,316L,EOS
B010,316L,EOS


In [6]:
# User knowledge: Correct table mistakes
master.powder_type[master.powder_type.str.contains('A1000')] = 'Elementum 3D'
master.material_type[master.material_type.str.contains('E3D')] = 'A1000'
master.powder_type[master.material_type.str.contains('AlSi10Mg')] = 'Carpenter'
master.material_type[master.material_type.str.contains('Al1000')] = 'A1000'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  master.powder_type[master.powder_type.str.contains('A1000')] = 'Elementum 3D'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  master.material_type[master.material_type.str.contains('E3D')] = 'A1000'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  master.powder_type[master.material_type.str.contains('AlSi10Mg')] = 'Carpenter'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/st

In [7]:
# Classify inputs
powder_info = ['material_type', 'powder_type']
eos_inputs = ['recoater_type', 'build_plate_type']
inputs = powder_info + eos_inputs

# Get the unique powders
unique_powders = master.sort_values('date_printed').groupby(powder_info).first().date_printed
powder_table = unique_powders.reset_index()
powder_table.index = powder_table.index.rename('powder_id')
powder_table.rename(columns={'date_printed': 'first_use'}, inplace=True)
powder_table.reset_index(inplace=True)

# Add lot number to the table
merged = pd.merge(master, powder_table.drop('first_use', axis=1), on=powder_info)
merged = merged.drop(powder_info,axis=1)
master = merged 

In [8]:
# Create powder lot table
powder_table.to_csv(os.path.join(dst, 'powder_lots.csv'), index=False, header=False)
# Create build documentation
master.to_csv(os.path.join(dst, 'clean_BuildDocumentation.csv'), header=False)