# Joseph Gross - Element Classification (multi-label) from LIBS data 

### Get all unique filenames

In [2]:
from pathlib import Path

all_filenames = []
unique_filenames = []

# loops through all the filenames and adds them to a list
for filename in Path("Training Data").glob('**/*.txt'):
    if "ASCII" not in filename.name:
        all_filenames.append("Training Data/" + filename.name)
    # unique_filenames.append(filename.name[:25])

# returns a list of unique filenames to be used next 
#set(unique_filenames)

len(all_filenames)

5419

### Extract element from one file

In [3]:
def get_element(filename):
    all_element_lists = []
    all_element_names = ["Copper", "Iron", "Nickel", "Tin", "Tungsten", "Zinc"]
    
    # <element>_strings is a list of strings that if found in the filename label that file as a certain element
    copper_strings = ["Cu(NO3)2", "Copper"]
    all_element_lists.append(copper_strings)
    
    iron_strings = ["Fe(NO3)3", "UnknownNail", "Iron"]
    all_element_lists.append(iron_strings)
    
    nickel_strings = ["Ni(NO3)2", "NiCl2", "Nickel"]
    all_element_lists.append(nickel_strings)
    
    tin_strings = ["SnStandard", "Tin", "Sn"]
    all_element_lists.append(tin_strings)
    
    tungsten_strings = ["WStandard", "Tungsten"]
    all_element_lists.append(tungsten_strings)
    
    zinc_strings = ["Zn(NO3)2", "Zinc", "Zn"]
    all_element_lists.append(zinc_strings)
    
    # For each list of strings, checks if the filename contains any element in that list and if it
    # does then it returns that element
    test_method = False
    for i in range(len(all_element_names)):
        if any([element.lower() in filename.lower() for element in all_element_lists[i]]):
            test_method = True
            return all_element_names[i]
    
    # tests if an element is found for every filename
    # if not, the filename is printed
    if test_method == False:
        print(filename)
        
#for element in sorted(all_filenames):
    #print(get_element(element))

### Extract data from one file

In [1]:
import pandas as pd

# returns a dataframe with the frequency and intensity of a single file
def get_data(filename):
    df = pd.read_csv(filename, skiprows=13, sep='\t', names=['freq', 'intensity'])
    df = df.set_index('freq')
    return df

### Time how long it takes to process one file

In [4]:
from random import randint
import time
start = time.time()
# Timer starts


test_file = all_filenames[randint(0, len(all_filenames))]

all_element_names_test = ["copper", "iron", "nickel", "tin", "tungsten", "zinc"]
intensities_test = []
result_to_print = []

df_test = get_data(test_file)
intensities_test.append(df_test)

element = get_element(test_file).lower()
element_index = all_element_names_test.index(element)
for i in range(len(all_element_names_test)):
    if i == element_index:
        result_to_print.append(1)
    else:
        result_to_print.append(0)

print("Test file name:", test_file)
print()
print(result_to_print, "->", all_element_names_test)
print()
print(intensities_test[0].head(5))

# Time ends
end = time.time()
process_single_file_time = (end-start)
print()
print("Max time to process all files:", round(len(all_filenames) * process_single_file_time), "seconds")

Test file name: Training Data/Fe(NO3)3@0.0125M.TitratingCurve.Agarose3%dH2O.NegLispCup.LampEnergy10.LIBS07252019_HRD10591_16-59-03-337.txt

[0, 1, 0, 0, 0, 0] -> ['copper', 'iron', 'nickel', 'tin', 'tungsten', 'zinc']

         intensity
freq              
223.165     -20.06
223.400     -20.06
223.635     -20.06
223.869     -30.06
224.104     -24.06

Max time to process all files: 383 seconds


### Process all files

In [5]:
# all the intensities dataframes will be stored in this list before being concatenated into one
intensities = []
all_element_labels = []


# <element>_labels is list of binary values (0/1) that will be used as the "test" (label) values for
# the training of the algorithms. Each list will be used to train a binary classifier
copper_labels = []
all_element_labels.append(copper_labels)

iron_labels = []
all_element_labels.append(iron_labels)

nickel_labels = []
all_element_labels.append(nickel_labels)

tin_labels = []
all_element_labels.append(tin_labels)

tungsten_labels = []
all_element_labels.append(tungsten_labels)

zinc_labels = []
all_element_labels.append(zinc_labels)

In [6]:
all_element_names = ["copper", "iron", "nickel", "tin", "tungsten", "zinc"]

# loops through every file and processes the information needed
for file in all_filenames:
    # collects the intensity and frequency data and appends it to the intensities list
    df = get_data(file)
    if df.index[-2] != 672.689:
        print("--", file)
    intensities.append(df['intensity'])
    
    # identifies the element for each file and the index in the above element list
    # a binary value (0/1) is then added to each labels list depending on what that element is
    # a 1 will be added to the element list for the element identified and a 0 to the rest of the lists
    element = get_element(file).lower()
    element_index = all_element_names.index(element)
    for i in range(len(all_element_labels)):
        if i == element_index:
            all_element_labels[i].append(1)
        else:
            all_element_labels[i].append(0)

In [7]:
# all the intensities are concatenated to create a master dataframe will all the data
# all the labels are stored in a master list of lists (each interior list will be used
# to train a new binary classifier)
for i in range(len(intensities)):
    df = intensities[i]
    if 672.689 == df.index[-2]:
        continue
    else:
        print(all_filenames[i])

In [8]:
master_df = pd.concat(intensities, axis=1).transpose()
print(master_df.shape, len(all_element_labels[0]), len(all_element_labels[1]), len(all_element_labels[2]), 
 len(all_element_labels[3]), len(all_element_labels[4]), len(all_element_labels[5]))
master_df.head()

(5419, 2048) 5419 5419 5419 5419 5419 5419


freq,223.165,223.4,223.635,223.86900000000003,224.104,224.338,224.57299999999998,224.808,225.042,225.27700000000002,...,671.07,671.2719999999999,671.475,671.677,671.88,672.082,672.284,672.487,672.689,672.8919999999999
intensity,-21.75,-21.75,-21.75,-17.75,-17.75,-8.75,-6.75,-14.75,9.25,-0.75,...,74.25,93.25,88.25,76.25,71.25,80.25,75.25,91.25,98.25,90.25
intensity,-15.31,-15.31,-15.31,-17.31,-11.31,-6.31,-3.31,-15.31,-16.31,8.69,...,83.69,84.69,88.69,73.69,66.69,84.69,73.69,86.69,83.69,73.69
intensity,-24.06,-24.06,-24.06,-25.06,-24.06,-13.06,-12.06,-14.06,-12.06,2.94,...,74.94,63.94,79.94,79.94,78.94,88.94,84.94,90.94,80.94,58.94
intensity,-12.0,-12.0,-12.0,-11.0,-6.0,-20.0,-5.0,-16.0,-15.0,-6.0,...,93.0,82.0,87.0,78.0,90.0,84.0,85.0,89.0,85.0,88.0
intensity,-10.5,-10.5,-10.5,-3.5,-24.5,-25.5,-13.5,-1.5,-9.5,-4.5,...,84.5,94.5,89.5,81.5,78.5,98.5,74.5,79.5,74.5,85.5


In [9]:
for labels in all_element_labels:
    print(sum(labels))

251
1101
2434
604
600
429


### Training, Testing, and Scoring Models

In [10]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

classifiers = [
    #KNeighborsClassifier(3),
    #SVC(gamma=2, C=1),
    #DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(n_estimators=100),
    #MLPClassifier(alpha=1, max_iter=100),
    #AdaBoostClassifier(),
    #GaussianNB(),
    #SVC(kernel="linear", C=0.025),
    #GradientBoostingClassifier()
]

classifiers_to_train = [
    RandomForestClassifier(n_estimators=100),
    RandomForestClassifier(n_estimators=100),
    RandomForestClassifier(n_estimators=100),
    RandomForestClassifier(n_estimators=100),
    RandomForestClassifier(n_estimators=100),
    RandomForestClassifier(n_estimators=100),
]

In [11]:
X = master_df.values
scaler = StandardScaler()
X = scaler.fit_transform(X)

score_dict = {}

for clf in classifiers:
    score_dict[clf.__class__.__name__] = 1

trained_models = []
# for every element, a seperate binary classifier is trained, tested, and scored
# for every element, all the classifiers are trained, tested, and scores based on the labelled values passed (y)
# In this cases, the labelled values passed (y) are looped so that all the different binary lists are used once
for i in range(len(all_element_labels)):
    y = all_element_labels[i]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=0)
    print(all_element_names[i].title() + ":")
    clf = classifiers_to_train[i]
    
    clf.fit(X_train, y_train)
    trained_models.append(clf)
    score = np.mean(cross_val_score(clf, X_test, y_test))
    name = clf.__class__.__name__
    score_dict[clf.__class__.__name__] *= score
    print(name, score)
    print("----------------------------")

Copper:
RandomForestClassifier 0.9912389257339317
----------------------------
Iron:
RandomForestClassifier 0.9718598381372372
----------------------------
Nickel:
RandomForestClassifier 0.9612596481025185
----------------------------
Tin:
RandomForestClassifier 0.9995389580451821
----------------------------
Tungsten:
RandomForestClassifier 0.9990779160903642
----------------------------
Zinc:
RandomForestClassifier 0.9967714291917463
----------------------------


# Applying trained model on waterkeepers data

### Get all unique filenames

In [12]:
def get_location_info(filename):
    if "Location1" in filename:
        return 0
    if "Location2" in filename:
        return 1
    if "Location3" in filename:
        return 2
    if "Location4" in filename:
        return 3
    if "Location5" in filename:
        return 4
    if "Location6" in filename:
        return 5
    if "Location7" in filename:
        return 6

In [13]:
waterkeepers_filenames = []
filenames_by_location = []

location_1 = []
location_2 = []
location_3 = []
location_4 = []
location_5 = []
location_6 = []
location_7 = []
filenames_by_location.append(location_1)
filenames_by_location.append(location_2)
filenames_by_location.append(location_3)
filenames_by_location.append(location_4)
filenames_by_location.append(location_5)
filenames_by_location.append(location_6)
filenames_by_location.append(location_7)

location_dict = {0:"Location1", 1:"Location2", 2:"Location3", 3:"Location4",
                          4:"Location5", 5:"Location6", 6:"Location7"}

# loops through all the filenames and adds them to a list
for filename in Path("WaterKeepers Data").glob('**/*/*.txt'):
    waterkeepers_filenames.append("WaterKeepers Data/" + filename.name)
    location_info = get_location_info(filename.name)
    filenames_by_location[location_info].append("WaterKeepers Data/" + 
                                                location_dict[location_info] + 
                                                "/" + filename.name)

len(waterkeepers_filenames), len(filenames_by_location )

(1601, 7)

### Process all files

In [14]:
water_keepers_intensities = []

location_1_intensities = []
location_2_intensities = []
location_3_intensities = []
location_4_intensities = []
location_5_intensities = []
location_6_intensities = []
location_7_intensities = []

water_keepers_intensities.append(location_1_intensities)
water_keepers_intensities.append(location_2_intensities)
water_keepers_intensities.append(location_3_intensities)
water_keepers_intensities.append(location_4_intensities)
water_keepers_intensities.append(location_5_intensities)
water_keepers_intensities.append(location_6_intensities)
water_keepers_intensities.append(location_7_intensities)

# loops through every file and processes the information needed
for i in range(len(filenames_by_location)):
    location = filenames_by_location[i]
    for file in location:
        # collects the intensity and frequency data and appends it to the intensities list
        water_keepers_df = get_data(file)
        water_keepers_intensities[i].append(water_keepers_df['intensity'])

In [15]:
water_keepers_df = []

#create a list of dataframes for each location (each dataframe has the intesities data)
for i in range(len(water_keepers_intensities)):
    temp_df = pd.concat(water_keepers_intensities[i], axis=1).transpose()
    #print(temp_df.shape)
    water_keepers_df.append(temp_df)

## Machine Learning

### Identify elements in one location

In [16]:
def get_predictions(location_df):
    X = location_df.values
    X = scaler.transform(X)

    location_results = []
    for clf in trained_models:
        location_results.append(clf.predict(X))

    return location_results

### Get predictions from all locations

In [17]:
location_results = []

for location in water_keepers_df:
    location_results.append(get_predictions(location))

## Analyzing Predictions 

#### Get premutations of numbers 0-5 (index values for accessing predictions)

In [18]:
from itertools import combinations

# get all combinations of index values in order to analyze the prediction results
def get_all_combinations(n):
    index_list = list(range(0, n))
    all_combinations = []
    
    for i in range(2, n+1):
        all_combinations.append(list(combinations(index_list, i)))
        
    return all_combinations

In [19]:
for element in get_all_combinations(6):
    print(element, len(element))

[(0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (1, 2), (1, 3), (1, 4), (1, 5), (2, 3), (2, 4), (2, 5), (3, 4), (3, 5), (4, 5)] 15
[(0, 1, 2), (0, 1, 3), (0, 1, 4), (0, 1, 5), (0, 2, 3), (0, 2, 4), (0, 2, 5), (0, 3, 4), (0, 3, 5), (0, 4, 5), (1, 2, 3), (1, 2, 4), (1, 2, 5), (1, 3, 4), (1, 3, 5), (1, 4, 5), (2, 3, 4), (2, 3, 5), (2, 4, 5), (3, 4, 5)] 20
[(0, 1, 2, 3), (0, 1, 2, 4), (0, 1, 2, 5), (0, 1, 3, 4), (0, 1, 3, 5), (0, 1, 4, 5), (0, 2, 3, 4), (0, 2, 3, 5), (0, 2, 4, 5), (0, 3, 4, 5), (1, 2, 3, 4), (1, 2, 3, 5), (1, 2, 4, 5), (1, 3, 4, 5), (2, 3, 4, 5)] 15
[(0, 1, 2, 3, 4), (0, 1, 2, 3, 5), (0, 1, 2, 4, 5), (0, 1, 3, 4, 5), (0, 2, 3, 4, 5), (1, 2, 3, 4, 5)] 6
[(0, 1, 2, 3, 4, 5)] 1


### Create a dataframe with data for each location

In [20]:
# create a list of dataframes (one dataframe per location) 
# each dataframe containes the prediction results for that location
location_results_df = []

for location in location_results:
    series_list = []
    
    for element in location:
        series_list.append(pd.Series(element))
    
    location_results_df.append(pd.concat(series_list, axis=1))

In [21]:
len(location_results_df)

7

### Create new column (sum) given a list of other column names

In [22]:
def create_column(combination, df):
    sum = 0
    for element in combination:
        sum += df[element]
        
    df[combination] = sum
    return df

### Create all new combinations of columns

In [23]:
def create_new_columns(result_df, i):
    for combination in get_all_combinations(6):
        for element in combination:
            result_df = create_column(element, result_df)
            
    i_to_location = {0:"Location1", 1:"Location2", 2:"Location3", 3:"Location4",
                    4:"Location5", 5:"Location6", 6:"Location7"}
    result_df['Location'] = i_to_location[i]
    
    return result_df

### Create new columns for all locations

In [24]:
location_all_results_df = []
for i in range(len(location_results_df)):
    location = location_results_df[i]
    location_all_results_df.append(create_new_columns(location, i))

### Merge all location data in one dataframe

In [25]:
all_locations_results_df = pd.concat(location_all_results_df).set_index("Location")

final_df = round(all_locations_results_df.groupby("Location").sum() /
            all_locations_results_df.groupby("Location").count() * 10000)/100

In [26]:
metal_to_index_dict = {"copper":0, "iron":1, "nickel":2, "tin":3, "tungsten":4, "zinc":5}
index_to_metal_dict = {0:"Copper", 1:"Iron", 2:"Nickel", 3:"Tin", 4:"Tungsten", 5:"Zinc"}

## Percent of total samples containing each metal

In [27]:
print('Percent of total samples containing each metal')
result_df = final_df[list(range(0,6))].rename(columns=index_to_metal_dict)
#result_df[result_df>0]
result_df

Percent of total samples containing each metal


Unnamed: 0_level_0,Copper,Iron,Nickel,Tin,Tungsten,Zinc
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Location1,0.0,1.5,0.0,0.0,0.0,23.0
Location2,0.0,3.5,0.0,0.0,0.0,15.5
Location3,0.0,0.0,0.0,0.0,0.0,100.0
Location4,0.0,2.5,0.0,0.0,0.0,23.5
Location5,0.0,0.0,5.99,0.0,0.0,50.12
Location6,0.0,38.0,0.0,0.0,0.0,1.5
Location7,0.0,0.0,0.0,0.0,0.0,100.0


In [28]:
location_num_to_name_dict = {"Location1": "Morningside" , "Location2": "RE black pebble beach", 
                             "Location3": "Marjory Stoneman Biscayne Nature Center",
                             "Location4": "Matheson Hammock", "Location5": "Key Biscayne Yacht Club",
                             "Location6": "Virginia Key RSMAS", "Location7": "Key Biscayne Beach Club"}
result_df = result_df.reset_index()

result_df["Location"] = result_df["Location"].map(location_num_to_name_dict)
result_df = result_df.set_index("Location")
print('Percent of total samples containing each metal')
result_df

Percent of total samples containing each metal


Unnamed: 0_level_0,Copper,Iron,Nickel,Tin,Tungsten,Zinc
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Morningside,0.0,1.5,0.0,0.0,0.0,23.0
RE black pebble beach,0.0,3.5,0.0,0.0,0.0,15.5
Marjory Stoneman Biscayne Nature Center,0.0,0.0,0.0,0.0,0.0,100.0
Matheson Hammock,0.0,2.5,0.0,0.0,0.0,23.5
Key Biscayne Yacht Club,0.0,0.0,5.99,0.0,0.0,50.12
Virginia Key RSMAS,0.0,38.0,0.0,0.0,0.0,1.5
Key Biscayne Beach Club,0.0,0.0,0.0,0.0,0.0,100.0


# Visualizations

In [29]:
import random

random_number = random.randint(0, len(filenames_by_location[0]))

data_filename = filenames_by_location[0][random_number]

In [30]:
data_df = get_data(data_filename).T

frequencies = data_df.columns

In [31]:
normalized_data = scaler.transform(data_df.values).T

In [32]:
regular_data = data_df.values.T

In [1]:
from matplotlib import pyplot as plt

fig, axs = plt.subplots(2, 1, constrained_layout=True)
axs[0].plot(frequencies, regular_data, "-")
axs[0].set_title('Raw Data')
axs[0].set_xlabel('Wavelength (nm)')
axs[0].set_ylabel('Intensity')

axs[1].plot(frequencies, normalized_data, "-")
axs[1].set_title('Normalized Data')
axs[1].set_xlabel('Wavelength (nm)')
axs[1].set_ylabel('Intensity')

plt.show()
fig.savefig("Normalized vs Regular Data.pdf", bbox_inches='tight')

ImportError: cannot import name 'get_backend'

In [34]:
print(data_filename)

WaterKeepers Data/Location1/Location1.07152019.Energy5.Lamp.07192019.3%Agarose_HRD10591_14-33-26-016.txt
