# Cleaning the Dataset for ML Algorithm
#### This Notebook will delete all the products with the toxins and create a new dataset used for the machine learning algorithm

- The dataframe of cleaned products only [here](../data/processed/cleaned_products_data.csv)
- The list of hazardous products is [here](../references/hazardous_products.txt)
- The list of clean products is [here](../references/clean_products.txt)

***This notebook will only contain a dataframe and a seperate list of product names (used for website) with the clean products***

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy as sp

In [3]:
df = pd.read_csv('../data/raw/cosmetics.csv')
df.info()
df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1472 entries, 0 to 1471
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Label        1472 non-null   object 
 1   Brand        1472 non-null   object 
 2   Name         1472 non-null   object 
 3   Price        1472 non-null   int64  
 4   Rank         1472 non-null   float64
 5   Ingredients  1472 non-null   object 
 6   Combination  1472 non-null   int64  
 7   Dry          1472 non-null   int64  
 8   Normal       1472 non-null   int64  
 9   Oily         1472 non-null   int64  
 10  Sensitive    1472 non-null   int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 126.6+ KB


Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1
5,Moisturizer,TATCHA,The Water Cream,68,4.2,"Water, Saccharomyces/Camellia Sinensis Leaf/Cl...",1,0,1,1,1
6,Moisturizer,DRUNK ELEPHANT,Lala Retro™ Whipped Cream,60,4.2,"Water, Glycerin, Caprylic/ Capric Triglyceride...",1,1,1,1,0
7,Moisturizer,DRUNK ELEPHANT,Virgin Marula Luxury Facial Oil,72,4.4,100% Unrefined Sclerocraya Birrea (Marula) Ker...,1,1,1,1,0
8,Moisturizer,KIEHL'S SINCE 1851,Ultra Facial Cream,29,4.4,"Water, Glycerin, Cyclohexasiloxane, Squalane, ...",1,1,1,1,1
9,Moisturizer,LA MER,Little Miss Miracle Limited-Edition Crème de l...,325,5.0,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",0,0,0,0,0


### Make a list of all of the toxins

In [4]:
ingredents_list = df['Ingredients'].tolist()
# print(ingredents_list)

In [5]:
# Create a dictionary with words as keys and variable names as values
words = {'Ethanolamine': 'ethanolamine_occ', 
         'Oxybenzone': 'oxybenzone_occ', 
         'Resorcinol': 'resorcinol_occ', 
         'Formaldehyde': 'formaldehyde_occ', 
         'Diethanol': 'diethanolamine_occ', 
         'Silane': 'silanes_occ', 
         'Siloxane': 'siloxanes_occ', 
         'Octinaxate': 'octinaxate_occ',
         #MODDERATE
         'Parfum': 'parfum_occ', 
         'Fragrance': 'fragrance_occ',
         'Triclosan': 'triclosan_occ', 
         'Octinoxates':'octinoxates_occ', 
         'Homosalate' : 'homosalate_occ', 
         'Teflon': 'teflon_occ',
         #HIGH
         'Talc': 'talc_occ', 
         'Parabens':'parabens_occ', 
         'Propylene Glycol': 'glycol_occ'}

# Iterate over the dictionary
for word, var in words.items():
  # Use the `str.contains` method to find the occurrences where the word is found in the Ingredients column
  # and create a variable with the corresponding name and value
  globals()[var] = df[df['Ingredients'].str.contains(word)]

In [6]:
%%capture cap 

words = {'Ethanolamine': 'ethanolamine_occ', 
         'Oxybenzone': 'oxybenzone_occ', 
         'Resorcinol': 'resorcinol_occ', 
         'Formaldehyde': 'formaldehyde_occ', 
         'Diethanol': 'diethanolamine_occ', 
         'Silane': 'silanes_occ', 
         'Siloxane': 'siloxanes_occ', 
         'Octinaxate': 'octinaxate_occ',
         #MODDERATE
         'Parfum': 'parfum_occ', 
         'Fragrance': 'fragrance_occ',
         'Triclosan': 'triclosan_occ', 
         'Octinoxates':'octinoxates_occ', 
         'Homosalate' : 'homosalate_occ', 
         'Teflon': 'teflon_occ',
         #HIGH
         'Talc': 'talc_occ', 
         'Parabens':'parabens_occ', 
         'Propylene Glycol': 'glycol_occ'}

words_tuples = {}
for chemical, variable in words.items():
    variable_value = eval(variable)
    index = variable_value.index
    name = variable_value.loc[index, "Name"]
    if index.empty:
        words_tuples[chemical] = "No chemical found"
    else:
        tuples = [(i, n) for i, n in zip(index, name)]
        words_tuples[chemical] = tuples

with open('../reports/hazardous_products.txt', 'w') as f:
    for chemical, tuples in words_tuples.items():
        if tuples == "No chemical found":
            f.write(f"No {chemical} found\n")
        else:
            f.write(f"{chemical} found at index numbers:\n")
            for i, n in tuples:
                f.write(f"{i} - {n}\n")
            f.write("\n")

In [7]:
with open("../references/hazardous_products.txt", "r") as file:
    index_numbers = [int(line.strip()) for line in file]

selected_products = df.loc[df['Name'].isin(index_numbers)]

print(selected_products['Name'])

ValueError: invalid literal for int() with base 10: 'No Ethanolamine found'

In [8]:
with open("../references/hazardous_products.txt", "r") as file:
    index_numbers = []
    for line in file:
        try:
            index_numbers.append(int(line.strip()))
        except ValueError:
            continue
    index_numbers = [int(line.strip()) for line in file]

# Use the .loc[] method to select the rows of the dataframe where the index number matches the number in your list
selected_products = df.loc[df['Name'].isin(index_numbers)]

# Print the selected products
print(selected_products)

Empty DataFrame
Columns: [Label, Brand, Name, Price, Rank, Ingredients, Combination, Dry, Normal, Oily, Sensitive]
Index: []


### Make a loop that will run the same function on other toxins
#### Store the toxins in a list
- master list
- hazard levels
    - low
    - moderate
    - high
#### Use a loop to print the products that contains the toxins

### Lists sorted by Toxicity (for reference)


In [9]:
# master_list = [
#     silanes_occ, oxybenzone_occ, siloxanes_occ, resorcinol_occ, ethanolamine_occ, formaldehyde_occ, diethanolamine_occ, octinaxate_occ,
#     parfum_occ, fragrance_occ, triclosan_occ, octinoxates_occ, homosalate_occ, teflon_occ,
#     talc_occ, parabens_occ, glycol_occ]

# low_list = [
#     silanes_occ, oxybenzone_occ, siloxanes_occ, resorcinol_occ, ethanolamine_occ, formaldehyde_occ, diethanolamine_occ, octinaxate_occ,
#     ]

# moderate_list = [
#     parfum_occ, fragrance_occ, triclosan_occ, octinoxates_occ, homosalate_occ, teflon_occ,
#     ]

# high_list = [
#     talc_occ, parabens_occ, glycol_occ
# ]

### Low Toxicity toxin products index number list is [here](../references/low_hazard_products.txt)

In [10]:
low = {'Ethanolamine': 'ethanolamine_occ',
       'Oxybenzone': 'oxybenzone_occ',
       'Resorcinol': 'resorcinol_occ',
       'Formaldehyde': 'formaldehyde_occ',
       'Diethanol': 'diethanolamine_occ',
       'Silane': 'silanes_occ',
       'Siloxane': 'siloxanes_occ',
       'Octinaxate': 'octinaxate_occ', }

low_tuples = {}
for chemical, variable in low.items():
    variable_value = eval(variable)
    index = variable_value.index
    name = variable_value.loc[index, "Name"]
    if index.empty:
        low_tuples[chemical] = "No chemical found"
    else:
        tuples = [(i, n) for i, n in zip(index, name)]
        low_tuples[chemical] = tuples

with open('../references/low_hazard_products.txt', 'w') as f:
    for chemical, tuples in low_tuples.items():
        if tuples == "No chemical found":
            f.write(f"No {chemical} found\n")
        else:
            f.write(f"{chemical} found at index numbers:\n")
            for i, n in tuples:
                f.write(f"{i} - {n}\n")
            f.write("\n")


### Moderate Toxicity toxin products index number list is [here](../references/mod_hazard_products.txt)

In [11]:
mod = {
    'Parfum': 'parfum_occ',
    'Fragrance': 'fragrance_occ',
    'Triclosan': 'triclosan_occ',
    'Octinoxates': 'octinoxates_occ',
    'Homosalate': 'homosalate_occ',
    'Teflon': 'teflon_occ',
}

mod_tuples = {}
for chemical, variable in mod.items():
    variable_value = eval(variable)
    index = variable_value.index
    name = variable_value.loc[index, "Name"]
    if index.empty:
        mod_tuples[chemical] = "No chemical found"
    else:
        tuples = [(i, n) for i, n in zip(index, name)]
        mod_tuples[chemical] = tuples

with open('../references/mod_hazard_products.txt', 'w') as f:
    for chemical, tuples in mod_tuples.items():
        if tuples == "No chemical found":
            f.write(f"No {chemical} found\n")
        else:
            f.write(f"{chemical} found at index numbers:\n")
            for i, n in tuples:
                f.write(f"{i} - {n}\n")
            f.write("\n")

### High Toxicity toxin products index number list is [here](../references/high_hazard_products.txt)

In [20]:
high = {'Talc': 'talc_occ',
        'Parabens': 'parabens_occ',
        'Propylene Glycol ': 'glycol_occ'}

high_tuples = {}
for chemical, variable in high.items():
    variable_value = eval(variable)
    index = variable_value.index
    name = variable_value.loc[index, "Name"]
    if index.empty:
        high_tuples[chemical] = "No chemical found"
    else:
        tuples = [(i, n) for i, n in zip(index, name)]
        high_tuples[chemical] = tuples

with open('../references/high_hazard_products.txt', 'w') as f:
    for chemical, tuples in high_tuples.items():
        if tuples == "No chemical found":
            f.write(f"No {chemical} found\n")
        else:
            f.write(f"{chemical} found at index numbers:\n")
            for i, n in tuples:
                f.write(f"{i} - {n}\n")
            f.write("\n")


### Reading the textfile outputs to get the index numbers (for deletion)
1. A new array is created to hold all of the indicies
2. The index numbers are read from [here](../references/hazardous_products.txt)
3. Adds to the ```products``` array if the text is a digit

In [None]:
%%capture cap 
with open('../reports/hazardous_products.txt', 'r') as f:
    contents = f.read()
    
lines = contents.split('\n')

products = []

for line in lines:
    words = line.split()
    for word in words:
        if word.isdigit():
            products.append(int(word))

print(products)

# Close the file
f.close()

print(type(products))

products_array = np.array(products)
print(type(products_array))

print(products_array)

### Add the toxicity column with these values:

- Low
- Moderate
- High
- n/a

### Now we must remove the index numbers (products) that are in the lists to make a dataset with non-toxic products

In [None]:
# removed_products = []
# with open('../reports/hazardous_products.txt', 'r') as f:
#     for line in f:
#         if 'found at index numbers:' in line:
#             for index in f:
#                 if index.strip().isdigit():
#                     removed_products.append(int(index))
#                 else:
#                     break

# removed_products_array = np.array(removed_products)

# print(removed_products_array)
# df = df.drop(removed_products_array)


# index_numbers = []
# with open('../references/hazardous_products.txt', 'r') as f:
#     for line in f:
#         if 'found at index numbers:' in line:
#             for index in f:
#                 if index.strip().isdigit():
#                     index_numbers.append(int(index))
#                 else:
#                     break

# index_array = np.array(index_numbers)
# df = df.drop(index_array)

with open('../references/hazardous_products.txt', 'r') as f:
    contents = f.read()
    
lines = contents.split('\n')

products = []

for line in lines:
    words = line.split()
    for word in words:
        if word.isdigit():
            products.append(int(word))


f.close()

toxic_array = np.array(products)

print(toxic_array)

df = df.drop(toxic_array)

### **A total of 825 out of 1472 products are clean!**

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.describe()

### Export the results to a s CSV file that has the new toxicity column to processed

In [None]:
df.to_csv('../data/processed/clean_products_data.csv', index=False)

### Creating the list of clean products
- `products_array` is the array that contains the index numbers of the toxic products
- create the array without toxic products (clean products) using `np.setdiff1d()` method
- access the product names by using the index numbers
- make a txt file of the index number & product name

In [None]:
print("Toxic Product Index numbers:")
print(toxic_array)

# Total number of products range from 1 to 1471
all_index = np.arange(1, 1472)
clean_array = np.setdiff1d(all_index, toxic_array)

print("Clean Product Index numbers:")
print(clean_array)

In [None]:
with open('../references/clean_products.txt', 'w') as f:
    for i in range(len(clean_array)):
        product_name = df.at[clean_array[i], 'Name']
        f.write(str(clean_array[i]) + ': ' + product_name + '\n')