# TRANSFORM
# Cleaning raw data from ASOS

Data was accessed through an API between 2023-11-04 and 2023-11-09. <br/> 
Now the data must be cleaned and transformed to comply with the DDBB standards.


The raw datasets have the following **structure**: <br/>
* JSON files with a very complex structure <br/>

The raw datasets have the following **issues to amend**:

1. Keep only **brand, product description and price €**  <br/>
2. ASOS dataset **486 names** for colours
    * Normalize the color names using `mlg.namvector_clean`
    * Simplify colour names according to the colors used by Amazon.es
    * Use `color_simplification.pkl` - A chatGPT generated dictionary to simplifify the colors returned by ASOS
    * Transform rows with multiple color values transform to "multicolor" <br/>
<br/>
3. Clean `price` column
    * Take out € symbol
    * Replace , for .
    * Transform to float <br/>
<br/>
4. Normalize brand names


### 0. Import the modules

In [175]:
import requests
import time
import pandas as pd
import pickle
import numpy as np
import warnings
warnings.filterwarnings('ignore') # ignorar warnings


from src import ETL_functions as etl
from src import dataanalysis_fun1 as eda  #Import my module

Reload my module if neccessary

In [176]:
import importlib
from src import dataanalysis_fun1 as mlg # Import the module
#importlib.reload(mlg)  # Reload the module

# Suppress warning when reloading the module
with warnings.catch_warnings():
    warnings.simplefilter("ignore") 
    importlib.reload(mlg)  # Reload the module

## 1. Load the data using pickle


In [177]:
with open('../data/raw/collapsed_dict2.pkl', 'rb') as file:
    collapsed_dict2 = pickle.load(file)

In [178]:
with open('../data/raw/asos_womentshirt_data.pkl', 'rb') as file:
    asos_womentshirt_data = pickle.load(file)
    
with open('../data/raw/asos_womendress_data.pkl', 'rb') as file:
    asos_womendress_data = pickle.load(file)

with open('../data/raw/asos_womentop_data.pkl', 'rb') as file:
    asos_womentop_data = pickle.load(file)
    
with open('../data/raw/asos_womentop_data.pkl', 'rb') as file:
    asos_womentop_data = pickle.load(file)

asos_woman_list=[asos_womentshirt_data, asos_womendress_data, asos_womentop_data, asos_womentop_data]

In [179]:
with open('../data/raw/asos_mentshirt_data1.pkl', 'rb') as file:
    asos_men1_data = pickle.load(file)
    
with open('../data/raw/asos_mentshirt_data2.pkl', 'rb') as file:
    asos_men2_data = pickle.load(file)
    
asos_men_list=[asos_men1_data, asos_men2_data]

## 2. Transform/clean the data

First for women clothes, then for men dataset

In [180]:
asos_raw_list=asos_woman_list

## 2.1 Join the datasets and parse them into DataFrame

In [181]:
color_df_list=[]
for ASOS in asos_raw_list:
    thedf1=[[row["brandName"], row["name"],  row["price"]["current"]["text"], row["colour"]] for row in ASOS]
    color_df=pd.DataFrame(thedf1,columns=["brand", "description", "price", "colour"])
    color_df_list.append(color_df)
    
ASOS_df = pd.concat(color_df_list, ignore_index=True)

# delete duplicated rows 
ASOS_df1=ASOS_df[~ASOS_df["description"].duplicated()]
ASOS_df1.reset_index()

Unnamed: 0,index,brand,description,price,colour
0,0,New Look,Top negro escalonado de manga larga con estamp...,"16,99 €",Negro
1,1,Mama.licious,Top color ámbar dorado de manga larga con cuel...,"45,99 €",Dorado ámbar
2,2,Mama.licious,Top amarillo de manga larga con cuello ancho d...,"21,99 €",Dorado ámbar
3,3,Selected,Top marrón de manga larga con cuello alto de S...,"31,99 €",Tierra oscura
4,4,Mama.licious,Top multicolor a rayas con cuello alto de punt...,"40,99 €",MULTICOLOR
...,...,...,...,...,...
10555,13007,Vans,Camiseta naranja con logo en la parte izquierd...,"12,00 €",Naranja
10556,13013,adidas performance,Top rosa con capas en la espalda de adidas Tra...,"35,75 €",Rosa
10557,13019,Urban Bliss,Camiseta negra con lateral fruncido de Urban B...,"4,00 €",Negro
10558,13024,ASOS DESIGN,Camiseta blanca de corte cuadrado sin mangas d...,"9,49 €",Blanco


## 2.2 Normalize the color names using `mlg.namvector_clean`

In [182]:
ASOS_df1=ASOS_df1.copy()
ASOS_df1["colour"]=eda.namvector_clean(ASOS_df1["colour"])
ASOS_df1["colour"]=[col.replace("_", " ") for col in ASOS_df1["colour"]]

### 2.3 Simplify colour names according to the color categories used by Amazon.es

In [183]:
amz_colors=['negro', 'gris', 'blanco', 'marron', 'beis', 'rojo', 'rosa',
       'naranja', 'amarillo', 'marfil', 'verde', 'turquesa', 'azul',
       'morado', 'dorado']

english_colors = ['black', 'gray', 'white', 'brown', 'beige', 'red', 'pink', 'orange', 'yellow', 'ivory', 'green', 'turquoise', 'blue', 'purple', 'gold']
amz_colors_dict = dict(zip(english_colors, amz_colors))

print(amz_colors_dict)

{'black': 'negro', 'gray': 'gris', 'white': 'blanco', 'brown': 'marron', 'beige': 'beis', 'red': 'rojo', 'pink': 'rosa', 'orange': 'naranja', 'yellow': 'amarillo', 'ivory': 'marfil', 'green': 'verde', 'turquoise': 'turquesa', 'blue': 'azul', 'purple': 'morado', 'gold': 'dorado'}


### 2.3 a - 16 SPANISH NAMES

In [184]:
ASOS_df2=ASOS_df1.copy()
ASOS_df2["colour_simp"]=""

for color in amz_colors:
    '''
    check in original color nomenclature if a simplest color is present
    some nomenclatures contain multiple words, i need doble comprehension to eval all the words by cell
    '''
    matches = [any(word in color for word in cell.split())
               for cell in ASOS_df2["colour"].values]
    
    ASOS_df2.loc[matches, "colour_simp"] = color

### 2.3 b -16 ENGLISH NAMES

In [185]:
ASOS_df2.head()

for dict_key, dict_value in amz_colors_dict.items(): 
    matches = [cell in dict_key for cell in ASOS_df2["colour"].values]
    
    '''
    check in original color nomenclature if a simplest color is present
    some nomenclatures contain multiple words, i need doble comprehension to eval all the words by cell
    '''
    matches = [any(word in dict_key for word in cell.split())
               for cell in ASOS_df2["colour"].values]
    
    ASOS_df2.loc[matches, "colour_simp"] = dict_value


### 2.3 c - `collapsed_dict2.pkl` -chatGPT generated dictionary
Load the object from the file -chatGPT generated dictionary to simplifify the colors returned by ASOS

In [186]:
def color_dict_simplify(x):
    x = x.lower()
    theval=""
    for KKK, VVV in collapsed_dict2.items():
        for VAL in VVV:
            if VAL in x:
                theval = KKK
            
    return theval

####

def update_colour_simp(row):
    if row['colour_simp'] == "":
        return row['colour_simp2']
    else:
        return row['colour_simp']

In [187]:
ASOS_df3=ASOS_df2.copy()
ASOS_df3['colour_simp2'] = ASOS_df3['colour'].apply(color_dict_simplify)

### 2.3 d- Unify colors in `colour_simp` and keep only one column

In [188]:
ASOS_df3['colour_simp'] = ASOS_df3.apply(update_colour_simp, axis=1)

#### There still a crazy ASOS names 

We have still 269 rows without simple nomenclature. <br/>
I could Use fuzzywuzzy to clean it but -for the moment-, I prefer just to drop them

In [189]:
crazy_ASOS_women=ASOS_df3[ASOS_df3["colour_simp"]==""]

print(len(crazy_ASOS_women))
display(crazy_ASOS_women.sample(6))

#crazy_ASOS_women.to_csv('../data/clean/crazy_ASOS_women.csv', index=False)  # Change 'output.csv' to your desired file name

269


Unnamed: 0,brand,description,price,colour,colour_simp,colour_simp2
9316,Topshop,Vestido vaquero descolorido con cuello halter ...,"27,50 €",descolorido,,
4018,Levi's,Vestido corto verde estilo polo con ribetes en...,"47,00 €",eden,,
9236,Edited,Vestido amplio de manga corta con estampado fl...,"30,00 €",floral retro,,
8147,ASOS Curve,Vestido veraniego midi color pastel con estamp...,"18,89 €",flores tono pastel,,
9298,Pieces Tall,Vestido amplio con estampado de florecitas y c...,"16,19 €",floral,,
6197,JDY,Vestido corto gris con lazada en la cintura de...,"24,99 €",lgm,,


### 2.3 e- Drop rows with crazy names and keep only simplified color

In [190]:
ASOS_df4 = ASOS_df3.copy()
ASOS_df4 = ASOS_df4.loc[:, ['brand', 'description', 'price', "colour_simp"]]
ASOS_df4 = ASOS_df4.rename(columns={'colour_simp': 'colour'})
ASOS_df4.colour =[row.lstrip() for row in ASOS_df4.colour]

In [191]:
temp_yes=ASOS_df4[ASOS_df4["colour"]!=""]
temp_no=ASOS_df4[ASOS_df4["colour"]==""]

for i in range(len(collapsed_dict2)):
    keyword = list(collapsed_dict2.keys())[i]
    temp_no.loc[temp_no["description"].str.contains(keyword), "colour"] = keyword

ASOS_df4 = pd.concat([temp_yes, temp_no], axis=0)
print(len(ASOS_df4[ASOS_df4["colour"]==""]))
ASOS_df4=ASOS_df4[ASOS_df4["colour"]!=""]

136


#### Simplify ` turquesa` for `azul` and `marfil` for `beis`

In [192]:
ASOS_df4["colour"][ASOS_df4["colour"]=="turquesa"]="azul"
ASOS_df4["colour"][ASOS_df4["colour"]=="marfil"]="beis"

## 2.4 Clean `price` column

In [193]:
ASOS_df4["price"]=[VAL.replace(",", ".") for VAL in ASOS_df4["price"]]
ASOS_df4["price"]=[VAL.replace("€", "") for VAL in ASOS_df4["price"]]
ASOS_df4["price"]=[float(VAL) for VAL in ASOS_df4["price"]]

## 2.5 Normalize `brand` names

In [194]:
ASOS_df5=ASOS_df4.copy()
ASOS_df5.brand = [row.replace(".", "") for row in ASOS_df5.brand]

## 3. Save cleaned data

In [195]:
#asos_women_clothes_clean=ASOS_df5.copy()
#asos_women_clothes_clean.to_csv('../data/clean/asos_women_clothes_clean.csv', index=False)

## For men

In [158]:
asos_raw_list=asos_men_list

In [159]:
color_df_list=[]
for ASOS in asos_raw_list:
    thedf1=[[row["brandName"], row["name"],  row["price"]["current"]["text"], row["colour"]] for row in ASOS]
    color_df=pd.DataFrame(thedf1,columns=["brand", "description", "price", "colour"])
    color_df_list.append(color_df)
    
ASOS_df = pd.concat(color_df_list, ignore_index=True)

# delete duplicated rows 
ASOS_df1=ASOS_df[~ASOS_df["description"].duplicated()]
ASOS_df1.reset_index()

Unnamed: 0,index,brand,description,price,colour
0,0,French Connection,Camisa de vestir blanca de manga larga con cue...,"39,99 €",Blanco
1,1,ASOS DESIGN,Pack de 2 camisas de oficina blancas elásticas...,"17,50 €",Blanco
2,2,ASOS DESIGN,Camisa Oxford negra entallada con cuello itali...,"34,99 €",Negro
3,3,ASOS DESIGN,Camisa de vestir negra entallada con detalle d...,"21,00 €",Negro
4,4,Topman,Camisa de vestir blanca de corte slim y manga ...,"25,00 €",Blanco
...,...,...,...,...,...
1912,2411,Labelrail,Polo negro y naranja unisex de manga larga de ...,"37,00 €",Negro/naranja
1913,2412,Lacoste,Polo rojo liso de Lacoste,"70,00 €",rojo
1914,2413,Champion,Camiseta negra con logo vintage de Champion,"29,50 €",Negro
1915,2414,ASOS DESIGN,Camiseta corta negra ajustada unisex con estam...,"16,00 €",Negro


In [160]:
ASOS_df1=ASOS_df1.copy()
ASOS_df1["colour"]=eda.namvector_clean(ASOS_df1["colour"])
ASOS_df1["colour"]=[col.replace("_", " ") for col in ASOS_df1["colour"]]

In [161]:
amz_colors=['negro', 'gris', 'blanco', 'marron', 'beis', 'rojo', 'rosa',
       'naranja', 'amarillo', 'marfil', 'verde', 'turquesa', 'azul',
       'morado', 'dorado']

english_colors = ['black', 'gray', 'white', 'brown', 'beige', 'red', 'pink', 'orange', 'yellow', 'ivory', 'green', 'turquoise', 'blue', 'purple', 'gold']
amz_colors_dict = dict(zip(english_colors, amz_colors))

print(amz_colors_dict)

{'black': 'negro', 'gray': 'gris', 'white': 'blanco', 'brown': 'marron', 'beige': 'beis', 'red': 'rojo', 'pink': 'rosa', 'orange': 'naranja', 'yellow': 'amarillo', 'ivory': 'marfil', 'green': 'verde', 'turquoise': 'turquesa', 'blue': 'azul', 'purple': 'morado', 'gold': 'dorado'}


In [162]:
ASOS_df2=ASOS_df1.copy()
ASOS_df2["colour_simp"]=""

for color in amz_colors:
    '''
    check in original color nomenclature if a simplest color is present
    some nomenclatures contain multiple words, i need doble comprehension to eval all the words by cell
    '''
    matches = [any(word in color for word in cell.split())
               for cell in ASOS_df2["colour"].values]
    
    ASOS_df2.loc[matches, "colour_simp"] = color

In [94]:
ASOS_df2.head()

for dict_key, dict_value in amz_colors_dict.items(): 
    matches = [cell in dict_key for cell in ASOS_df2["colour"].values]
    
    '''
    check in original color nomenclature if a simplest color is present
    some nomenclatures contain multiple words, i need doble comprehension to eval all the words by cell
    '''
    matches = [any(word in dict_key for word in cell.split())
               for cell in ASOS_df2["colour"].values]
    
    ASOS_df2.loc[matches, "colour_simp"] = dict_value


In [163]:
ASOS_df3=ASOS_df2.copy()
ASOS_df3['colour_simp2'] = ASOS_df3['colour'].apply(color_dict_simplify)

In [164]:
ASOS_df3['colour_simp'] = ASOS_df3.apply(update_colour_simp, axis=1)

In [165]:
crazy_ASOS_men=ASOS_df3[ASOS_df3["colour_simp"]==""]

print(len(crazy_ASOS_men))
display(crazy_ASOS_men.sample(6))

#crazy_ASOS_men.to_csv('../data/clean/crazy_ASOS_men.csv', index=False)  # Change 'output.csv' to your desired file name

55


Unnamed: 0,brand,description,price,colour,colour_simp,colour_simp2
1506,ASOS DESIGN,Camiseta gris extragrande con estampado de dib...,"23,50 €",griffin,,
1584,Timberland,Camiseta marrón de corte slim con logo pequeño...,"21,00 €",caoba claro,,
886,Timberland,Camiseta azul cielo con logo pequeño de Timber...,"17,50 €",skyway,,
1398,ASOS DESIGN,Top beis de tirantes holgado con cuello de pic...,"13,00 €",grava,,
1250,Berghaus,Camiseta color arena con estampado de montaña ...,"20,50 €",sand dune,,
391,ADPT,Camiseta beis lavado extragrande sin mangas de...,"7,00 €",loza,,


In [166]:
ASOS_df4 = ASOS_df3.copy()
ASOS_df4 = ASOS_df4.loc[:, ['brand', 'description', 'price', "colour_simp"]]
ASOS_df4 = ASOS_df4.rename(columns={'colour_simp': 'colour'})
ASOS_df4.colour =[row.lstrip() for row in ASOS_df4.colour]

In [167]:
temp_yes=ASOS_df4[ASOS_df4["colour"]!=""]
temp_no=ASOS_df4[ASOS_df4["colour"]==""]

for i in range(len(collapsed_dict2)):
    keyword = list(collapsed_dict2.keys())[i]
    temp_no.loc[temp_no["description"].str.contains(keyword), "colour"] = keyword

ASOS_df4 = pd.concat([temp_yes, temp_no], axis=0)
print(len(ASOS_df4[ASOS_df4["colour"]==""]))
ASOS_df4=ASOS_df4[ASOS_df4["colour"]!=""]

22


In [170]:
ASOS_df4["colour"][ASOS_df4["colour"]=="turquesa"]="azul"
ASOS_df4["colour"][ASOS_df4["colour"]=="marfil"]="beis"

In [171]:
ASOS_df4["price"]=[VAL.replace(",", ".") for VAL in ASOS_df4["price"]]
ASOS_df4["price"]=[VAL.replace("€", "") for VAL in ASOS_df4["price"]]
ASOS_df4["price"]=[float(VAL) for VAL in ASOS_df4["price"]]

In [172]:
ASOS_df5=ASOS_df4.copy()
ASOS_df5.brand = [row.replace(".", "") for row in ASOS_df5.brand]

In [173]:
#asos_men_clothes_clean=ASOS_df5.copy()
#asos_men_clothes_clean.to_csv('../data/clean/asos_men_clothes_clean.csv', index=False)