# Import dependancies

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import plotly.express as px
import en_core_web_lg

# Prepare data

## Import
Concatanate into one prada products dataframe and clean data as needed
- Reformatted material proportions for easier embedding

In [8]:
denim = pd.read_csv('Prada\\pradaProducts_Denim.csv')
knitwear = pd.read_csv("Prada\\pradaProducts_Knitwear.csv")
leatherClothing = pd.read_csv("Prada\\pradaProducts_Leather-Clothing.csv")
outerwear = pd.read_csv("Prada\\pradaProducts_Outerwear.csv")
suits = pd.read_csv("Prada\\pradaProducts_Suits.csv")

pradaDataset = pd.concat([denim, knitwear, leatherClothing, outerwear, suits], ignore_index=True)
print(pradaDataset.head())
pradaDataset['price'].describe()

                      name                       id   type   price  \
0           Chambray shirt  GEC105_14J2_F0ABR_S_OOO  Denim  1920.0   
1           Chambray pants  GEP403_14J2_F0ABR_S_OOO  Denim  1790.0   
2  Old denim zipper jacket  GEB233_16HE_F0V41_S_232  Denim  1850.0   
3          Old denim jeans  GEP358_16HE_F0V41_S_OOO  Denim  1390.0   
4     Denim blouson jacket  GEB255_14PF_F01AY_S_OOO  Denim  2450.0   

               material                                        description  \
0  65% cotton 35% linen  A soft touch and fine texture make this chambr...   
1  65% cotton 35% linen  These pants with casual allure are made of cha...   
2           100% cotton  Refined details and workmanship define the des...   
3           100% cotton  These five-pocket jeans with a regular fit and...   
4           100% cotton  This minimalist and refined blouson jacket fea...   

       colors                                              sizes  \
0     Natural                             

count      288.000000
mean      3554.340278
std       2834.991089
min       1100.000000
25%       1720.000000
50%       2375.000000
75%       4550.000000
max      23900.000000
Name: price, dtype: float64

In [10]:
materials = []

def deriveMaterials(data):
    materialDict = {}

    materialLi = ""
    splitData = data.split(" ")

    for i, n in enumerate(splitData):
        if "%" in n:
            perc = str(float(n.strip("%")) / 100)
            materialDict[perc] = ""
        else:
            if i == 0:
                material = n.strip().lower().strip(",")
                if material not in materials:
                    materials.append(material)
                return f"1.0-{material}"
            else:
                material = n.strip().lower().strip(",")
                materialDict[perc] += (
                    f"{' ' if len(materialDict[perc]) > 0 else ''}{material}"
                )

    percs = [z for z in materialDict.keys()]
    percs.sort(key=lambda y: float(y), reverse=True)

    # Lists unique materials
    for ii, p in enumerate(percs):
        mat = materialDict.get(p)
        if mat not in materials:
            materials.append(mat)
        materialLi += f"{'|' if ii > 0 else ''}{p}-{mat}"
    return materialLi

print(pradaDataset["material"].head())
pradaDataset["derivedMaterial"] = pradaDataset["material"].apply(deriveMaterials)
pradaDataset["derivedMaterial"].head()

0    65% cotton 35% linen
1    65% cotton 35% linen
2             100% cotton
3             100% cotton
4             100% cotton
Name: material, dtype: object


0    0.65-cotton|0.35-linen
1    0.65-cotton|0.35-linen
2                1.0-cotton
3                1.0-cotton
4                1.0-cotton
Name: derivedMaterial, dtype: object

## Feature Engineering
Manually transform features as needed
- Created material embeddings
- Vectorized description & details
- Label encoded clothing categories

In [11]:
print(len(materials), materials)

# Embed material feature
for row, value in enumerate(pradaDataset["derivedMaterial"]):
    mats = value.split("|")
    for variant in mats:
        proportion = float(variant.split("-")[0])
        material = variant.split("-")[1]
        pradaDataset.at[row, material] = proportion

# Fill n/a values for non-present material columns in rows
pradaDataset = pradaDataset.fillna(value=0.00)

20 ['cotton', 'linen', 'viscose', 'elastane', 'polyester', 'silk', 'cashmere', 'polyamide', 'virgin wool', 'recycled cashmere', 'recycled silk', 'lambskin', 'calfskin', 'goatskin', 'recycled polyamide', 'mohair', 'wool', 'recycled polyester', 'acrylic', 'nylon']


In [13]:
# load nlp model and vectorized descriptions and details
cloneDF = pradaDataset.copy()
nlp = en_core_web_lg.load()


def vectorize(text):
    doc = nlp(text)
    return doc.vector_norm


cloneDF["description"] = pradaDataset["description"].apply(vectorize)
cloneDF["details"] = pradaDataset["details"].apply(vectorize)

cloneDF["description"].head(), cloneDF["details"].head()

(0    3.163368
 1    2.876107
 2    3.113864
 3    3.165244
 4    2.991683
 Name: description, dtype: float64,
 0    2.155862
 1    2.480276
 2    2.090301
 3    2.349754
 4    2.158212
 Name: details, dtype: float64)

In [14]:
label_encoder = LabelEncoder()
cloneDF['type'] = label_encoder.fit_transform(cloneDF['type'])
cloneDF['type'].unique()

array([0, 1, 2, 3, 4])

# Review data and experiment

## Visualize correlations
- Materials : Price (feature significance)
- Description/Detail : Price (feature significance)
- Materials : Category (Classifiability)
- Description/Detail : Category (Classifiability)

In [None]:
for material in materials:
    px.scatter(pradaDataset, x=material, y='price', range_x=[0.2, 1.0]).show()

px.scatter(cloneDF, x="description", y="price").show()
px.scatter(cloneDF, x="details", y="price").show()

px.box(cloneDF, y='description', x='type').show()
px.box(cloneDF, y="details", x="type").show()

for material in materials:
    px.box(pradaDataset, y=material, x='type', range_y=[0.0045, 1.05] , title=f'Amount of {material} used per category').show()

## Split data and prepare models
- Category (type), description, details, material vector for features
- 15 Random Forest Regressor models (nestimators and maxdepth variants)

In [17]:
droppedLabels = [
    "name",
    "id",
    "material",
    "price",
    "colors",
    "sizes",
    "derivedMaterial",
]
targetLabel = ["price"]

X = cloneDF.drop(droppedLabels, axis=1)
Y = cloneDF[targetLabel].to_numpy().ravel()

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    train_size=0.2,
    random_state=2124,
)

In [18]:
estimators = [50, 100, 200, 300, 350]
depths = [None, 1, 2]

models = []
vvv = 0

for est in estimators:
      for depth in depths:
            models.append(RandomForestRegressor(n_estimators=est, max_depth=depth, random_state=2124))
            print(f"{depth} depth, {est} n-estimator model queued. Pos. {vvv}")
            vvv += 1

None depth, 50 n-estimator model queued. Pos. 0
1 depth, 50 n-estimator model queued. Pos. 1
2 depth, 50 n-estimator model queued. Pos. 2
None depth, 100 n-estimator model queued. Pos. 3
1 depth, 100 n-estimator model queued. Pos. 4
2 depth, 100 n-estimator model queued. Pos. 5
None depth, 200 n-estimator model queued. Pos. 6
1 depth, 200 n-estimator model queued. Pos. 7
2 depth, 200 n-estimator model queued. Pos. 8
None depth, 300 n-estimator model queued. Pos. 9
1 depth, 300 n-estimator model queued. Pos. 10
2 depth, 300 n-estimator model queued. Pos. 11
None depth, 350 n-estimator model queued. Pos. 12
1 depth, 350 n-estimator model queued. Pos. 13
2 depth, 350 n-estimator model queued. Pos. 14


## Assess models

In [19]:
for vv, model in enumerate(models):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Model {vv} \nMAE: {mae}\nR2: {r2}\n")

Model 0 
MAE: 1256.2493506493508
R2: 0.4183098713770126

Model 1 
MAE: 1285.87482315125
R2: 0.4808996940084468

Model 2 
MAE: 1242.3115985952927
R2: 0.49079171440625247

Model 3 
MAE: 1214.7251082251082
R2: 0.4511772572535583

Model 4 
MAE: 1283.6924451864822
R2: 0.4744219019851956

Model 5 
MAE: 1248.273249934997
R2: 0.4914646693030541

Model 6 
MAE: 1225.9874458874458
R2: 0.44938785924576263

Model 7 
MAE: 1291.8259340891038
R2: 0.48270848305339065

Model 8 
MAE: 1255.3547148626108
R2: 0.490768111685314

Model 9 
MAE: 1221.1839826839828
R2: 0.45329542037504766

Model 10 
MAE: 1290.27863960007
R2: 0.480220719080661

Model 11 
MAE: 1243.8731251995698
R2: 0.4986240265407976

Model 12 
MAE: 1229.1569573283862
R2: 0.4478904389309257

Model 13 
MAE: 1289.641256901544
R2: 0.4833286736374479

Model 14 
MAE: 1251.8257327349595
R2: 0.49689647409755666

