In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from xgboost import XGBRegressor

In [None]:
df = pd.read_csv('cloth_prices.csv', sep=';')

In [None]:
df

In [None]:
Y = df['Price']  # dependent variable
X = df[['Item', 'Brand', 'Usage', 'Size']]  # independent variable

In [None]:
kf = KFold(n_splits=4, random_state=0, shuffle=True)
kf.get_n_splits(X)

In [None]:
X_num = np.array(X)
y_num = np.array(Y)

In [None]:
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X_num[train_index], X_num[test_index]
    y_train, y_test = y_num[train_index], y_num[test_index]

In [None]:
model = XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)

In [None]:
model.fit(X_train, y_train, verbose=True)

## Fake Data Generator

In [2]:
df_look = pd.read_csv('dataset/thelook.csv')

In [3]:
df_look.head(3)

Unnamed: 0,id,product_id,created_at,sold_at,cost,product_category,product_name,product_brand,product_retail_price,product_department,product_sku,product_distribution_center_id,days_storage
0,72384,13844,2021-05-04 00:18:09 UTC,2021-05-08 00:56:09 UTC,2.76804,Accessories,(ONE) 1 Satin Headband,Funny Girl Designs,6.99,Women,2A3E953A5E3D81E67945BCE5519F84C8,7,4
1,36195,14086,2021-05-07 20:48:50 UTC,2021-06-04 11:23:50 UTC,2.24625,Accessories,(One) CHEER Rhinestone Studded Stretch Headband,Funny Girl Designs,5.99,Women,8EFA9015A4EF4632A954E820ECA834AD,7,27
2,64993,14086,2020-05-04 20:24:01 UTC,2020-05-13 12:20:01 UTC,2.24625,Accessories,(One) CHEER Rhinestone Studded Stretch Headband,Funny Girl Designs,5.99,Women,8EFA9015A4EF4632A954E820ECA834AD,7,8


In [4]:
df_look.sort_values(['days_storage'], ascending=[True],inplace=True)

In [5]:
p1=[0.4, 0.3, 0.15, 0.1, 0.05]
p2=[0.35, .35, 0.15, 0.1, 0.05]
p3=[0.2, 0.3, 0.25, 0.2, 0.05]
p4=[0.1, 0.3, 0.35, 0.15, 0.1]
p5=[0.05, 0.1, 0.15, 0.3, 0.4]

In [6]:
int(np.round(len(df_look['cost'])/5,0))

5450

In [7]:
df_look_1=df_look.iloc[:5450,:]

In [8]:
df_look_1=df_look_1['cost'].apply(
    lambda x: x * np.random.choice(np.arange(.9, 1.1, 0.05), p=p1)
)
df_look_1

20696    29.357639
19351     5.314680
21715    13.053000
21702    87.674400
19353     6.059550
           ...    
3118     47.389861
3124     18.885637
186       8.009935
27127    15.921150
1021     37.910000
Name: cost, Length: 5450, dtype: float64

In [9]:
df_look_2=df_look.iloc[5450:10900,:]
df_look_2=df_look_2['cost'].apply(
    lambda x: x * np.random.choice(np.arange(.9, 1.1, 0.05), p=p2)
)
df_look_2

184       4.442580
24878    17.271000
4099     68.888000
24881    13.291200
3013     21.143011
           ...    
812       6.247934
8294     48.479930
835      20.493000
22440    18.488998
17634    27.423495
Name: cost, Length: 5450, dtype: float64

In [10]:
df_look_3=df_look.iloc[10900:16350,:]
df_look_3=df_look_3['cost'].apply(
    lambda x: x * np.random.choice(np.arange(.9, 1.1, 0.05), p=p3)
)
df_look_3

6082     28.964000
3266     72.758400
15283    37.762500
9322     29.799000
21815    12.584403
           ...    
1808     18.544570
24838    52.455000
25855    27.266400
5446     24.264900
22989    15.024707
Name: cost, Length: 5450, dtype: float64

In [11]:
df_look_4=df_look.iloc[16350:21800,:]
df_look_4=df_look_4['cost'].apply(
    lambda x: x * np.random.choice(np.arange(.9, 1.1, 0.05), p=p4)
)
df_look_4

25571    30.450000
24482    20.649200
1874      9.829080
2189      2.130000
22913    10.784970
           ...    
8918     21.074731
11786    22.193850
381       1.187028
8906     19.123831
12593    54.977012
Name: cost, Length: 5450, dtype: float64

In [12]:
df_look_5=df_look.iloc[21800:27250,:]
df_look_5=df_look_5['cost'].apply(
    lambda x: x * np.random.choice(np.arange(.9, 1.1, 0.05), p=p5)
)
df_look_5

12647     9.229462
8652     65.243145
8841     28.028000
20429    40.366531
8894     68.468400
           ...    
26236    24.597878
18564    28.946026
9308     91.945000
13013    15.495404
3865     20.722591
Name: cost, Length: 5450, dtype: float64

In [13]:
df_look_cost = list(df_look_1) + list(df_look_2)+ list(df_look_3)+list(df_look_4)+list(df_look_5)

In [14]:
df_look=df_look.iloc[:-1,:]

In [15]:
df_look['cost']=df_look_cost

In [16]:
df_look['product_id'].value_counts()

22002    8
8712     7
19805    7
10701    7
13631    6
        ..
6040     1
25164    1
11155    1
20199    1
21899    1
Name: product_id, Length: 17744, dtype: int64

In [17]:
df_priced=df_look.copy()

In [18]:
df_priced['product_brand'].value_counts()

Allegra K         868
Calvin Klein      471
Carhartt          376
Volcom            289
Tommy Hilfiger    284
                 ... 
Konflict            1
Bobi                1
Monologue           1
Motony              1
Heather             1
Name: product_brand, Length: 2324, dtype: int64

In [19]:
df_priced[df_priced['product_brand']=="Allegra K"]['product_brand']

2905     Allegra K
9893     Allegra K
2927     Allegra K
2944     Allegra K
2962     Allegra K
           ...    
4283     Allegra K
15016    Allegra K
15008    Allegra K
14952    Allegra K
14957    Allegra K
Name: product_brand, Length: 868, dtype: object

In [20]:
df_priced.loc[df_priced['product_brand']=="Allegra K", "product_brand"] = "Adidas"

In [21]:
df_priced.loc[df_priced['product_brand']=="Volcom", "product_brand"] = "Puma"
df_priced.loc[df_priced['product_brand']=="Tommy Hilfiger", "product_brand"] = "Nike"

In [22]:
df_priced['product_sku']='Shoes'

In [23]:
import random
usage = ['New', 'Almost New', 'Used', 'Very Used']
for i in range(len(df_priced['product_category'])):
    df_priced['product_category'][i]=random.choice(usage)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_priced['product_category'][i]=random.choice(usage)


In [24]:
for i in range(len(df_priced['product_category'])):
    df_priced['product_retail_price'][i]=int(np.random.choice(np.arange(37, 46, 1)))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_priced['product_retail_price'][i]=int(np.random.choice(np.arange(37, 46, 1)))


In [25]:
df_priced['product_retail_price']=df_priced['product_retail_price'].astype(int)

In [26]:
df_priced

Unnamed: 0,id,product_id,created_at,sold_at,cost,product_category,product_name,product_brand,product_retail_price,product_department,product_sku,product_distribution_center_id,days_storage
20696,390,20184,2021-11-24 08:06:47 UTC,2021-11-25 03:52:47 UTC,29.357639,Very Used,Men's Single Breasted 3 Button Gray Pinstripe ...,Abini,38,Men,Shoes,4,0
19351,43639,25044,2021-04-01 13:53:31 UTC,2021-04-01 17:33:31 UTC,5.314680,Used,SockGuy Men's Mudflap Girl Socks,SockGuy,41,Men,Shoes,2,0
21715,67554,19168,2022-01-14 14:18:08 UTC,2022-01-14 15:49:08 UTC,13.053000,New,John Henry Men's Long Sleeve Crew Neck Sweater,John Henry,39,Men,Shoes,3,0
21702,20047,18802,2021-02-24 00:58:34 UTC,2021-02-24 12:49:34 UTC,87.674400,New,J.C. Rags Men's Reversed Cable Blazer,J.C. Rags,42,Men,Shoes,1,0
19353,4935,25120,2022-01-15 18:41:17 UTC,2022-01-16 11:16:17 UTC,6.059550,Almost New,SockGuy Men's Unicorn Socks,SockGuy,41,Men,Shoes,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26236,9122,25745,2021-04-15 01:14:50 UTC,2021-06-14 00:17:50 UTC,24.597878,Almost New,Bjorn Borg Men's Double Layer Longs,Bjorn Borg,43,Men,Shoes,9,59
18564,25871,9980,2019-04-26 13:36:50 UTC,2019-06-25 00:40:50 UTC,28.946026,Used,White 52 Inch Hooded Terry Robe W/double Stitc...,Spa & Resort,42,Women,Shoes,1,59
9308,58983,21106,2021-01-07 12:03:52 UTC,2021-03-08 08:17:52 UTC,91.945000,Almost New,Paige Premium Denim Men's Stone Canyon Boot Cu...,PAIGE,44,Men,Shoes,7,59
13013,30630,21899,2021-07-31 15:42:54 UTC,2021-09-29 03:44:54 UTC,15.495404,Almost New,Ed Garments Men's Flat Front Pant. 2577,Ed Garments,39,Men,Shoes,7,59


In [27]:
from faker import Faker
fake = Faker('it_IT')

for i in list(df_priced['product_name'].unique()):
    selected_brand=df_priced[df_priced["product_name"]==i]
    brand=list(selected_brand['product_brand'])[0]
    df_priced.loc[df_priced['product_name']==i, "product_name"] =[k.split(' ')[0] + ' ' + fake.name() for k in list(selected_brand['product_name'])]



In [28]:
df_train = df_priced[['product_sku','product_brand', 'product_category', 'product_department','product_name','product_retail_price', 'days_storage', 'cost']]


In [29]:
df_train.rename({"product_sku" : "Item",
                 "product_department": "Gender",
                 "product_brand": "Brand", 
                 "product_name":"Model",
                 "product_category":"Usage",
                 "product_retail_price": "Size", 
                 "days_storage":"Sale_Until",
                 "cost" :"Prize"},axis='columns', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [30]:
df_train

Unnamed: 0,Item,Brand,Usage,Gender,Model,Size,Sale_Until,Prize
20696,Shoes,Abini,Very Used,Men,Men's Luigina Trapani,38,0,29.357639
19351,Shoes,SockGuy,Used,Men,SockGuy Virginia Paolucci,41,0,5.314680
21715,Shoes,John Henry,New,Men,John Laura Boaga,39,0,13.053000
21702,Shoes,J.C. Rags,New,Men,J.C. Bianca Pellegrini,42,0,87.674400
19353,Shoes,SockGuy,Almost New,Men,SockGuy Pasqual Ottino-Pareto,41,0,6.059550
...,...,...,...,...,...,...,...,...
26236,Shoes,Bjorn Borg,Almost New,Men,Bjorn Greco Volta,43,59,24.597878
18564,Shoes,Spa & Resort,Used,Women,White Ernesto Gatto,42,59,28.946026
9308,Shoes,PAIGE,Almost New,Men,Paige Agnolo Onisto-Samele,44,59,91.945000
13013,Shoes,Ed Garments,Almost New,Men,Ed Massimo Trillini,39,59,15.495404


In [31]:
df_train=df_train[df_train['Prize'] < 150]
df_train=df_train[df_train['Sale_Until'] < 50]

In [32]:
df_train.to_csv("thelook2.csv")

In [33]:
df_train['Gender'].replace(['Men', 'Women'],
                        [0, 1], inplace=True)

In [34]:
df_train['Usage'].replace(['New', 'Almost New', 'Used', 'Very Used'],
                        [0, 1,2,3], inplace=True)

In [35]:
df_train

Unnamed: 0,Item,Brand,Usage,Gender,Model,Size,Sale_Until,Prize
20696,Shoes,Abini,3,0,Men's Luigina Trapani,38,0,29.357639
19351,Shoes,SockGuy,2,0,SockGuy Virginia Paolucci,41,0,5.314680
21715,Shoes,John Henry,0,0,John Laura Boaga,39,0,13.053000
21702,Shoes,J.C. Rags,0,0,J.C. Bianca Pellegrini,42,0,87.674400
19353,Shoes,SockGuy,1,0,SockGuy Pasqual Ottino-Pareto,41,0,6.059550
...,...,...,...,...,...,...,...,...
7798,Shoes,VIVILLI,2,1,VIVILLI Griselda Ruggieri-Terragni,38,49,12.699600
24613,Shoes,Sons Of Anarchy,0,0,Bold Dott. Claudio Giannone,39,49,15.932312
3178,Shoes,Alex Evenings,3,1,Plus Giancarlo Barracco,42,49,42.195779
12124,Shoes,MontBell,3,0,MontBell Gianpietro Camuccini,41,49,75.222000


In [36]:
len(list(dict(df_train['Brand'].value_counts().loc[lambda x : x>30]).keys()))

148

In [37]:
result = dict()
for i in list(dict(df_train['Brand'].value_counts().loc[lambda x : x>30]).keys()):
    result[i]=df_train[df_train['Brand']==i]

In [38]:
def from_dict_value_to_df(d):
    """
    input = dictionary
    output = dataframe as part of all the values from the dictionary
    """
    df = pd.DataFrame()
    for v in d.values():
        df = df.append(v)
    return df

In [39]:
df_brands=from_dict_value_to_df(result)

In [40]:
df_brands['Brand'].replace(list(df_brands['Brand'].unique()),
                        range(len(list(df_brands['Brand'].unique()))), inplace=True)

In [45]:
df_num=df_brands.drop(['Item', 'Model'], axis=1)

In [51]:
mask = pd.to_numeric(df_num['Usage'], errors='coerce').isna()

Unnamed: 0,Item,Brand,Usage,Gender,Model,Size,Sale_Until,Prize
2905,Shoes,0,0,1,Allegra Stefano Gabba,42,0,5.649250
9893,Shoes,0,2,1,Allegra Sig. Lucio Natta,43,0,6.764181
2927,Shoes,0,0,1,Allegra Dott. Anita Gentileschi,41,0,4.488939
2944,Shoes,0,3,1,Allegra Giorgio Zaccardo,42,0,4.392930
2962,Shoes,0,1,1,Allegra Dott. Alfio Cimarosa,45,0,4.050373
...,...,...,...,...,...,...,...,...
20515,Shoes,147,3,0,Elegant Angelo Zeffirelli,42,44,43.150683
20458,Shoes,147,2,0,Classy Tina Orlando,45,45,9.807094
81,Shoes,147,0,0,American Giuliana Palmisano,41,47,5.031048
20448,Shoes,147,0,0,Classy Alfredo Zacchia-Luxardo,44,47,9.257369


In [None]:
# import pickle

# import numpy as np
# import pandas as pd
# from sklearn.model_selection import KFold
# from xgboost import XGBRegressor

In [None]:
# Y = df_train['Prize']  # dependent variable
# X = df_train[['Item', 'Brand', 'Usage', 'Model', 'Size', 'Sale_Until']]  # independent variable

In [None]:
# from pandas import DataFrame
# import matplotlib.pyplot as plt
# from sklearn.cluster import KMeans
# import seaborn as sns

# sns.set(style='whitegrid', rc={'figure.figsize':(11.7,8.27)})
# sns.set_context('poster')

In [None]:
# kmeans = KMeans(n_clusters=4).fit(df_train[['Prize', 'Sale_Until']])
# centroids = kmeans.cluster_centers_
# print(centroids)

In [None]:
# plt.scatter(df_train['Sale_Until'], df_train['Prize'], c= kmeans.labels_.astype(float), s=50, alpha=0.5)
# plt.scatter(centroids[:, 0], centroids[:, 1], c='green', s=50)
# plt.show()