Notebook para análise descritiva dos dados da H&M.

# Libraries

In [1]:
import pandas as pd
import numpy as np

# Data collection

In [2]:
# Location of the cleaned CSV used throughout this notebook
DATA_URL = 'https://raw.githubusercontent.com/lucasquemelli/ds_ao_dev/main/data_clean.csv'

# Keep the downloaded frame untouched and do all further work on a copy
data_raw = pd.read_csv(DATA_URL)
df01 = data_raw.copy()

# Preview the first rows
df01.head()

Unnamed: 0.1,Unnamed: 0,product_id,fit,color,product_name,product_price,style_id,color_id,scrape_datetime,cotton,polyester,spandex,lyocell,rayon,elastomultiester,model_size,jeans_size
0,0,1100162003,regular_fit,denim_gray,essentials_no_2_the_jeans,39.99,1100162,3,2022-11-07 11:20:15,0.99,0.65,0.01,0.0,0.0,0.0,187.0,31/32
1,1,1100162003,regular_fit,denim_gray,,,1100162,3,2022-11-07 11:20:15,0.99,0.65,0.01,0.0,0.0,0.0,187.0,31/32
2,2,1100162002,regular_fit,denim_blue,essentials_no_2_the_jeans,39.99,1100162,2,2022-11-07 11:20:15,0.99,0.65,0.01,0.0,0.0,0.0,189.0,31/32
3,3,1100162002,regular_fit,denim_blue,,,1100162,2,2022-11-07 11:20:15,0.99,0.65,0.01,0.0,0.0,0.0,189.0,31/32
4,4,1024256001,slim_fit,black,slim_jeans,19.99,1024256,1,2022-11-07 11:20:15,0.99,0.65,0.01,0.0,0.0,0.0,185.0,31/32


# 1.0 Data description

## 1.1 Data dimensions

In [3]:
# Dimensions of the working frame as a (rows, columns) tuple
df01.shape

(128, 17)

In [4]:
# Unpack the (rows, columns) tuple once and report each dimension
n_rows, n_cols = df01.shape
print(f"Number of rows: {n_rows}")
print(f"Number of cols: {n_cols}")

Number of rows: 128
Number of cols: 17


## 1.2 Data types

In [5]:
# Inspect the dtype pandas inferred for each column on load
df01.dtypes

Unnamed: 0            int64
product_id            int64
fit                  object
color                object
product_name         object
product_price       float64
style_id              int64
color_id              int64
scrape_datetime      object
cotton              float64
polyester           float64
spandex             float64
lyocell             float64
rayon               float64
elastomultiester    float64
model_size          float64
jeans_size           object
dtype: object

In [6]:
# Remove the 'Unnamed: 0' column (an integer column that mirrors the row number
# in the preview above) and rebuild a clean 0..n-1 index.
# errors='ignore' makes the cell safe to re-run: df01 is rebound here, so on a
# second execution the column is already gone and a plain drop would raise KeyError.
df01 = df01.drop(columns=['Unnamed: 0'], errors='ignore').reset_index(drop=True)

In [7]:
# Parse the scrape timestamp strings into datetime values
df01 = df01.assign(scrape_datetime=pd.to_datetime(df01['scrape_datetime']))

In [8]:
# Re-check the dtypes after the column drop and the datetime conversion
df01.dtypes

product_id                   int64
fit                         object
color                       object
product_name                object
product_price              float64
style_id                     int64
color_id                     int64
scrape_datetime     datetime64[ns]
cotton                     float64
polyester                  float64
spandex                    float64
lyocell                    float64
rayon                      float64
elastomultiester           float64
model_size                 float64
jeans_size                  object
dtype: object

## 1.3 Missing values

In [9]:
# Boolean mask: True wherever a cell is missing
df01.isnull()

Unnamed: 0,product_id,fit,color,product_name,product_price,style_id,color_id,scrape_datetime,cotton,polyester,spandex,lyocell,rayon,elastomultiester,model_size,jeans_size
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True
124,False,False,False,True,True,False,False,False,False,False,False,False,False,False,True,True
125,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
126,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [10]:
# Number of missing values per column
df01.isnull().sum(axis=0)

product_id           0
fit                  0
color                0
product_name        56
product_price       56
style_id             0
color_id             0
scrape_datetime      0
cotton               0
polyester            0
spandex              0
lyocell              0
rayon                0
elastomultiester     0
model_size          39
jeans_size          45
dtype: int64

In [11]:
# Fraction of missing values per column (mean of the boolean mask = count / rows)
df01.isna().mean()

product_id          0.000000
fit                 0.000000
color               0.000000
product_name        0.437500
product_price       0.437500
style_id            0.000000
color_id            0.000000
scrape_datetime     0.000000
cotton              0.000000
polyester           0.000000
spandex             0.000000
lyocell             0.000000
rayon               0.000000
elastomultiester    0.000000
model_size          0.304688
jeans_size          0.351562
dtype: float64

In [12]:
# How many rows would survive if every row with any missing value were dropped?
df_aux01 = df01.dropna(how='any')
df_aux01.shape

(44, 16)

Remover as linhas com valores faltantes teria um impacto muito grande no dataframe: restariam apenas 44 das 128 linhas. Como o HTML do site da H&M muda diariamente, também não seria viável coletar mais dados automaticamente todos os dias.

A solução que proponho é não utilizar as colunas product_name, model_size e jeans_size. Para product_price, verificar os preços agrupados por product_id e, se não variarem muito, atribuir o preço do grupo aos valores faltantes dentro do mesmo product_id.



## 1.4 Missing values replacement

In [13]:
# Show the prices recorded for each product_id (up to the first five rows per
# product), ordered by product_id, to see how prices vary within a product
(
    df01[['product_id', 'product_price']]
    .sort_values(by='product_id')
    .groupby('product_id')
    .head()
)

Unnamed: 0,product_id,product_price
35,690449022,
34,690449022,39.99
25,690449036,39.99
50,690449051,39.99
63,690449056,39.99
...,...,...
1,1100162003,
90,1107750001,44.99
91,1107750001,
126,1114023003,44.99


In [14]:
# Set every missing (NaN) or non-positive price to 0 in a single vectorized step.
# `where` keeps the value where the condition is True and writes 0 elsewhere;
# NaN > 0 evaluates to False, so missing prices are zeroed as well.
# Unlike a label-based row loop over range(len(df01)), this does not depend on
# the frame having a default 0..n-1 index.
# NOTE(review): a product_id whose prices are all missing keeps 0 as its price
# after this step, which hides the gap from later isna() checks.
has_positive_price = df01['product_price'] > 0
df01['product_price'] = df01['product_price'].where(has_positive_price, 0)

In [15]:
# Highest recorded price for each product_id, returned as a flat two-column frame
df_aux = df01.groupby('product_id', as_index=False)['product_price'].max()

In [16]:
# Attach the per-product price to every row; the overlapping price column gets
# the default _x (row-level) / _y (per-product) suffixes
df02 = df01.merge(df_aux, how='left', on='product_id')

In [17]:
# Discard the row-level price and keep the per-product one under the original name
df02 = (
    df02
    .drop(columns=['product_price_x'])
    .rename(columns={'product_price_y': 'product_price'})
)

In [18]:
# Preview the merged frame; product_price is now the last column
df02.head()

Unnamed: 0,product_id,fit,color,product_name,style_id,color_id,scrape_datetime,cotton,polyester,spandex,lyocell,rayon,elastomultiester,model_size,jeans_size,product_price
0,1100162003,regular_fit,denim_gray,essentials_no_2_the_jeans,1100162,3,2022-11-07 11:20:15,0.99,0.65,0.01,0.0,0.0,0.0,187.0,31/32,39.99
1,1100162003,regular_fit,denim_gray,,1100162,3,2022-11-07 11:20:15,0.99,0.65,0.01,0.0,0.0,0.0,187.0,31/32,39.99
2,1100162002,regular_fit,denim_blue,essentials_no_2_the_jeans,1100162,2,2022-11-07 11:20:15,0.99,0.65,0.01,0.0,0.0,0.0,189.0,31/32,39.99
3,1100162002,regular_fit,denim_blue,,1100162,2,2022-11-07 11:20:15,0.99,0.65,0.01,0.0,0.0,0.0,189.0,31/32,39.99
4,1024256001,slim_fit,black,slim_jeans,1024256,1,2022-11-07 11:20:15,0.99,0.65,0.01,0.0,0.0,0.0,185.0,31/32,19.99


In [20]:
# Remove the columns with missing values that will not be used in the analysis
df02 = df02.drop(['product_name', 'model_size', 'jeans_size'], axis='columns')

In [21]:
# Count the missing values left in each column after the replacement steps
df02.isnull().sum(axis=0)

product_id          0
fit                 0
color               0
style_id            0
color_id            0
scrape_datetime     0
cotton              0
polyester           0
spandex             0
lyocell             0
rayon               0
elastomultiester    0
product_price       0
dtype: int64

## 1.5 Data description

### 1.5.1 Numerical data

In [22]:
# Select the float64 columns only; the int64 identifier columns
# (product_id, style_id, color_id) are not included by this filter
numerical_attributes = df02.select_dtypes(include='float64')

In [23]:
# Preview the selected float columns
numerical_attributes.head()

Unnamed: 0,cotton,polyester,spandex,lyocell,rayon,elastomultiester,product_price
0,0.99,0.65,0.01,0.0,0.0,0.0,39.99
1,0.99,0.65,0.01,0.0,0.0,0.0,39.99
2,0.99,0.65,0.01,0.0,0.0,0.0,39.99
3,0.99,0.65,0.01,0.0,0.0,0.0,39.99
4,0.99,0.65,0.01,0.0,0.0,0.0,19.99


### 1.5.2 Categorical data

In [24]:
# Select the object-dtype (string) columns
categorical_attributes = df02.select_dtypes(include='object')

In [25]:
# Preview the selected object-dtype columns
categorical_attributes.head()

Unnamed: 0,fit,color
0,regular_fit,denim_gray
1,regular_fit,denim_gray
2,regular_fit,denim_blue
3,regular_fit,denim_blue
4,slim_fit,black


# 2.0 Feature Engineering

# 3.0 Attributes filtering

# 4.0 EDA (exploratory data analysis)