In [182]:
import pandas as pd
import plotly.express as px

In [183]:
path = './dados/shopping_trends.csv'
dados = pd.read_csv(path)
df = pd.DataFrame(dados)

In [184]:
df

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Payment Method,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Preferred Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Credit Card,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Bank Transfer,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Cash,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,PayPal,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Cash,Free Shipping,Yes,Yes,31,PayPal,Annually
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3895,3896,40,Female,Hoodie,Clothing,28,Virginia,L,Turquoise,Summer,4.2,No,Cash,2-Day Shipping,No,No,32,Venmo,Weekly
3896,3897,52,Female,Backpack,Accessories,49,Iowa,L,White,Spring,4.5,No,PayPal,Store Pickup,No,No,41,Bank Transfer,Bi-Weekly
3897,3898,46,Female,Belt,Accessories,33,New Jersey,L,Green,Spring,2.9,No,Credit Card,Standard,No,No,24,Venmo,Quarterly
3898,3899,44,Female,Shoes,Footwear,77,Minnesota,S,Brown,Summer,3.8,No,PayPal,Express,No,No,24,Venmo,Weekly


# Análise de Padrões de Compra

### Produtos mais comprados em cada categoria

In [185]:
df_produto_categoria = df.groupby('Category')[['Item Purchased']].count().sort_values('Item Purchased', ascending=False)
df_produto_categoria

Unnamed: 0_level_0,Item Purchased
Category,Unnamed: 1_level_1
Clothing,1737
Accessories,1240
Footwear,599
Outerwear,324


In [186]:
px.bar(data_frame=df_produto_categoria)

### Diferenças de comportamento entre gêneros 

In [187]:
df_categoria_genero = df[['Gender', 'Category']]

df_categoria_genero.groupby('Gender')[['Category']].value_counts()

Gender  Category   
Female  Clothing        556
        Accessories     392
        Footwear        199
        Outerwear       101
Male    Clothing       1181
        Accessories     848
        Footwear        400
        Outerwear       223
Name: count, dtype: int64

In [188]:
df_categoria_genero = df[['Gender', 'Category']]
categoria_contagem = df_categoria_genero.groupby(['Gender', 'Category']).size()

categoria_contagem = categoria_contagem.reset_index(name='Total de compras')

categoria_mais_vendida = categoria_contagem.loc[
    categoria_contagem.groupby('Gender')['Total de compras'].idxmax()
]

df_totais_genero = categoria_mais_vendida[['Gender', 'Category', 'Total de compras']]

df_totais_genero.columns = ['Gender', 'Categoria mais vendida', 'Total de compras']

total_gasto_por_genero = df.groupby('Gender')['Purchase Amount (USD)'].sum()

df_totais_genero['Total Purchase Amount (USD)'] = df_totais_genero['Gender'].map(total_gasto_por_genero)

df_totais_genero

Unnamed: 0,Gender,Categoria mais vendida,Total de compras,Total Purchase Amount (USD)
1,Female,Clothing,556,75191
5,Male,Clothing,1181,157890


### Diferenças de comportamento entre faixas etárias

In [205]:
df_categoria_idade = df[['Age', 'Category']]
df_categoria_idade.query('Age >= 18 and Age < 25')

Unnamed: 0,Age,Category
1,19,Clothing
3,21,Footwear
20,21,Clothing
24,18,Outerwear
25,18,Clothing
...,...,...
3821,20,Footwear
3830,22,Accessories
3848,22,Accessories
3868,18,Clothing


In [210]:
bins = [18, 25, 35, 50, 100] 
labels = ['18-25', '26-35', '36-50', 'Acima de 50']

df_categoria_idade['Faixa_Etaria'] = pd.cut(df_categoria_idade['Age'], bins=bins, labels=labels, right=False)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Age,Category,Faixa_Etaria
0,55,Clothing,Acima de 50
1,19,Clothing,18-25
2,50,Clothing,Acima de 50
3,21,Footwear,18-25
4,45,Clothing,36-50
...,...,...,...
3895,40,Clothing,36-50
3896,52,Accessories,Acima de 50
3897,46,Accessories,36-50
3898,44,Footwear,36-50


In [211]:
categoria_por_faixa = df_categoria_idade.groupby(['Faixa_Etaria', 'Category']).size()

categoria_por_faixa = categoria_por_faixa.reset_index(name='Total de compras')

categoria_mais_comprada = categoria_por_faixa.loc[
    categoria_por_faixa.groupby('Faixa_Etaria')['Total de compras'].idxmax()
]

categoria_mais_comprada







Unnamed: 0,Faixa_Etaria,Category,Total de compras
1,18-25,Clothing,236
5,26-35,Clothing,340
9,36-50,Clothing,476
13,Acima de 50,Clothing,685


In [212]:
df.loc[(df['Age'] > 25) & (df['Gender'] == 'Male'), ['Age', 'Gender']]

Unnamed: 0,Age,Gender
0,55,Male
2,50,Male
4,45,Male
5,46,Male
6,63,Male
...,...,...
2643,49,Male
2646,33,Male
2647,60,Male
2648,51,Male


### Quais produtos têm maior demanda em cada estação

In [246]:
df_item_season = df.groupby('Season')[['Item Purchased']].value_counts().reset_index(name='Total de Compra')

df_item_season = df_item_season

df_item_season = df_item_season.loc[
    df_item_season.groupby('Season')['Total de Compra'].idxmax()
]

df_item_season

Unnamed: 0,Season,Item Purchased,Total de Compra
0,Fall,Jacket,54
25,Spring,Sweater,52
50,Summer,Pants,50
75,Winter,Sunglasses,52
