# Sales Data Analysis: Querying with Pandas

In [2]:
import numpy as np
import pandas as pd

In [6]:
df1 = pd.read_csv(r"currency_rates_082022_EUR.csv")

In [11]:
df1.head()

Unnamed: 0,currency,date,rate
0,EUR,8/1/2020,1.0
1,EUR,8/2/2020,1.0
2,EUR,8/3/2020,1.0
3,EUR,8/4/2020,1.0
4,EUR,8/5/2020,1.0


In [12]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220 entries, 0 to 219
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   currency  220 non-null    object 
 1   date      220 non-null    object 
 2   rate      220 non-null    float64
dtypes: float64(1), object(2)
memory usage: 5.3+ KB


In [13]:
df1.shape

(220, 3)

#### Making a new feature with the data type datetime

In [53]:
df1['date2'] = pd.to_datetime(df1["date"], format='%d-%m-%Y')
df1['date2']

0     2020-01-08
1     2020-02-08
2     2020-03-08
3     2020-04-08
4     2020-05-08
         ...    
215   2020-06-08
216   2020-07-08
217   2020-08-08
218   2020-09-08
219   2020-10-08
Name: date2, Length: 220, dtype: datetime64[ns]

In [15]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220 entries, 0 to 219
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   currency  220 non-null    object        
 1   date      220 non-null    object        
 2   rate      220 non-null    float64       
 3   date2     220 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 7.0+ KB


In [54]:
df1.head()

Unnamed: 0,currency,date,rate,date2
0,EUR,8/1/2020,1.0,2020-01-08
1,EUR,8/2/2020,1.0,2020-02-08
2,EUR,8/3/2020,1.0,2020-03-08
3,EUR,8/4/2020,1.0,2020-04-08
4,EUR,8/5/2020,1.0,2020-05-08


In [10]:
df2 = pd.read_csv(r"sales_082022.csv")

In [16]:
df2.head()

Unnamed: 0,buyer_id,buyer_country,seller_id,seller_country,product_code,category,brand,currency,price,date
0,qzpjsi9t0o,GB,6oufpaot,GB,ydecnydr6obf,Dresses,,GBP,19.11,2020-08-03
1,o79pns1qwo,GB,50xki2yg,GB,38chj507o6h2,Outerwear,Adidas Originals,GBP,15.92,2020-08-09
2,sjgbjdkhqx,GB,50xki2yg,GB,i9ynyxj5tdp8,Tops - Mens,,GBP,6.37,2020-08-07
3,o79pns1qwo,GB,s0p322hh,GB,t50xe7moye8v,Tops - Mens,Berghaus,GBP,25.27,2020-08-09
4,o79pns1qwo,GB,dpkx192v,GB,hp4r9bjq68af,Bottoms - Womens,,GBP,8.9,2020-08-09


In [17]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237671 entries, 0 to 237670
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   buyer_id        237671 non-null  object 
 1   buyer_country   237671 non-null  object 
 2   seller_id       237671 non-null  object 
 3   seller_country  237671 non-null  object 
 4   product_code    237671 non-null  object 
 5   category        237655 non-null  object 
 6   brand           119330 non-null  object 
 7   currency        237671 non-null  object 
 8   price           237671 non-null  float64
 9   date            237671 non-null  object 
dtypes: float64(1), object(9)
memory usage: 18.1+ MB


#### How many brands have sales between 35 and 55?

In [18]:
a = df2.groupby(["brand"]).filter(lambda x: (len(x) >= 35) & (len(x) <= 55))
len(a["brand"].unique())

81

#### How many sales in the Jewellery category have no brand?

In [19]:
b = df2[(df2["category"]=="Jewellery")]["brand"].isnull().count()
b

14923

In [20]:
df2["brand"].value_counts()

brand
Nike                  9575
Brandy Melville       5065
Topshop               4561
PrettyLittle Thing    4136
Adidas                3905
                      ... 
Preen                    1
Wacoal                   1
Shu Uemura               1
Parker                   1
Vita Fede                1
Name: count, Length: 1134, dtype: int64

#### How many pairs of shoes were bought by Australian buyers?

In [21]:
df2[(df2["buyer_country"]=="AU") & (df2["category"]=="Shoes")].shape[0]

658

In [22]:
df2.groupby(["brand","price"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,buyer_id,buyer_country,seller_id,seller_country,product_code,category,currency,date
brand,price,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
& Other Stories,1.05,nezegsi0do,GB,h8k2c4pi,GB,m9effzbetuyu,Beauty,GBP,2020-08-03
& Other Stories,1.09,e085x49lji,GB,y05o6ci9,GB,d2q5jtu55xfp,Beauty,GBP,2020-08-03
& Other Stories,3.34,hqgglirkl0,US,7r3z9ni8,US,cpzubnkyfimm,Jewellery,USD,2020-08-05
& Other Stories,5.33,uvqmjo5s0h,GB,oihflodt,GB,p4nr23676zu9,Accessories,GBP,2020-08-08
& Other Stories,5.42,asmyflngwx,GB,8aynm971,GB,3khzbmwg7kti,Tops - Womens,GBP,2020-08-07
...,...,...,...,...,...,...,...,...,...
mixxmix,38.60,4e5ghiilam,US,02rng3fz,US,c4egj3vklpf1,Tops - Womens,USD,2020-08-03
philosophy,14.16,zfhydakqn9,US,t5sb17qv,US,2bb2sw7ciihb,Beauty,USD,2020-08-08
philosophy,22.50,o4c80ddl18,US,sk94plsd,US,giya7hvheeqd,Beauty,USD,2020-08-03
philosophy,28.04,y960w6f4lw,US,wm7c3qx3,US,s0mceevz5kto,Beauty,USD,2020-08-06


#### What is the number of sales for Adidas or Adidas Originals brand with a price more than 100?

In [61]:
df2[((df2["brand"]=="Adidas") | (df2["brand"]=="Adidas Originals")) & (df2["price"] > 100)]

Unnamed: 0,buyer_id,buyer_country,seller_id,seller_country,product_code,category,brand,currency,price,date,date2
519,jst7a6ik8x,US,eojrnt23,US,q5ebzqzkv66e,Shoes,Adidas,USD,100.86,2020-08-09,2020-08-09
2660,ohu1sxdy0h,GB,xuz8s3rj,GB,zihn0f365zq9,Tops - Mens,Adidas,GBP,114.39,2020-08-04,2020-08-04
5876,laow8waul5,US,zqi2ltll,US,siaucl0nbb73,Outerwear,Adidas,USD,114.39,2020-08-05,2020-08-05
22248,stglehmmo5,GB,jg7ipeli,GB,8q1a11svj3uv,Shoes,Adidas,GBP,198.17,2020-08-03,2020-08-03
23111,tiqqf4tkly,RoW,2y3hurds,GB,nbiw3uib1fdc,Tops - Mens,Adidas,GBP,164.06,2020-08-06,2020-08-06
...,...,...,...,...,...,...,...,...,...,...,...
234151,s0n08yxt1s,US,p7tlp937,US,k4dlbrsnc87v,Shoes,Adidas,USD,178.85,2020-08-06,2020-08-06
235082,o1m8wzrrwe,GB,9ij0jfxk,GB,bx94bq3kmihw,Shoes,Adidas,GBP,141.51,2020-08-06,2020-08-06
235340,2pwfw3qe7q,GB,s0rn7cam,GB,xhshg6ldz31b,Shoes,Adidas,GBP,114.79,2020-08-05,2020-08-05
236590,5mb4t331ed,GB,mwfnyt0j,GB,rbpjoat6qpgz,Shoes,Adidas,GBP,221.86,2020-08-06,2020-08-06


#### How many sales were made by GB buyers from GB sellers? What is the total price?

In [24]:
df2[(df2["buyer_country"]=="GB") & (df2["seller_country"]=="GB")]

Unnamed: 0,buyer_id,buyer_country,seller_id,seller_country,product_code,category,brand,currency,price,date
0,qzpjsi9t0o,GB,6oufpaot,GB,ydecnydr6obf,Dresses,,GBP,19.11,2020-08-03
1,o79pns1qwo,GB,50xki2yg,GB,38chj507o6h2,Outerwear,Adidas Originals,GBP,15.92,2020-08-09
2,sjgbjdkhqx,GB,50xki2yg,GB,i9ynyxj5tdp8,Tops - Mens,,GBP,6.37,2020-08-07
3,o79pns1qwo,GB,s0p322hh,GB,t50xe7moye8v,Tops - Mens,Berghaus,GBP,25.27,2020-08-09
4,o79pns1qwo,GB,dpkx192v,GB,hp4r9bjq68af,Bottoms - Womens,,GBP,8.90,2020-08-09
...,...,...,...,...,...,...,...,...,...,...
237661,1bge6csm05,GB,uiuddd57,GB,otroego4614j,Tops - Womens,,GBP,31.77,2020-08-05
237665,e5vfphshsu,GB,6cnezohx,GB,lro4soook6h2,Accessories,Hugo Boss,GBP,33.85,2020-08-08
237666,p7ibf7soot,GB,ss7jy9ks,GB,hzc7pmjxmsb6,Bottoms - Womens,,GBP,17.11,2020-08-06
237668,k5hmcfxcs6,GB,n2ez0xph,GB,5afe7yhdpcmz,Other,,GBP,58.28,2020-08-05


In [25]:
a = df2[(df2["buyer_country"]=="GB") & (df2["seller_country"]=="GB")]
a["price"].sum()

3184495.5800000005

#### Making a new date2 column in datetime format

In [27]:
df2['date2'] = pd.to_datetime(df2["date"], format='%Y-%m-%d')
df2['date2']

0        2020-08-03
1        2020-08-09
2        2020-08-07
3        2020-08-09
4        2020-08-09
            ...    
237666   2020-08-06
237667   2020-08-05
237668   2020-08-05
237669   2020-08-05
237670   2020-08-06
Name: date2, Length: 237671, dtype: datetime64[ns]

In [28]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237671 entries, 0 to 237670
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   buyer_id        237671 non-null  object        
 1   buyer_country   237671 non-null  object        
 2   seller_id       237671 non-null  object        
 3   seller_country  237671 non-null  object        
 4   product_code    237671 non-null  object        
 5   category        237655 non-null  object        
 6   brand           119330 non-null  object        
 7   currency        237671 non-null  object        
 8   price           237671 non-null  float64       
 9   date            237671 non-null  object        
 10  date2           237671 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(9)
memory usage: 19.9+ MB


In [29]:
df2.shape

(237671, 11)

#### Merging the datasets according to the currency and date2 columns

In [30]:
pd.merge(df1, df2, how='inner', on=["currency","date2"])

Unnamed: 0,currency,date_x,rate,date2,buyer_id,buyer_country,seller_id,seller_country,product_code,category,brand,price,date_y
0,EUR,8/8/2020,1.00000,2020-08-08,oagt2ozal7,IE,b57xo98q,IE,2l6ze3xh4roy,Jewellery,,21.25,2020-08-08
1,EUR,8/8/2020,1.00000,2020-08-08,zos594tze5,IT,spoetwqz,IT,vlbfdpb8pke2,Other,,26.92,2020-08-08
2,EUR,8/8/2020,1.00000,2020-08-08,xpbfdqczlk,IT,8b0q1eqs,IT,3hkn59tx2qpi,Shoes,,29.68,2020-08-08
3,EUR,8/8/2020,1.00000,2020-08-08,se9ly22dm4,IT,6kda5j6s,IT,whinz3le49i7,Accessories,,16.84,2020-08-08
4,EUR,8/8/2020,1.00000,2020-08-08,ujmyyjvqp6,IT,6kda5j6s,IT,1igw02musrfy,Accessories,,42.91,2020-08-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...
36536,DKK,8/8/2020,7.44830,2020-08-08,qc0mfiohbu,RoW,0wyc03qe,RoW,u8lk4w5l9aa4,Outerwear,,131.53,2020-08-08
36537,PLN,8/8/2020,4.41360,2020-08-08,szgbzi5lif,RoW,b8lruwrf,RoW,2ynovr4yjo8z,Tops - Mens,,154.26,2020-08-08
36538,JPY,8/8/2020,124.87340,2020-08-08,lfsyu3wu59,GB,qi7xypay,RoW,eg8hv187dvy2,Accessories,,11155.17,2020-08-08
36539,NOK,8/8/2020,10.64540,2020-08-08,wnmihth4io,GB,5yogzc17,RoW,wrgq88ujwh5d,Bottoms - Womens,,533.06,2020-08-08


#### How many sales were made from US buyers by US sellers

In [31]:
a = df2[(df2["buyer_country"]=="US") & (df2["seller_country"]=="US")]
a

Unnamed: 0,buyer_id,buyer_country,seller_id,seller_country,product_code,category,brand,currency,price,date,date2
23,94lh9gu3bu,US,dhsa2q6o,US,1m846u2l2dyl,Jewellery,,USD,16.29,2020-08-03,2020-08-03
24,fmdcxftbp1,US,dhsa2q6o,US,9xo46id3ggkm,Tops - Womens,,USD,18.35,2020-08-06,2020-08-06
79,nk167akpfw,US,zhu1ajrz,US,431v9gn7ouiw,Music,,USD,13.44,2020-08-03,2020-08-03
80,nk167akpfw,US,zhu1ajrz,US,36o939nzswpa,Music,,USD,5.53,2020-08-03,2020-08-03
81,nk167akpfw,US,zhu1ajrz,US,8mli531pzaba,Music,,USD,5.26,2020-08-03,2020-08-03
...,...,...,...,...,...,...,...,...,...,...,...
237660,f9jbvr90xu,US,quu0ky19,US,50vyfd46inih,Shoes,,USD,25.84,2020-08-06,2020-08-06
237662,say6ah2869,US,2jwn7shh,US,dfo3ztaga4de,Accessories,,USD,9.15,2020-08-05,2020-08-05
237663,jji9og2wj0,US,w8g31gsr,US,gjwpt7hxl03u,Shoes,American Apparel,USD,33.56,2020-08-04,2020-08-04
237667,2hg8zirueq,US,bk6apusi,US,fskn84lfyrvz,Home,,USD,69.00,2020-08-05,2020-08-05


#### Show the brands and their total sale amounts and sort the list descending

In [37]:
grouped = df2.groupby('brand').sum(numeric_only=True)
grouped


Unnamed: 0_level_0,price
brand,Unnamed: 1_level_1
& Other Stories,4615.52
032c,376.78
11 Degrees,432.76
1822 Denim,150.78
3.1 Phillip Lim,356.16
...,...
ghd,1965.11
kensie,23.45
liquid blue,2396.47
mixxmix,225.45


In [59]:
grouped.price.sort_values(ascending=False)

brand
Nike                  373559.23
Adidas                110351.33
American Vintage      105227.77
Brandy Melville       100055.63
Dr. Martens            84418.65
                        ...    
Ann Demeulemeester         6.19
3LAB                       5.57
Popular Sports             5.26
360 Cashmere               3.36
36572                      1.12
Name: price, Length: 1134, dtype: float64

#### What is the name of the product with maximum sale?

In [51]:
grouped['price'].idxmax()

'Nike'

In [57]:
grouped[grouped.index==grouped['price'].idxmax()]   # show the record for the max value

Unnamed: 0_level_0,price
brand,Unnamed: 1_level_1
Nike,373559.23


END OF THE PROJECT