In [1]:
import os 
import shutil
import pickle
import datetime
import numpy as np
import pandas as pd
from datetime import date, timedelta

import matplotlib.pyplot as plt
pd.set_option('display.float_format', lambda x: '%.4f' % x)

## Load Raw Data

### Customer Info

In [2]:
# customers
path = '../data/processed'
customers = pd.read_pickle(os.path.join(path,'customers.pkl'))
customers.head(2)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,0,0.0,0.0,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,1,0.0,0.0,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...


### Index to Customer_ID

In [3]:
# mapping index
path = '../data/processed'
infile = open(os.path.join(path,'index_to_cusId.pkl'),'rb')
index_to_id_dict = pickle.load(infile)
infile.close()

In [4]:
customers["customer_id"] = customers["customer_id"].map(index_to_id_dict)
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


### Transaction File

In [5]:
path = '../data/processed'
trans = pd.read_pickle(os.path.join(path,'transactions.pkl'))
trans["customer_id"] = trans["customer_id"].map(index_to_id_dict)
trans.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.0508,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.0305,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.0152,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.0169,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.0169,2


### Articles Description

In [6]:
# articles description
articles = pd.read_pickle(os.path.join(path,'articles.pkl'))
articles.head(2)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.


## Select n Customer

In [17]:
# Set Number of selected Customers
n = 50

In [23]:
import names
# temp = trans[trans.t_dat<'2020-09-15']\
#             .groupby(['customer_id'])\
#             .agg({'article_id':'nunique'})\
#             .rename(columns={'article_id':'quantity'})\
#             .reset_index()

# # temp['quantity'] = temp['quantity'].clip(0,5)
# temp['quantity'] = temp['quantity'].astype('int8')
# temp = temp[temp['quantity'] <100]
# temp.sort_values(by=['quantity'],ascending=False,inplace=True)

# # Select N Customers
# cus_list = temp[['customer_id']][:n].reset_index(drop=True)


start_dt =  datetime.datetime(2020,9,15)
end_dt = start_dt + timedelta(7)

temp_tran = trans[(trans.t_dat > start_dt) & (trans.t_dat <= end_dt)]
print('Min date: ', temp_tran.t_dat.min())
print('Max date: ', temp_tran.t_dat.max())
print(f'Total Customers: {temp_tran.customer_id.nunique()}')

target = pd.DataFrame(temp_tran.groupby(['customer_id'])['article_id'].apply(lambda x: list(set(x))))\
                        .reset_index()\
                        .rename(columns={'article_id':'actual'})
# weekly_purchased['weekly_purchased_products'] = weekly_purchased['weekly_purchased_products'].apply(lambda x: list(set(x)))
target['q'] = target['actual'].apply(lambda x: len(x))
target.sort_values(by=['q'],ascending=False,inplace=True)
target.head()

cus_list = target[['customer_id']][target['q'] >=6].sample(n=50)

# random name
cus_name = {}
for i in range(n):
    cus_name[i] = names.get_full_name()
cus_name = pd.DataFrame.from_dict(cus_name,orient='index',columns=['name'])



cus_path = '../static/model'
# select_cus = pd.read_csv(os.path.join(cus_path,'submission50_non.csv'))

# # Final list
select_cus = customers[customers.customer_id.isin(cus_list.customer_id)].reset_index(drop=True)
select_cus = select_cus.join(cus_name)



select_cus.head()

Min date:  2020-09-16 00:00:00
Max date:  2020-09-22 00:00:00
Total Customers: 68984


Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,name
0,01343722529be320bc0955acc1b7c40322e5f801c6b8e0...,1.0,1.0,ACTIVE,Regularly,29.0,92d5ac954048b8a9d119164dce74a57248eb202f3e33f4...,Irwin Patwell
1,0a31e048711796ab3ec0179034fd9d7282101e42c89684...,0.0,0.0,ACTIVE,NONE,42.0,e21cd50431e52341f3b63be1c30e1b06117daa4c0aa333...,Paul Laird
2,0c86d1502d0f9c23157fb89bd97e45587372777b318a83...,1.0,1.0,ACTIVE,Regularly,21.0,fdc3644ca98683b48388ff4ed2e7eb8d4fde8a53397014...,Francine Pajtas
3,0ebd7f3073819f86ba5f2ad025db2b51ae36ad7039d453...,0.0,0.0,ACTIVE,NONE,59.0,2cb9882a67337d022fefcc992d70b976c4017b79e8678d...,John Ricci
4,10f245b02120d5c89b574db89b290983238a0a4e98a935...,0.0,0.0,ACTIVE,NONE,27.0,bf141b6ce945af367969176b2de742fb4fd11196eeddd4...,Frank Johnson


## Recent Purchase

In [24]:
select_trans = trans[trans.customer_id.isin(select_cus.customer_id)]
print('Min date: ', select_trans.t_dat.min())
print('Max date: ', select_trans.t_dat.max())
print(f'Total Customers: {select_trans.customer_id.nunique()}')

recent_purchase = pd.DataFrame(select_trans.groupby(['customer_id'])['article_id'].apply(lambda x: list(set(x))))\
                        .reset_index()\
                        .rename(columns={'article_id':'recent_purchase'})
recent_purchase['recent_purchase'] = recent_purchase['recent_purchase'].apply(lambda x: x[:12])
recent_purchase.head()

Min date:  2018-09-21 00:00:00
Max date:  2020-09-22 00:00:00
Total Customers: 50


Unnamed: 0,customer_id,recent_purchase
0,01343722529be320bc0955acc1b7c40322e5f801c6b8e0...,"[0765739001, 0708246001, 0801384004, 073873100..."
1,0a31e048711796ab3ec0179034fd9d7282101e42c89684...,"[0516859008, 0529008003, 0703366002, 068223800..."
2,0c86d1502d0f9c23157fb89bd97e45587372777b318a83...,"[0637515004, 0716548003, 0547367006, 072346900..."
3,0ebd7f3073819f86ba5f2ad025db2b51ae36ad7039d453...,"[0842314001, 0873884005, 0746330004, 092904200..."
4,10f245b02120d5c89b574db89b290983238a0a4e98a935...,"[0761624001, 0907696002, 0776237010, 080248500..."


## Prediction

In [25]:
submit = pd.read_csv('submissions.csv',dtype=str)
submit = submit[submit.customer_id.isin(select_cus.customer_id)].reset_index(drop=True)
print('total customers: ' ,len(submit))
submit.head()

total customers:  50


Unnamed: 0,customer_id,prediction
0,01343722529be320bc0955acc1b7c40322e5f801c6b8e0...,0894668003 0714790028 0874754002 0714790020 07...
1,0a31e048711796ab3ec0179034fd9d7282101e42c89684...,0794575002 0682550002 0621381016 0621381012 07...
2,0c86d1502d0f9c23157fb89bd97e45587372777b318a83...,0685813001 0685813042 0685813003 0685813040 06...
3,0ebd7f3073819f86ba5f2ad025db2b51ae36ad7039d453...,0919273002 0901330002 0790368006 0919273004 09...
4,10f245b02120d5c89b574db89b290983238a0a4e98a935...,0740519002 0730683052 0852174001 0804992017 08...


In [28]:
submit.groupby(['prediction']).agg({'customer_id':'count'}).sort_values(by='customer_id',ascending=False)

Unnamed: 0_level_0,customer_id
prediction,Unnamed: 1_level_1
0160442007 0884319006 0914441004 0579541001 0863595006 0914441005 0677930086 0160442010 0677930066 0857163001 0579541089 0836142003,1
0910601001 0893141002 0372860002 0910601002 0896851001 0906305001 0904416002 0673677022 0904416001 0759814022 0794575002 0751471022,1
0884319003 0870328002 0870328001 0870328003 0903673001 0715624052 0894668002 0933032002 0884319002 0927957001 0759465001 0905945001,1
0889870001 0750422018 0839332001 0881577001 0904545002 0677930037 0898596007 0678942054 0720125001 0516859008 0892309001 0827635001,1
0893059004 0915529003 0893141002 0865799006 0878190005 0915529005 0893059003 0902419001 0894668002 0874819002 0763469002 0894320001,1
0893059005 0914805002 0871710012 0871517014 0905945002 0905945001 0923569002 0914805006 0903926002 0828912004 0881577002 0874113005,1
0894668003 0714790028 0874754002 0714790020 0748355003 0797988002 0896169005 0815808001 0894668002 0914441002 0767423011 0922381001,1
0894780001 0897221001 0873276001 0914805006 0898684001 0923037001 0873276004 0874819002 0888343003 0905945001 0708138026 0683001020,1
0894780001 0928907001 0905945001 0855080009 0855080001 0855080005 0707269003 0905945002 0904625001 0677930086 0855080011 0893059003,1
0896152001 0719530003 0787946002 0473954008 0806388001 0717490064 0898596002 0751471042 0927922001 0685816001 0821395005 0924453003,1


## Unique Articles

In [29]:
unique_articles = []
for i in submit.prediction:
    unique_articles = unique_articles + i.split()
    
for i in recent_purchase.recent_purchase:
    unique_articles = unique_articles + i 


## Get image to new directory

In [30]:
from tqdm import tqdm
from PIL import Image

save_path = '../static/model/images'
raw_path = '../data/raw/images'

width = 1166
hieght = 1750
reduce = 0.3

# Clear all file in the directory
for f in os.listdir(save_path):
    os.remove(os.path.join(save_path, f))

# Copy using picture
for i in unique_articles:
    src = os.path.join(raw_path,i[:3],i+'.jpg')
    dst = os.path.join(save_path,i+'.jpg')
    if os.path.exists(src):
        shutil.copyfile(src, dst)
    else:
        print(f'Missing articles: {i}')

# Resize Image
multiple_images = os.listdir(save_path)

# Looping over all of the images:
for image in multiple_images:
    img = Image.open(os.path.join(save_path, image))
    img.thumbnail(size=(width*reduce,hieght*reduce))
#     print(img)
    # We would run the command below to save the images:
    img.save(os.path.join(save_path, image), optimize=True)
print('Done. Resize Image')

Missing articles: 0179208001
Missing articles: 0179208001
Missing articles: 0863646002
Missing articles: 0446224011
Done. Resize Image


## Selected Articles Info

In [31]:
select_articles = articles[articles.article_id.isin(unique_articles)].reset_index(drop=True)
print('Total Selected Articles: ',len(select_articles))
select_articles.head()

Total Selected Articles:  957


Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,111586001,111586,Shape Up 30 den 1p Tights,273,Leggings/Tights,Garment Lower body,1010016,Solid,9,Black,...,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,Tights with built-in support to lift the botto...
1,156227002,156227,Box 4p Kneehighs,304,Underwear Tights,Socks & Tights,1010016,Solid,13,Beige,...,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,Four pairs of knee highs. 20 denier.
2,160442007,160442,3p Sneaker Socks,302,Socks,Socks & Tights,1010016,Solid,9,Black,...,Shopbasket Socks,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,"Short, fine-knit socks designed to be hidden b..."
3,160442010,160442,3p Sneaker Socks,302,Socks,Socks & Tights,1010016,Solid,10,White,...,Shopbasket Socks,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,"Short, fine-knit socks designed to be hidden b..."
4,179208001,179208,Control Top 100 den 1p Tights,273,Leggings/Tights,Garment Lower body,1010016,Solid,9,Black,...,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,Matt opaque tights with a control top to hold ...


## Save Selected Files

In [33]:
select_articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,111586001,111586,Shape Up 30 den 1p Tights,273,Leggings/Tights,Garment Lower body,1010016,Solid,9,Black,...,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,Tights with built-in support to lift the botto...
1,156227002,156227,Box 4p Kneehighs,304,Underwear Tights,Socks & Tights,1010016,Solid,13,Beige,...,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,Four pairs of knee highs. 20 denier.
2,160442007,160442,3p Sneaker Socks,302,Socks,Socks & Tights,1010016,Solid,9,Black,...,Shopbasket Socks,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,"Short, fine-knit socks designed to be hidden b..."
3,160442010,160442,3p Sneaker Socks,302,Socks,Socks & Tights,1010016,Solid,10,White,...,Shopbasket Socks,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,"Short, fine-knit socks designed to be hidden b..."
4,179208001,179208,Control Top 100 den 1p Tights,273,Leggings/Tights,Garment Lower body,1010016,Solid,9,Black,...,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,Matt opaque tights with a control top to hold ...


In [34]:
temp = submit.merge(recent_purchase,on='customer_id',how='left')
temp['recent_purchase'] = temp['recent_purchase'].apply(lambda x: ' '.join(x))
temp.head()

Unnamed: 0,customer_id,prediction,recent_purchase
0,01343722529be320bc0955acc1b7c40322e5f801c6b8e0...,0894668003 0714790028 0874754002 0714790020 07...,0765739001 0708246001 0801384004 0738731002 07...
1,0a31e048711796ab3ec0179034fd9d7282101e42c89684...,0794575002 0682550002 0621381016 0621381012 07...,0516859008 0529008003 0703366002 0682238007 05...
2,0c86d1502d0f9c23157fb89bd97e45587372777b318a83...,0685813001 0685813042 0685813003 0685813040 06...,0637515004 0716548003 0547367006 0723469007 05...
3,0ebd7f3073819f86ba5f2ad025db2b51ae36ad7039d453...,0919273002 0901330002 0790368006 0919273004 09...,0842314001 0873884005 0746330004 0929042001 08...
4,10f245b02120d5c89b574db89b290983238a0a4e98a935...,0740519002 0730683052 0852174001 0804992017 08...,0761624001 0907696002 0776237010 0802485004 07...


In [35]:
final_save_path = '../static/model'

temp.to_csv(os.path.join(final_save_path,'prediction_v2.csv'))
select_articles.to_csv(os.path.join(final_save_path,'articles_v2.csv'))
# recent_purchase.to_csv(os.path.join(final_save_path,'recent_purchase.csv'))
select_cus.to_csv(os.path.join(final_save_path,'customers_v2.csv'))