In [1]:
import os 
import shutil
import pickle
import datetime
import numpy as np
import pandas as pd
from datetime import date, timedelta

import matplotlib.pyplot as plt
pd.set_option('display.float_format', lambda x: '%.4f' % x)

## Load Raw Data

### Customer Info

In [2]:
# customers
path = '../data/processed'
customers = pd.read_pickle(os.path.join(path,'customers.pkl'))
customers.head(2)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,0,0.0,0.0,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,1,0.0,0.0,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...


### Index to Customer_ID

In [3]:
# mapping index
path = '../data/processed'
infile = open(os.path.join(path,'index_to_cusId.pkl'),'rb')
index_to_id_dict = pickle.load(infile)
infile.close()

In [4]:
customers["customer_id"] = customers["customer_id"].map(index_to_id_dict)
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


### Transaction File

In [5]:
path = '../data/processed'
trans = pd.read_pickle(os.path.join(path,'transactions.pkl'))
trans["customer_id"] = trans["customer_id"].map(index_to_id_dict)
trans.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.0508,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.0305,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.0152,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.0169,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.0169,2


### Articles Description

In [6]:
# articles description
articles = pd.read_pickle(os.path.join(path,'articles.pkl'))
articles.head(2)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.


## Select n Customer

In [7]:
# Set Number of selected Customers
n = 50

In [8]:
import names
# temp = trans[trans.t_dat<'2020-09-15']\
#             .groupby(['customer_id'])\
#             .agg({'article_id':'nunique'})\
#             .rename(columns={'article_id':'quantity'})\
#             .reset_index()

# # temp['quantity'] = temp['quantity'].clip(0,5)
# temp['quantity'] = temp['quantity'].astype('int8')
# temp = temp[temp['quantity'] <100]
# temp.sort_values(by=['quantity'],ascending=False,inplace=True)

# # Select N Customers
# cus_list = temp[['customer_id']][:n].reset_index(drop=True)

# random name
cus_name = {}
for i in range(n):
    cus_name[i] = names.get_full_name()
cus_name = pd.DataFrame.from_dict(cus_name,orient='index',columns=['name'])



cus_path = '../static/model'
select_cus = pd.read_csv(os.path.join(cus_path,'submission50_non.csv'))

# # Final list
# select_cus = customers[customers.customer_id.isin(cus_list.customer_id)].reset_index(drop=True)
select_cus = select_cus.join(cus_name)



select_cus.head()

Unnamed: 0,customer_id,prediction,recent_purchase,name
0,0971164b9c437c786a46e092ae17dd1d63894cf6d0baee...,0756662008 0428291016 0824264001 0878026002 07...,428291016,Michelle Torres
1,0a0f5e47d56428eba1f1fda252c5e35a4202e8e687aeb9...,0554598028 0616453001 0698273001 0888488002 07...,554598001,Eddie Munoz
2,0a12965b9c32c9f72d761e6b65bc1d446ea718419c5419...,0663515005 0689563004 0562252039 0867969002 03...,413707001,Vanessa Biagioni
3,0c5555ac69bd3af2871f203a746667483c8b3df955901e...,0930380001 0698786004 0745058001 0658907011 06...,770211001,Virginia Krueger
4,106b273fc758b3d4d2dd1213c43580a0d8fd99f5e0253e...,0693350001 0671453002 0554757003 0823793002 07...,554757003,Karen Kaiser


## Recent Purchase

In [17]:
select_trans = trans[trans.customer_id.isin(select_cus.customer_id)]
print('Min date: ', select_trans.t_dat.min())
print('Max date: ', select_trans.t_dat.max())
print(f'Total Customers: {select_trans.customer_id.nunique()}')

recent_purchase = pd.DataFrame(select_trans.groupby(['customer_id'])['article_id'].apply(lambda x: list(set(x))))\
                        .reset_index()\
                        .rename(columns={'article_id':'recent_purchase'})
recent_purchase['recent_purchase'] = recent_purchase['recent_purchase'].apply(lambda x: x[:12])
recent_purchase.head()

Min date:  2018-09-20 00:00:00
Max date:  2020-09-22 00:00:00
Total Customers: 50


Unnamed: 0,customer_id,recent_purchase
0,0971164b9c437c786a46e092ae17dd1d63894cf6d0baee...,"[0572998007, 0506166046, 0752512006, 092061000..."
1,0a0f5e47d56428eba1f1fda252c5e35a4202e8e687aeb9...,"[0568601006, 0687365001, 0555326005, 057078100..."
2,0a12965b9c32c9f72d761e6b65bc1d446ea718419c5419...,"[0680263013, 0561814002, 0770383001, 069093600..."
3,0c5555ac69bd3af2871f203a746667483c8b3df955901e...,"[0596517001, 0591334014, 0903910001, 081735300..."
4,106b273fc758b3d4d2dd1213c43580a0d8fd99f5e0253e...,"[0636323001, 0813898002, 0752512006, 087713700..."


## Prediction

In [10]:
submit = pd.read_csv('submissions.csv',dtype=str)
submit = submit[submit.customer_id.isin(select_cus.customer_id)].reset_index(drop=True)
print('total customers: ' ,len(submit))
submit.head()

total customers:  50


Unnamed: 0,customer_id,prediction
0,0971164b9c437c786a46e092ae17dd1d63894cf6d0baee...,0906169002 0925246001 0863595005 0929275001 07...
1,0a0f5e47d56428eba1f1fda252c5e35a4202e8e687aeb9...,0112679048 0111609001 0111593001 0111586001 01...
2,0a12965b9c32c9f72d761e6b65bc1d446ea718419c5419...,0610776002 0877666001 0610776105 0610776001 09...
3,0c5555ac69bd3af2871f203a746667483c8b3df955901e...,0112679048 0111609001 0111593001 0111586001 01...
4,106b273fc758b3d4d2dd1213c43580a0d8fd99f5e0253e...,0112679048 0111609001 0111593001 0111586001 01...


In [11]:
submit.groupby(['prediction']).agg({'customer_id':'count'}).sort_values(by='customer_id',ascending=False)

Unnamed: 0_level_0,customer_id
prediction,Unnamed: 1_level_1
0112679048 0111609001 0111593001 0111586001 0111565003 0111565001 0110065011 0110065002 0110065001 0108775051 0108775044 0108775015,18
0870328003 0841383002 0873274002 0706016019 0904567002 0568597006 0882899003 0783346018 0640021019 0824995001 0898918002 0717490008,1
0920610005 0920610002 0827635002 0920610001 0720504008 0677930037 0874113005 0873771002 0871997002 0904545002 0805000007 0685814003,1
0918292001 0919365008 0748355003 0868823008 0856270002 0868823007 0868823011 0919365003 0914537001 0898439001 0898692006 0572998013,1
0917293003 0917293004 0921266001 0901330002 0805000001 0685813043 0938804001 0748566027 0876009002 0677930023 0865076001 0906305001,1
0916468002 0554450036 0554450001 0765743007 0829643003 0902419001 0554450046 0868134001 0685814022 0554450043 0911214001 0673677023,1
0915529003 0865929014 0904416001 0904416002 0865929007 0574109042 0902388001 0911870002 0865929004 0610776001 0907409001 0568601043,1
0914441001 0706016019 0914441005 0865929002 0818754001 0574109011 0818754004 0762143001 0767473009 0902518005 0574109039 0640021019,1
0909014001 0871997001 0871997002 0915611004 0855239001 0916000003 0677930077 0912095007 0898692006 0903473001 0910824001 0815808001,1
0906169002 0925246001 0863595005 0929275001 0794575002 0918517001 0906226002 0811525001 0730683062 0889550002 0908799002 0760084003,1


## Unique Articles

In [18]:
unique_articles = []
for i in submit.prediction:
    unique_articles = unique_articles + i.split()
    
for i in recent_purchase.recent_purchase:
    unique_articles = unique_articles + i 


## Get image to new directory

In [19]:
from tqdm import tqdm
from PIL import Image

save_path = '../static/model/images'
raw_path = '../data/raw/images'

width = 1166
hieght = 1750
reduce = 0.3

# Clear all file in the directory
for f in os.listdir(save_path):
    os.remove(os.path.join(save_path, f))

# Copy using picture
for i in unique_articles:
    src = os.path.join(raw_path,i[:3],i+'.jpg')
    dst = os.path.join(save_path,i+'.jpg')
    if os.path.exists(src):
        shutil.copyfile(src, dst)
    else:
        print(f'Missing articles: {i}')

# Resize Image
multiple_images = os.listdir(save_path)

# Looping over all of the images:
for image in multiple_images:
    img = Image.open(os.path.join(save_path, image))
    img.thumbnail(size=(width*reduce,hieght*reduce))
#     print(img)
    # We would run the command below to save the images:
    img.save(os.path.join(save_path, image), optimize=True)
print('Done. Resize Image')

Missing articles: 0610776002
Missing articles: 0610776001
Missing articles: 0179208001
Missing articles: 0610776001
Missing articles: 0179208001
Missing articles: 0610776001
Missing articles: 0408875001
Missing articles: 0408875001
Missing articles: 0408875001
Missing articles: 0901588001
Missing articles: 0408875001
Done. Resize Image


## Selected Articles Info

In [20]:
select_articles = articles[articles.article_id.isin(unique_articles)].reset_index(drop=True)
print('Total Selected Articles: ',len(select_articles))
select_articles.head()

Total Selected Articles:  841


Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


## Save Selected Files

In [36]:
select_articles['prod_name'][select_articles.article_id=='0108775015'].values[0]

'Strap top'

In [29]:
select_articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [22]:
temp = submit.merge(recent_purchase,on='customer_id',how='left')
temp['recent_purchase'] = temp['recent_purchase'].apply(lambda x: ' '.join(x))
temp.head()

Unnamed: 0,customer_id,prediction,recent_purchase
0,0971164b9c437c786a46e092ae17dd1d63894cf6d0baee...,0906169002 0925246001 0863595005 0929275001 07...,0572998007 0506166046 0752512006 0920610002 07...
1,0a0f5e47d56428eba1f1fda252c5e35a4202e8e687aeb9...,0112679048 0111609001 0111593001 0111586001 01...,0568601006 0687365001 0555326005 0570781008 05...
2,0a12965b9c32c9f72d761e6b65bc1d446ea718419c5419...,0610776002 0877666001 0610776105 0610776001 09...,0680263013 0561814002 0770383001 0690936006 06...
3,0c5555ac69bd3af2871f203a746667483c8b3df955901e...,0112679048 0111609001 0111593001 0111586001 01...,0596517001 0591334014 0903910001 0817353003 07...
4,106b273fc758b3d4d2dd1213c43580a0d8fd99f5e0253e...,0112679048 0111609001 0111593001 0111586001 01...,0636323001 0813898002 0752512006 0877137002 08...


In [23]:
final_save_path = '../static/model'

temp.to_csv(os.path.join(final_save_path,'prediction.csv'))
select_articles.to_csv(os.path.join(final_save_path,'articles.csv'))
# recent_purchase.to_csv(os.path.join(final_save_path,'recent_purchase.csv'))
select_cus.to_csv(os.path.join(final_save_path,'customers.csv'))

In [None]:
{key:"{:,}".format(overall_data[overall_data.channel == key]['annual_accounts'].values[0]) for key in overall_data.channel}