In [None]:
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm

In [None]:
# Load only the three columns the recommender uses, plus the submission
# template whose 'customer_id' column lists who needs a prediction.
transactions = pd.read_csv(
    "../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv",
    usecols=['t_dat', 'customer_id', 'article_id'],
)
submission = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

# Restrict the history to purchases made after 2020-09-01.
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])
transactions = transactions.loc[transactions['t_dat'] > pd.to_datetime('2020-09-01')]

# Keep only customers with more than MINIMUM_PURCHASES transaction rows in
# that window. GroupBy.size() counts the rows per customer directly, so no
# per-customer Python list has to be built just to measure its length.
# `dfh` therefore holds the per-customer row counts (indexed by customer_id).
MINIMUM_PURCHASES = 4
dfh = transactions.groupby("customer_id")['article_id'].size()
dfh = dfh[dfh > MINIMUM_PURCHASES]
transactions = transactions.loc[transactions['customer_id'].isin(dfh.index)]

# Keep only articles with more than MINIMUM_ARTICLE_SALES rows among the
# remaining transactions.
MINIMUM_ARTICLE_SALES = 10
article_bought_count = (
    transactions
    .groupby('article_id')['t_dat']
    .count()
    .reset_index()
    .rename(columns={'t_dat': 'count'})
)
most_bought_articles = article_bought_count.loc[
    article_bought_count['count'] > MINIMUM_ARTICLE_SALES, 'article_id'
].values
# .copy() detaches the result from the frame it was filtered from, so later
# column assignments on `transactions` do not raise SettingWithCopyWarning.
transactions = transactions[transactions['article_id'].isin(most_bought_articles)].copy()


In [None]:
# Report how many distinct customers and articles survived the filtering.
n_customers = transactions['customer_id'].nunique()
n_articles = transactions['article_id'].nunique()
print('The number of customers: ', n_customers)
print('The number of articles: ', n_articles)

In [None]:
# Lookup tables from raw IDs to dense integer positions. np.unique returns the
# sorted distinct values, so positions follow the sorted order of the IDs.
customers_toindex = {
    customer: position
    for position, customer in enumerate(np.unique(transactions['customer_id'].values))
}
articles_toindex = {
    article: position
    for position, article in enumerate(np.unique(transactions['article_id'].values))
}

# Replace customer IDs by their integer position. 'article_id' is left
# untouched here. assign() builds a new frame instead of writing into one that
# came out of boolean-mask filtering, which avoids SettingWithCopyWarning.
# NOTE: this cell is not idempotent. Re-running it on the already-mapped frame
# would rebuild customers_toindex with integer keys, so lookups by the original
# customer IDs would no longer match.
transactions = transactions.assign(
    customer_id=transactions['customer_id'].map(customers_toindex)
)


In [None]:
# article_customer: Series indexed by article_id; each value is the list of
# distinct customers who bought that article (the article's "vector" used when
# comparing articles).
article_customer = (
    transactions
    .groupby("article_id")['customer_id']
    .apply(lambda buyers: list(set(buyers)))
)

# customer_article: one row per (customer_id, article_id) pair, with the number
# of purchase rows stored in 't_dat'. Rows are ordered by customer and, within
# a customer, from the most-bought article to the least-bought one.
customer_article = (
    transactions
    .groupby(["customer_id", "article_id"])['t_dat']
    .count()
    .reset_index()
    .sort_values(by=['customer_id', 't_dat'], ascending=[True, False])
)

In [None]:
# Preview the first rows of both lookup structures, one after the other.
print(article_customer.head(), customer_article.head(), sep='\n')

In [None]:
# Fallback recommendations: the 12 articles with the most purchase rows after
# 2020-09-14. They fill whatever positions the similarity search leaves open.
default_transactions = transactions[transactions['t_dat'] > pd.to_datetime('2020-09-14')]
default_top12 = (
    default_transactions
    .groupby('article_id')['t_dat']
    .count()
    .reset_index()
    .sort_values(by='t_dat', ascending=False)['article_id']
    .iloc[:12]
)
# Turn every ID into a string with a '0' in front (presumably restoring a
# leading zero lost when the IDs were parsed as integers -- confirm).
default_top12 = ['0' + str(article) for article in default_top12]

In [None]:
# Show the fallback article IDs (strings that already carry the '0' prefix
# added when the list was built).
print(default_top12)

In [None]:
class ItemtoItem:
    """
    Item-to-item recommender based on shared customers.

    Two articles are considered similar when many of the same customers bought
    both of them. For every customer in the submission file the recommender
    looks up to three articles that customer bought, collects the three most
    similar articles for each of them and fills the remaining positions with a
    default list.
    """

    def __init__(self,transactions,submission,customers_toindex,articles_toindex,article_customer,customer_article,default_top12):
        """
        Constructor

        Arguments:
            transactions:       transaction frame (stored, not read by any method of this class)
            submission:         frame with a 'customer_id' column; one prediction is produced per row
            customers_toindex:  dict mapping the customer IDs used in `submission` to the
                                customer IDs used in `customer_article`
            articles_toindex:   dict of article IDs (stored, not read by any method of this class)
            article_customer:   Series indexed by article_id whose values are lists of customer IDs
            customer_article:   frame with 'customer_id' and 'article_id' columns; the row order
                                decides which articles are picked first for a customer
            default_top12:      list of ready-to-submit article ID strings used as filler and as
                                the full prediction for unknown customers
        """
        self.transactions = transactions
        self.submission = submission
        self.customers_toindex = customers_toindex
        self.articles_toindex = articles_toindex
        self.article_customer = article_customer
        self.customer_article = customer_article
        self.default_top12 = default_top12


    def compare_vectors(self,v1, v2):
        """
        Compare the two customer vectors. get a similarity score.

        The score is the number of customers present in both vectors divided
        by sqrt(len(v1) * len(v2)).

        Arguments:
            v1, v2: lists of customer IDs

        Returns:
            the similarity score; 0.0 when either vector is empty (nothing can
            be shared, and the division would otherwise be 0/0)
        """
        shared = len(set(v1) & set(v2))
        denominator = np.sqrt(len(v1) * len(v2))
        if denominator == 0:
            return 0.0
        return shared / denominator


    def get_similar_items_for_target_article(self,u, v):
        """
        Using target article to compare in other article_customer.

        Arguments:
            u:  the article bought before (not used in the computation)
            v:  the "vector" representation of the article (list of customer_id)

        Returns:
            pandas Index holding the article_ids of the three highest-scoring
            entries of article_customer. The target article is not excluded:
            it scores 1.0 against its own vector, so it is normally part of
            the result.
        """
        scores = self.article_customer.apply(lambda v_other: self.compare_vectors(v, v_other))
        similar_articles = scores.sort_values(ascending=False).index[:3]

        return similar_articles

    def get_target_article(self,user):
        """
        Using customer_article dataframe to find the article this customer bought before. Find the
        similar three articles to target article.

        Arguments:
                user: customer_id as used in customer_article

        Return:
            list with the similar articles of up to three target articles (the
            first three customer_article rows of this customer), concatenated
            in order. The list may contain the same article more than once.
        """
        own_rows = self.customer_article.loc[self.customer_article['customer_id'] == user]
        target_articles = own_rows["article_id"].values[:3].tolist()

        single_customer_similar_articles = []
        for target_article in target_articles:
            # The customer list stored for this article is its vector.
            v = self.article_customer.loc[self.article_customer.index == target_article].iloc[0]
            similar_items = self.get_similar_items_for_target_article(target_article, v)
            single_customer_similar_articles += similar_items.tolist()

        return single_customer_similar_articles


    def get_recommendation(self):
        """
        Main functions

        Builds one prediction string (article IDs separated by spaces) for
        every customer in the submission frame.

        Returns:
            DataFrame with the columns 'customer_id' and 'prediction'
        """
        recommendations = []
        default_prediction = ' '.join(self.default_top12)
        customers = self.submission['customer_id']
        for customer in tqdm(customers):
            if customer not in self.customers_toindex:
                # if couldn't find the customer in the map: default recommendation only.
                recommendations.append(default_prediction)
                continue

            similar = self.get_target_article(self.customers_toindex[customer])
            # Only the similar articles are raw IDs that still need the '0'
            # prefix; default_top12 already holds finished ID strings, and
            # prefixing those again would produce invalid '00...' IDs.
            candidates = ['0' + str(article) for article in similar]
            candidates.extend(self.default_top12)
            # Drop repeated articles (first occurrence wins) so that no
            # position of the 12 is spent on a duplicate.
            prediction = list(dict.fromkeys(candidates))[:12]
            recommendations.append(' '.join(prediction))

        return pd.DataFrame({
                'customer_id': customers,
                'prediction': recommendations,
            })

    
    

In [None]:
# Wire the prepared lookup structures into the recommender and build one
# prediction row per customer of the submission file.
rec = ItemtoItem(
    transactions=transactions,
    submission=submission,
    customers_toindex=customers_toindex,
    articles_toindex=articles_toindex,
    article_customer=article_customer,
    customer_article=customer_article,
    default_top12=default_top12,
)
sub = rec.get_recommendation()

In [None]:
# Write the predictions to 'submission.csv' in the working directory;
# index=False leaves the DataFrame index out of the file.
sub.to_csv('submission.csv', index=False)