In [None]:
# ~/Kaggle/hm-recommendations

In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from datetime import datetime
from PIL import Image

In [None]:
from pathlib import Path

# Location of the competition files, relative to the notebook's working directory.
DATA_DIR = "../input/h-and-m-personalized-fashion-recommendations/"

In [None]:
# Wrap the configured location in a Path so files can be addressed with "/".
data_dir = Path(DATA_DIR)

# List what sits directly inside the dataset folder.
top_level_entries = os.listdir(data_dir)
print(f"files and folders: {top_level_entries}")

# Count the direct children of the "images" folder.
image_children = list((data_dir / "images").glob('*'))
print("Subfolders in images folder: ", len(image_children))

In [None]:

# Read the four competition tables, in this order, into one DataFrame each.
articles_df, customers_df, sample_submission_df, transactions_train_df = (
    pd.read_csv(data_dir / f"{table_name}.csv")
    for table_name in (
        "articles",
        "customers",
        "sample_submission",
        "transactions_train",
    )
)

- **Save transaction data as a numpy array**

In [None]:
# Show the column index of each of these three tables, followed by a blank line.
for frame in (articles_df, customers_df, sample_submission_df):
    print(frame.columns, end="\n\n")

In [None]:
# Preview the first five rows of the sample submission table.
sample_submission_df.head(n=5)

Observations

- **In the article data**, the only missing data is for the detailed description of the article (0.4% missing data).

- **Customer data**: only the customer id and postal code are completely filled. Age and fashion news frequency have around 1% missing data, FN has 65% missing and Active has 66% missing data.

- No missing data from transactions train data source.


## NUMBER OF PRODUCTS PER CATEGORY

In [None]:
# Count the distinct product types inside every product group and order the
# groups from the most to the least varied.
types_per_group = (
    articles_df
    .groupby("product_group_name")["product_type_name"]
    .nunique()
    .rename_axis("Product Group")
    .reset_index(name="Product Types")
    .sort_values("Product Types", ascending=False)
)

# One bar per product group, with tilted labels so the group names stay readable.
plt.figure(figsize=(8, 6))
plt.title('Number of Product Types per each Product Group')
sns.set_color_codes("pastel")
bars = sns.barplot(data=types_per_group, x='Product Group', y='Product Types')
bars.set_xticklabels(bars.get_xticklabels(), rotation=70)
# Read back the x tick positions and labels (not used further in this cell).
locs, labels = plt.xticks()
plt.show()

## Wordcloud from product name

In [None]:
# Words ignored when building word clouds (the wordcloud package's default list).
stopwords = set(STOPWORDS)

def show_wordcloud(data, title = None):
    """Build a word cloud from ``data`` and display it with matplotlib.

    Parameters
    ----------
    data : str, pandas.Series or any other object
        Source text. A string is used as is. For a Series every non-missing
        value is converted to text and the values are joined with spaces.
        Any other object is converted with ``str()``.
    title : str, optional
        Figure title; nothing is drawn when it is empty or None.
    """
    # str(Series) is the console repr: for long Series it only contains the
    # first and last few rows plus "...", the index labels and the
    # "Name: ..., dtype: ..." footer. Join the actual values instead so the
    # cloud reflects the whole column.
    if isinstance(data, str):
        text = data
    elif isinstance(data, pd.Series):
        text = " ".join(data.dropna().astype(str))
    else:
        text = str(data)

    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40,
        scale=5,
        random_state=1  # fixed seed passed to WordCloud for a repeatable layout
    ).generate(text)

    fig = plt.figure(1, figsize=(10,10))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=14)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

show_wordcloud(articles_df["prod_name"], "Wordcloud from product name")

# Customers data

## AGE

In [None]:
# Number of customer ids recorded for every age, ordered from oldest to youngest.
customers_per_age = (
    customers_df
    .groupby("age")["customer_id"]
    .count()
    .rename_axis("Age")
    .reset_index(name="Customers")
    .sort_values("Age", ascending=False)
)

# One bar per age, on a wide figure with vertical tick labels.
plt.figure(figsize=(16, 6))
plt.title('Number of Customers per each Age')
sns.set_color_codes("pastel")
bars = sns.barplot(data=customers_per_age, x='Age', y='Customers')
bars.set_xticklabels(bars.get_xticklabels(), rotation=90)
# Read back the x tick positions and labels (not used further in this cell).
locs, labels = plt.xticks()
plt.show()

In [None]:
# Number of customer ids per fashion-news frequency value, most common first.
customers_per_frequency = (
    customers_df
    .groupby("fashion_news_frequency")["customer_id"]
    .count()
    .rename_axis("Fashion News Frequency")
    .reset_index(name="Customers")
    .sort_values("Customers", ascending=False)
)

# One bar per frequency value, with tilted tick labels.
plt.figure(figsize=(6, 6))
plt.title('Number of Customers per each Fashion News Frequency')
sns.set_color_codes("pastel")
bars = sns.barplot(data=customers_per_frequency, x='Fashion News Frequency', y='Customers')
bars.set_xticklabels(bars.get_xticklabels(), rotation=45)
# Read back the x tick positions and labels (not used further in this cell).
locs, labels = plt.xticks()
plt.show()

In [None]:
# Number of customer ids per club member status, most common first.
customers_per_status = (
    customers_df
    .groupby("club_member_status")["customer_id"]
    .count()
    .rename_axis("Club Member Status")
    .reset_index(name="Customers")
    .sort_values("Customers", ascending=False)
)

# One bar per status, with vertical tick labels.
plt.figure(figsize=(6, 6))
plt.title('Number of Customers per each Club Member Status')
sns.set_color_codes("pastel")
bars = sns.barplot(data=customers_per_status, x='Club Member Status', y='Customers')
bars.set_xticklabels(bars.get_xticklabels(), rotation=90)
# Read back the x tick positions and labels (not used further in this cell).
locs, labels = plt.xticks()
plt.show()

# Transactions data

## Logarithmic distribution of price frequency in transactions

In [None]:
# Draw the densities from a sample of at most 100k transactions. The size is
# capped at the table length so sample() cannot raise on a smaller table, and
# the seed is fixed so a re-run reproduces the same figure.
sample_size = min(100_000, len(transactions_train_df))
df = transactions_train_df.sample(sample_size, random_state=0)

fig, ax = plt.subplots(1, 1, figsize=(14, 7))
for channel in (1, 2):
    # How often each distinct price occurs within this sales channel.
    price_counts = df.loc[df["sales_channel_id"] == channel, "price"].value_counts()
    # Attach the label to the curve itself so the legend cannot drift out of
    # sync with the plotted data (the second entry used to repeat "channel 1").
    sns.kdeplot(np.log(price_counts), ax=ax, label=f"Sales channel {channel}")
ax.legend()
plt.title("Logaritmic distribution of price frequency in transactions, grouped per sales channel (100k sample)")
plt.show()

# Initial submission

For this initial submission, we apply the following simplified logic:

- if a client has previous purchases, pick their most recent buys;
- if a client has no previous purchases, just pick the most frequently bought articles.

In [None]:
# Order the transactions by customer and date, both descending, so that each
# customer's latest rows come first within their group.
transactions_train_df = transactions_train_df.sort_values(
    by=["customer_id", "t_dat"],
    ascending=False,
)

# Find the largest transaction date and report how many rows carry it.
last_date = transactions_train_df["t_dat"].max()
print(last_date)
print(transactions_train_df[transactions_train_df["t_dat"] == last_date].shape)

Top 12 most frequently bought articles on the most recent transaction date.

In [None]:
# Rank the articles sold on the last date by number of rows and keep the top 12.
last_day_counts = (
    transactions_train_df
    .loc[transactions_train_df["t_dat"] == last_date, "article_id"]
    .value_counts()
)
most_frequent_articles = list(last_day_counts.index[:12])

# Prefix every id with "0" and build the space-separated fallback string.
art_list = ["0" + str(article) for article in most_frequent_articles]
art_str = " ".join(art_list)
print("Frequent articles bought recently: ", art_str)

In [None]:
def padding_articles(x):
    """Prefix each id in ``x`` with "0" and top the list up to 12 entries.

    ``x`` is a whitespace-separated string of article ids. When it holds
    fewer than 12 ids, the missing slots are filled from the start of the
    notebook-level ``art_list``. Returns the ids joined by single spaces, or
    None when ``x`` is empty or otherwise falsy.
    """
    if not x:
        return None

    padded = ["0" + token for token in x.split()]
    missing = 12 - len(padded)
    if missing > 0:
        # Fill the remaining slots with the first entries of art_list.
        padded.extend(art_list[:missing])
    return " ".join(padded)


def _first_twelve_as_text(article_ids):
    """Text form of the first 12 values of a group, without the enclosing brackets."""
    return str(article_ids.values[0:12])[1:-1]


# One row per customer: the first 12 article ids of their group as one string.
agg_df = (
    transactions_train_df
    .groupby("customer_id")["article_id"]
    .agg(_first_twelve_as_text)
    .reset_index()
)
# Prefix the ids with "0" and fill short lists via padding_articles.
agg_df["article_id"] = agg_df["article_id"].apply(padding_articles)
print("Aggregated transaction history: ", agg_df["customer_id"].nunique())

print("Submission sample: ", sample_submission_df["customer_id"].nunique(), end="\n\n")

print(sample_submission_df.shape)
sample_submission_df.head()

In [None]:
# Keep every customer of the sample submission (right join); customers without
# a row in agg_df end up with a missing prediction.
submission_df = (
    agg_df
    .merge(sample_submission_df[["customer_id"]], how="right", on="customer_id")
    .rename(columns={"article_id": "prediction"})
)
print(submission_df.shape)
submission_df.head()

In [None]:
# Number of whitespace-separated ids in the first row's prediction.
first_prediction = submission_df["prediction"].iloc[0]
len(first_prediction.split())

In [None]:
# Count the rows whose prediction is still missing after the merge.
rows_without_prediction = submission_df[submission_df["prediction"].isna()]
print("Rows with missing data in submission: ", len(rows_without_prediction))

In [None]:
# Give every row without a prediction the fallback string art_str,
# then re-count the missing rows.
no_prediction = submission_df["prediction"].isna()
submission_df.loc[no_prediction, "prediction"] = art_str
print("Rows with missing data in submission: ", int(submission_df["prediction"].isna().sum()))

In [None]:
# Write the submission table to CSV without the index column.
submission_df.to_csv(path_or_buf="baseline_submission.csv", index=False)