# Importing Libraries

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from warnings import filterwarnings
import nltk
from nltk.corpus import stopwords
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
filterwarnings('ignore')

# Importing Datasets

In [2]:
# Read the three input tables (articles, customers, transactions) from the
# relative ../input directory, in that order.
articles, customers, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/h-and-m-personalized-fashion-recommendations/articles.csv",
        "../input/h-and-m-personalized-fashion-recommendations/customers.csv",
        "../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv",
    )
)

## Articles Dataset

This dataset contains products and related information about them. Rows with null values were removed from the data set.

In [3]:
# Drop every article row that has a missing value, then renumber the rows
# 0..n-1. The vectorizer matrices built later are positional and the
# recommendation step looks rows up by those positions, so the index labels
# must match row positions (dropna alone leaves gaps in the index).
articles = articles.dropna().reset_index(drop=True)

In [4]:
# Display the column labels of the articles table.
articles.columns

In [5]:
# Display the (rows, columns) shape of each of the three tables.
articles.shape,customers.shape,transactions.shape

Only NLP-related variables were selected from the dataset and all those variables containing text were combined in one column with the name "text". Since various numeric values will not be used, they were not selected.

In [6]:
# Text-bearing columns, in the order they are concatenated into "text".
text_columns = [
    "prod_name",
    "product_type_name",
    "product_group_name",
    "graphical_appearance_name",
    "colour_group_name",
    "perceived_colour_value_name",
    "perceived_colour_master_name",
    "department_name",
    "index_name",
    "index_group_name",
    "section_name",
    "garment_group_name",
    "detail_desc",
]
# prod_name is passed through str() first; every other column is appended
# as-is, each preceded by a single space.
combined_text = articles[text_columns[0]].map(str)
for column_name in text_columns[1:]:
    combined_text = combined_text + " " + articles[column_name]
articles["text"] = combined_text
articles.head(2)

Finally, a dataframe is created that contains only the 'article_id', 'product_code' and 'text' columns.

In [7]:
# Keep only the identifier columns and the combined text. The explicit copy
# makes df_all an independent frame, so assigning to its 'text' column later
# modifies df_all itself rather than a slice of `articles`.
df_all = articles[['article_id', 'product_code', 'text']].copy()

The text variable needs to be cleaned before it can be used for NLP. For this reason, the necessary NLTK resources are downloaded first.

In [8]:
# Download the NLTK resources in one pass. Tokenising, stop-word removal and
# lemmatising below presumably rely on the first three; the tagger is not
# referenced by the visible cells.
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

The text-cleaning function is defined here; it is applied to the text variable further below.

In [9]:
# English stop words: `stop` keeps the list form, `stop_words_` is the set
# used for membership tests when filtering tokens.
stop = stopwords.words('english')
stop_words_ = set(stop)
# Lemmatizer shared by the text-cleaning function.
wn = WordNetLemmatizer()

# Single punctuation characters, built once instead of on every call.
_PUNCTUATION = frozenset(string.punctuation)


def black_txt(token):
    """Return True when `token` should be kept in the cleaned text.

    A token is kept only if it is longer than two characters, is not in the
    module-level `stop_words_` set and is not a single punctuation character.
    """
    # Cheapest test first; it also rules out every one- or two-character token.
    return (
        len(token) > 2
        and token not in _PUNCTUATION
        and token not in stop_words_
    )
  
def clean_txt(text):
  """Turn a raw description into a space-joined string of filtered lemmas.

  Apostrophes are removed, every run of digits/non-word characters becomes a
  single space, and the literal "nbsp" is stripped. The lower-cased text is
  then tokenised; each token that passes `black_txt` is lemmatised as a verb,
  and the lemma is kept only if it passes `black_txt` again (lemmatising can
  produce a stop word or a short token).
  """
  without_quotes = re.sub("'", "", text)
  collapsed = re.sub("(\\d|\\W)+", " ", without_quotes)
  stripped = collapsed.replace("nbsp", "")
  kept_lemmas = []
  for raw_token in word_tokenize(stripped.lower()):
    if not black_txt(raw_token):
      continue
    lemma = wn.lemmatize(raw_token, pos="v")
    if black_txt(lemma):
      kept_lemmas.append(lemma)
  return " ".join(kept_lemmas)

In [10]:
# Step-by-step trace of the regex stages of clean_txt on one sample description.
# The sample is double-quoted so that the two apostrophes before "Upper11" are
# real characters in the text; inside a single-quoted literal they end one
# string and start another, leaving no apostrophe for step1 to remove.
text = "Strap top Vest top Garment ''Upper11 body Solid Black Dark Black Jersey Basic Ladieswear Ladieswear Womens Everyday Basics Jersey Basic Jersey top with narrow shoulder straps."
print('step0',text)
# step1: remove apostrophes
text = re.sub("'", "",text)
print('step1',text)
# step2: collapse every run of digits / non-word characters into one space
text=re.sub("(\\d|\\W)+"," ",text)
print('step2',text)
# step3: strip the literal "nbsp"
text = text.replace("nbsp", "")
print('step3',text)

# step4: tokenise (unlike clean_txt, the text is not lower-cased here)
l1= word_tokenize(text)

print('step4',l1)



In [11]:
# Replace each article's raw text with its cleaned form.
df_all = df_all.assign(text=df_all['text'].map(clean_txt))

Initializing tfidf vectorizer for articles, fitting and transforming the vector

In [12]:
# Fit a TF-IDF vocabulary on the cleaned article text and encode every article
# as one row of a sparse matrix (row order follows df_all).
tfidf_vectorizer = TfidfVectorizer()
tfidf_article = tfidf_vectorizer.fit_transform(df_all['text'])
# Show the sparse-matrix summary only: converting the whole matrix to a dense
# array just for display allocates rows x vocabulary floats.
tfidf_article

## Transactions Dataset

In [13]:
# Drop incomplete rows, parse t_dat into a datetime column named InvoiceDate,
# keep five columns, and remove exact duplicate rows.
transactions = (
    transactions
    .dropna()
    .assign(InvoiceDate=lambda frame: pd.to_datetime(frame['t_dat'], format='%Y-%m-%d'))
    .loc[:, ["InvoiceDate", "customer_id", "article_id", "price", "sales_channel_id"]]
    .drop_duplicates()
)

In [14]:
# Keep only the first 5,000,000 rows in current row order
# (presumably to limit memory use and run time).
transactions = transactions.head(5000000)

In [15]:
# Display the (rows, columns) shape of the truncated transactions table.
transactions.shape

Sorting the dataset by customer id to see all of a customer's purchases

In [16]:
# Order the rows by customer so each customer's purchases sit together.
transactions = transactions.sort_values('customer_id')
transactions.head(5)

In [17]:
# Attach the cleaned article text to every transaction of that article;
# the inner join drops transactions whose article is not in df_all.
merged_df = pd.merge(df_all, transactions, on='article_id', how='inner')

# Cap the joined table at its first 10,000,000 rows.
merged_df = merged_df.iloc[:10000000]

The text information of all the products purchased by a customer is gathered into a single 'text' value per customer.

In [18]:
# One row per customer: the texts of all purchased articles joined by spaces.
# sort=False keeps customers in order of first appearance.
merged_df2 = (
    merged_df
    .groupby('customer_id', sort=False)['text']
    .agg(' '.join)
    .reset_index()
)
merged_df2.head()

# **Recommendation**

A random customer_id was chosen to make a recommendation.

In [19]:
u = "000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318" #customer_id
# Row position of the first profile row for this customer; indexing with [0]
# raises IndexError when the customer has no row in merged_df2.
index = np.flatnonzero(merged_df2['customer_id'] == u)[0]
# Double brackets keep the result a one-row DataFrame rather than a Series.
cust_q = merged_df2.iloc[[index]]
cust_q

### Products user bought before

## Define a Recommendation Function

In [20]:
# Empty frame showing the column layout of a recommendation table.
# NOTE(review): this module-level `recommendation` does not appear to be read
# by any later visible cell; recommendation_product builds its own frame.
recommendation = pd.DataFrame(columns = ['customer_id', 'article_id',  'product_code', 'detail_desc', 'score'])
recommendation

The recommendation function returns a table containing the customer ID, article ID, product code, description and similarity score of each recommended product.

In [21]:
def recommendation_product(top, df_all, scores):
  """Build the recommendation table for the currently selected customer.

  Parameters
  ----------
  top : iterable of int
      Row positions of the recommended articles, best first. These are
      positional offsets into `df_all`, not index labels.
  df_all : pandas.DataFrame
      Article frame providing the 'article_id' and 'product_code' columns.
  scores : sequence
      Score of each entry of `top`, in the same order.

  Returns
  -------
  pandas.DataFrame
      One row per entry of `top` with the columns 'customer_id',
      'article_id', 'product_code', 'detail_desc' and 'score'.

  Notes
  -----
  Reads the module-level names `u` (customer id) and `articles` (source of
  'detail_desc'); `articles` is assumed to have the same row order as
  `df_all`.
  """
  columns = ['customer_id', 'article_id',  'product_code', 'detail_desc', 'score']
  article_ids = df_all['article_id']
  product_codes = df_all['product_code']
  descriptions = articles['detail_desc']
  # Look rows up by position (.iloc): the entries of `top` are positions, and
  # the frame's index labels need not be 0..n-1 once rows have been dropped.
  records = [
      {
          'customer_id': u,
          'article_id': article_ids.iloc[position],
          'product_code': product_codes.iloc[position],
          'detail_desc': descriptions.iloc[position],
          'score': scores[rank],
      }
      for rank, position in enumerate(top)
  ]
  # Passing `columns` keeps the layout stable even when `top` is empty.
  return pd.DataFrame(records, columns=columns)

## Calculating Cosine Similarity for the User

In [22]:
# Encode the customer's combined purchase text with the fitted TF-IDF vocabulary.
user_tfidf = tfidf_vectorizer.transform(cust_q['text'])
# Score every article against the customer in one vectorised call instead of
# one cosine_similarity call per article row. The (1, n_articles) result is
# reshaped to (n_articles, 1, 1) so that iterating it still yields one (1, 1)
# array per article, which is the layout the following cells index into.
cos_similarity_tfidf = cosine_similarity(user_tfidf, tfidf_article).reshape(-1, 1, 1)

In [48]:
# Materialise the per-article TF-IDF similarities as a list, one entry per article row.
output2 = list(cos_similarity_tfidf)

## Recommendations with TFIDF

In [24]:
# Flatten each (1, 1) similarity entry to a scalar once, then rank on scalars.
tfidf_flat_scores = [similarity[0][0] for similarity in output2]
# Positions of the ten highest-scoring articles, best first.
top = sorted(range(len(tfidf_flat_scores)), key=tfidf_flat_scores.__getitem__, reverse=True)[:10]
tf_list_scores = [tfidf_flat_scores[position] for position in top]
recommendation_product(top, df_all, tf_list_scores)

In [25]:
# Keep just the article id and its TF-IDF score for the later comparison.
tf_idf_score = recommendation_product(top, df_all, tf_list_scores)[['article_id', 'score']]

In [26]:
# Display the TF-IDF recommendations (article id and score).
tf_idf_score

## Recommendations with CountVectorizer

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counterpart of the TF-IDF step: fit the vocabulary on the
# cleaned article text and encode every article as a row of term counts.
count_vectorizer = CountVectorizer()
count_artid = count_vectorizer.fit_transform(df_all['text'])
count_artid

In [28]:
from sklearn.metrics.pairwise import cosine_similarity
# Encode the customer's combined purchase text with the fitted count vocabulary.
user_count = count_vectorizer.transform(cust_q['text'])
# Score every article against the customer in one vectorised call instead of
# one cosine_similarity call per article row. The (1, n_articles) result is
# reshaped to (n_articles, 1, 1) so that iterating it still yields one (1, 1)
# array per article, which is the layout the following cells index into.
cos_similarity_countv = cosine_similarity(user_count, count_artid).reshape(-1, 1, 1)

In [29]:
# Materialise the per-article count-vector similarities as a list, one entry per article row.
output3 = list(cos_similarity_countv)

In [30]:
# Flatten each (1, 1) similarity entry to a scalar once, then rank on scalars.
count_flat_scores = [similarity[0][0] for similarity in output3]
# Positions of the ten highest-scoring articles, best first.
top = sorted(range(len(count_flat_scores)), key=count_flat_scores.__getitem__, reverse=True)[:10]
list_scores_cv = [count_flat_scores[position] for position in top]
recommendation_product(top, df_all, list_scores_cv)

In [38]:
# Display the row positions currently held in `top`.
top

In [31]:
# Keep just the article id and its count-vector score for the later comparison.
cv_score = recommendation_product(top, df_all, list_scores_cv)[['article_id', 'score']]

## Recommendations with KNN

In [32]:
from sklearn.neighbors import NearestNeighbors
# Index the TF-IDF article rows for nearest-neighbour search; no metric is
# passed, so the estimator's default distance is used.
KNN = NearestNeighbors(n_neighbors=11)
KNN.fit(tfidf_article)
# Query with the customer's TF-IDF vector. With return_distance=True the result
# is a (distances, row positions) pair, 11 neighbours per query row.
NNs = KNN.kneighbors(user_tfidf, return_distance=True) 

In [33]:
# The query vector is the customer profile, not one of the fitted article rows,
# so the nearest neighbour is a genuine recommendation and must not be skipped
# (dropping element 0 only makes sense when the query itself is in the index).
# Keep the ten closest articles, matching the top ten of the other methods.
top = NNs[1][0][:10]
# These values are distances, not similarities: smaller means closer.
index_score = NNs[0][0][:10]
recommendation_product(top, df_all, index_score)

In [34]:
# Keep just the article id and its KNN distance for the later comparison.
knn_score = recommendation_product(top, df_all, index_score)[['article_id', 'score']]

# Comparison 

In [35]:
# Give each method's 'score' column a distinct name so the three tables can be
# told apart once they are placed side by side.
tf_idf_score=tf_idf_score.rename(columns={"score":"tf_idf_score"})
cv_score=cv_score.rename(columns={"score":"cv_score"})
knn_score=knn_score.rename(columns={"score":"knn_score"})

In [36]:
# Place the three result tables side by side, aligned on their row index (the
# rank). Each table contributes its own 'article_id' column, so that column
# name appears three times in the result.
pd.concat([tf_idf_score, cv_score, knn_score], axis=1)

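It seems that KNN and TF-IDF make **almost** the same recommendations, while the system based on CountVectorizer makes different ones. The agreement between KNN and TF-IDF is expected: TF-IDF rows are L2-normalised by default, so ranking articles by Euclidean distance to the customer vector gives the same order as ranking them by cosine similarity. Note that `knn_score` is a distance (lower is better), whereas `tf_idf_score` and `cv_score` are similarities (higher is better), so the score columns are not directly comparable.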