# Model Building Instacart Capstone
#### This Notebook takes the data exported in the EDA notebook and turns it in to a TF-IDF model 
#### It also uses cosine similarities to find the most similar users to a target user
#### Finally it builds a loop to look at similar users and grab 5 recommended products for a target user based on their most similar users

In [1]:
#Importing the Basics


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#Importing sklearn

import sklearn
print(sklearn.__version__)

1.2.1


In [3]:
# Read in user data set
grouped_users_path = 'grouped_users_multiplied_output.csv'
grouped_users = pd.read_csv(grouped_users_path)

In [4]:
#Let's see how many rows we are working with in our new data set
aisles_rows = grouped_users.shape[0]
aisles_columns = grouped_users.shape[1]

print('row number = ' + str(aisles_rows))
print('column number = ' + str(aisles_columns))

#We are working with a really big data set. This may prove to be a challenge to fit the model with.

row number = 206209
column number = 2


In [5]:
#checking what our data looks like to make sure it is good to use

grouped_users.head()

Unnamed: 0,user_id,repeated_product_name
0,1,"Soda,Soda,Soda,Soda,Soda,Soda,Soda,Soda,Soda,S..."
1,2,"Chipotle Beef & Pork Realstick,Chipotle Beef &..."
2,3,"Vanilla Unsweetened Almond Milk,Vanilla Unswee..."
3,4,"Enchilada Black Bean Vegetable,Enchilada Black..."
4,5,"Red Raspberries,Red Raspberries,Red Raspberrie..."


In [6]:
#beacuse there are so many rows, I am going to start with around 1/3 of the data
#okay here's the deal, my computer ran out of memory when I tried to go to 1/3 of the data
#so I am going to use a much smaller set of the data as a proof of concept
#and if it works I would make the recommendation to run it again on a more powerful machiene 
#with all of the available data

users_smaller_set = grouped_users.head(25000)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer


vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(users_smaller_set['repeated_product_name'])

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
similarities = cosine_similarity(X)

In [10]:
#select the user we want to make reccomendations for. In this case we'll use user 1

user_id = 1 

# find similar users using cosine similarities
#here we're asking cosine similarity to find users that are similar to our target user 1 
similar_users = similarities[user_id]

#here we are defining that we want five reccomendations back
n_recommendations = 5 

#here we are sorting the similar users we get back in to the most similar
#this will help us get recommendations from the top similar users
top_similar_users = similar_users.argsort()[-n_recommendations-1:-1][::-1] 

#this is a place to store recommendations we get back
recommended_products = set()  

# this is a counter for recommendations
num_recommendations = 0

#here we are saying for users in our list of similar users to user 1
for user_idx in top_similar_users:
    #go look at the products they bought (order history) And split them apart on the commas
    products = users_smaller_set.iloc[user_idx]['repeated_product_name'].split(',')
    
    

    #Now loop through those products 
    for product in products:
        
        #first make sure the product is not empty
        if product.strip():  
            
            #If we don't have the product in our reccomended list, add it to the list
            if product not in recommended_products:
                recommended_products.add(product)
                #and add +1 to the count for number of reccomendations we are giving
                num_recommendations += 1

            # once we get to the desired number of reccomendations (in this case 5)
            #break out of this nested loop
            if num_recommendations >= n_recommendations:
                break

    # And break out of this loop too
    if num_recommendations >= n_recommendations:
        break

#create a list of recommended products with what we found in our loop
#based on the number of recommendations we requested
recommended_products = list(recommended_products)[:n_recommendations]

#And print those recommendations
print(recommended_products)

['Boneless Skinless Chicken Breasts', 'Total 2% Lowfat Greek Strained Yogurt with Peach', 'Total 2% Greek Strained Yogurt with Cherry 5.3 oz', 'Total 2% with Strawberry Lowfat Greek Strained Yogurt', 'Bag of Organic Bananas']
