# KNN recommendation

In [1]:
import pandas as pd

df_raw = pd.read_csv('../../data/03_correlation_data/presence_dataset/20210331124802/presences.csv')
df = df_raw.copy()
print(f"Dataset size:{df.memory_usage(index=True).sum()}") # dtype category = 4_209_304 | no dtype category = 2_928_976 
df_raw.head(3)

Dataset size:2928976


Unnamed: 0,thread_id,product_name
0,e9molz,110
1,e9d9z1,Ibis Ripmo AF
2,e9as86,Airstream 2


In [2]:
###
# Data cleaning
###

# String normalization
df['product_name'] = df['product_name'].str.casefold()
df['product_name'] = df['product_name'].str.strip()
print(f"Normalization product dimension: {len(df['product_name'].unique())}")

# Remove all products with all number on the name
def has_numbers(inputString):
    return all(char.isdigit() for char in inputString)
df['name_with_digits'] = df['product_name'].apply(has_numbers)
df = df[df["name_with_digits"] == False]
df = df.drop("name_with_digits", 1)
print(f"Digits cleaning product dimension: {len(df['product_name'].unique())}")

# Remove all products with more than 2 words
df = df[df['product_name'].str.split().str.len().lt(2)]
print(f"2-words product dimension: {len(df['product_name'].unique())}")

###
# Metadata calculation
###

# Extract threads and products info
threads_unique = df['thread_id'].unique()
products_unique = df['product_name'].unique()
util_mtx_dimension = (len(threads_unique), len(products_unique))

print(f"Dataset len: {len(df)}, columns:{set(df.columns)}")
print(f"Unique threads:{len(threads_unique)}")
print(f"Unique products:{len(products_unique)}")
print(f"Utility matrix - shape predicted:{util_mtx_dimension}")

if util_mtx_dimension[1] > util_mtx_dimension[0]:
    print("[INFO] There are more products than threads")

Normalization product dimension: 40168
Digits cleaning product dimension: 39727
2-words product dimension: 10691
Dataset len: 115237, columns:{'product_name', 'thread_id'}
Unique threads:27577
Unique products:10691
Utility matrix - shape predicted:(27577, 10691)


In [4]:
import numpy as np
from scipy.sparse import csr_matrix
from bidict import bidict

# Build the indexer mapper
thread2int = bidict({u:i for i, u in enumerate(df["thread_id"].unique())})
product2int = bidict({m:i for i, m in enumerate(df["product_name"].unique())})

# Map text to index
threads_idx = [thread2int[u] for u in df["thread_id"]]
products_idx = [product2int[m] for m in df["product_name"]]

# Create matrix with 1 values [1,1,1,1,...[len(threads)]]
data = np.ones(len(threads_idx), )

# Create the utility matrix - sparse matrix [len(threads), len(products)]
utility_matrix = csr_matrix((data, (threads_idx, products_idx)))
assert utility_matrix.shape == util_mtx_dimension, f"A_sparse have a wrong dimension, expected:{util_mtx_dimension}"

print(f"Utility matrix shape:{utility_matrix.get_shape()}[threads, products]")
print("Utility matrix structure:")
utility_matrix.todense()[:3]

Utility matrix shape:(27577, 10691)[threads, products]
Utility matrix structure:


matrix([[5., 0., 0., ..., 0., 0., 0.],
        [0., 2., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.]])

#### Model
> Scikit [doc](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html#sklearn.neighbors.NearestNeighbors)

In [22]:
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(metric='minkowski',
                       p = 2,
                       algorithm='auto', 
                       n_neighbors=5, 
                       n_jobs=-1,
                      )

In [23]:
train_matrix = utility_matrix.transpose() # shape required: (n_samples, n_features) 

In [24]:
knn.fit(train_matrix) 

NearestNeighbors(n_jobs=-1)

In [29]:
def get_bike_recommendation(bike_name):
    neighbors = 10
    bike_id = product2int.get(bike_name,None)
    
    if not bike_id:
        print(f"Bike {bike_name} not found")
        pass
    print(f"Searching '{bike_name}' - id:{bike_id}")
    
    distances, indices = knn.kneighbors(train_matrix[bike_id], n_neighbors=neighbors+1)
    distances, indices = distances.tolist().pop(), indices.tolist().pop()
    
    for dis, idx in zip(distances, indices):
        bike_name = list(product2int.keys())[list(product2int.values()).index(idx)]
        bike_name = product2int.inverse[idx]
        print(f"dis:{dis}, idx:{idx}, bike suggested:{bike_name}")
        
get_bike_recommendation("stoic")

Searching 'stoic' - id:48
dis:0.0, idx:48, bike suggested:stoic
dis:7.211102550927978, idx:1584, bike suggested:gf
dis:7.3484692283495345, idx:706, bike suggested:superior
dis:7.416198487095663, idx:1780, bike suggested:xp
dis:7.681145747868608, idx:6977, bike suggested:respect
dis:7.681145747868608, idx:6979, bike suggested:roo
dis:7.681145747868608, idx:6974, bike suggested:skull120
dis:7.681145747868608, idx:6975, bike suggested:jogon
dis:7.681145747868608, idx:6972, bike suggested:52/36
dis:7.681145747868608, idx:6986, bike suggested:superflash
dis:7.681145747868608, idx:6989, bike suggested:xsmall
