In [1]:
import pandas as pd

In [2]:
# Load the product catalogue and downcast product_id to the smallest integer
# dtype that fits its values (saves memory on large catalogues).
products = pd.read_csv('data_source/products.csv').assign(
    product_id=lambda frame: pd.to_numeric(frame['product_id'], downcast='integer')
)

In [3]:
def get_percentage_of_similarity(row):
    """
    Calculate how similar a candidate product is to a reference product.

    The score is the mean of three components, each between 0 and 1,
    expressed as a percentage:
        - group: 1 if 'p_group' equals 'goods_group', otherwise 0.
        - manufacturer: 1 if 'p_man' equals 'manufacturer', otherwise 0.
        - price: 1 - |p_price - price| / |p_price|, clamped at 0, so a
          candidate that is X% cheaper and one that is X% more expensive
          than the reference receive the same score.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or a dict)
        Must provide 'p_price', 'p_group', 'p_man' for the reference product
        and 'price', 'goods_group', 'manufacturer' for the candidate.

    Returns
    -------
    float
        Similarity percentage rounded to one digit after the decimal point.
        A missing (NaN) price propagates to a NaN result.
    """
    matching_attributes = 0
    if row['p_group'] == row['goods_group']:
        matching_attributes += 1
    if row['p_man'] == row['manufacturer']:
        matching_attributes += 1

    reference_price = row['p_price']
    candidate_price = row['price']

    if reference_price == 0:
        # A relative difference is undefined against a zero reference price;
        # treat only an equally free candidate as a price match.
        price_similarity = 1.0 if candidate_price == 0 else 0.0
    else:
        # Take the absolute value of the *difference* (not of the remainder),
        # otherwise every candidate dearer than the reference scores 0.
        relative_gap = abs(reference_price - candidate_price) / abs(reference_price)
        price_similarity = 1.0 - relative_gap
        if price_similarity < 0:
            # Prices more than 100% apart carry no price similarity at all.
            price_similarity = 0.0

    similarity = (matching_attributes + price_similarity) / 3 * 100
    return round(similarity, 1)

In [4]:
def get_df_with_similarities(product_id, number_of_most_similar):
    """
    Return the catalogue entries most similar to the given product.

    Logic:
        - Look up the row with the given product id in the module-level
          ``products`` DataFrame.
        - Rename its columns with a ``p_`` prefix so they can sit next to
          the catalogue columns without clashing.
        - Repeat that single row once per catalogue entry and place it side
          by side with ``products``.
        - Add a 'similarity_percentage' column by applying
          ``get_percentage_of_similarity`` to every row.
        - Return the rows with the highest similarity.

    Parameters
    ----------
    product_id : int
        Id of the reference product; must be present in ``products``.
    number_of_most_similar : int
        Number of rows to return (passed to ``DataFrame.head``).

    Returns
    -------
    pandas.DataFrame
        Reference columns ('p_id', 'p_price', 'p_group', 'p_man'), the
        catalogue columns and 'similarity_percentage', sorted by similarity
        in descending order.

    Raises
    ------
    ValueError
        If ``product_id`` is not present in ``products``.
    """
    matches = products[products.product_id == product_id]
    if matches.empty:
        # Without this guard every reference column would be NaN and the
        # function would quietly return rows with NaN similarity.
        raise ValueError(f"product_id {product_id!r} not found in products")

    # Keep exactly one reference row, even if the id is duplicated in the
    # catalogue, so the broadcast below has one row per catalogue entry.
    reference = matches.iloc[[0]].rename(columns={'product_id': 'p_id', 'price': 'p_price',
                                                  'goods_group': 'p_group', 'manufacturer': 'p_man'})

    # Repeat the reference row in a single indexing operation instead of
    # concatenating len(products) one-row frames.
    reference = reference.iloc[[0] * len(products)]
    # Share the catalogue's index so the column-wise concat pairs rows
    # positionally even when ``products`` does not use a default RangeIndex.
    reference.index = products.index

    result_df = pd.concat([reference, products], axis=1)
    result_df['similarity_percentage'] = result_df.apply(axis=1, func=get_percentage_of_similarity)

    return result_df.sort_values('similarity_percentage', ascending=False).head(number_of_most_similar)

### Test it: find the 10 products most similar to product 539093.

In [5]:
# Smoke check: list the 10 catalogue entries most similar to product 539093.
get_df_with_similarities(product_id=539093, number_of_most_similar=10)

Unnamed: 0,p_id,p_price,p_group,p_man,product_id,price,goods_group,manufacturer,similarity_percentage
3742,539093,1899.0,Ігрові фігурки,Hasbro transformers,539093,1899.0,Ігрові фігурки,Hasbro transformers,100.0
1793,539093,1899.0,Ігрові фігурки,Hasbro transformers,521121,1798.0,Ігрові фігурки,Hasbro transformers,98.2
3727,539093,1899.0,Ігрові фігурки,Hasbro transformers,510269,589.0,Ігрові фігурки,Hasbro transformers,77.0
3728,539093,1899.0,Ігрові фігурки,Hasbro transformers,510270,589.0,Ігрові фігурки,Hasbro transformers,77.0
3737,539093,1899.0,Ігрові фігурки,Hasbro transformers,518865,569.0,Ігрові фігурки,Hasbro transformers,76.7
3736,539093,1899.0,Ігрові фігурки,Hasbro transformers,481602,569.0,Ігрові фігурки,Hasbro transformers,76.7
3738,539093,1899.0,Ігрові фігурки,Hasbro transformers,518866,569.0,Ігрові фігурки,Hasbro transformers,76.7
3735,539093,1899.0,Ігрові фігурки,Hasbro transformers,481600,569.0,Ігрові фігурки,Hasbro transformers,76.7
1794,539093,1899.0,Ігрові фігурки,Hasbro transformers,525767,569.0,Ігрові фігурки,Hasbro transformers,76.7
3734,539093,1899.0,Ігрові фігурки,Hasbro transformers,518872,549.0,Ігрові фігурки,Hasbro transformers,76.3
