In [1]:
import pandas as pd

### Load products data.

In [2]:
# Read the product catalogue and downcast the id column to the smallest
# integer dtype that can hold its values.
products = (
    pd.read_csv('data_source/products.csv')
    .assign(product_id=lambda frame: pd.to_numeric(frame['product_id'], downcast='integer'))
)

### Calculate percentage of similarity.
### How the percentage is calculated:
#### In products we have only 3 parameters which we can use for calculating similarity: price, goods_group and manufacturer. I've decided to calculate it with the following formula:
#### percentage = (goods_group similarity (1 or 0) + price similarity (from 0 to 1) + manufacturer similarity (1 or 0)) / 3 * 100.

#### Note: I've written a separate function for calculating the similarity by price.

In [3]:
def get_percentage_of_similarity(row):
    """
    Calculate how similar two products are, as a percentage.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'p_price', 'p_group' and 'p_man' (the reference
        product) and 'price', 'goods_group' and 'manufacturer' (the product
        compared against it).

    Returns
    -------
    float
        Similarity rounded to one decimal place. For a positive reference
        price the value lies between 0 and 100. A missing (NaN) price
        propagates to a NaN result.

    Logic
    -----
    - Goods group component: 1 if the groups are equal, otherwise 0.
    - Manufacturer component: 1 if the manufacturers are equal, otherwise 0.
    - Price component: 1 - |p_price - price| / p_price, floored at 0, so a
      product that is X percent cheaper and one that is X percent more
      expensive than the reference get the same score.
    - The three components are averaged and multiplied by 100.
    """
    matches = 0
    if row['p_group'] == row['goods_group']:
        matches += 1
    if row['p_man'] == row['manufacturer']:
        matches += 1

    reference_price = row['p_price']
    candidate_price = row['price']

    if reference_price == 0:
        # A relative difference is undefined for a zero reference price;
        # only another zero price counts as identical.
        price_similarity = 1.0 if candidate_price == 0 else 0.0
    else:
        # The absolute value makes the measure symmetric: without it a
        # candidate that is slightly MORE expensive than the reference
        # would score 0 while a slightly cheaper one scores close to 1.
        relative_gap = abs(reference_price - candidate_price) / reference_price
        # Written as "> 1" (not "<= 1") so that a NaN gap falls through
        # to the subtraction and stays NaN instead of silently becoming 0.
        price_similarity = 0.0 if relative_gap > 1 else 1 - relative_gap

    return round((matches + price_similarity) / 3 * 100, 1)

In [4]:
def get_df_with_similarities(product_id, product_ids_to_check):
    """
    Build a dataframe comparing one product against a set of other products.

    Parameters
    ----------
    product_id
        Id of the reference product; it is looked up in the module-level
        ``products`` dataframe.
    product_ids_to_check : list-like
        Ids of the products to compare with the reference product. The
        candidates are selected with ``isin``, so ids that do not occur in
        ``products`` are ignored and repeating an id adds no extra rows.

    Returns
    -------
    pandas.DataFrame
        One row per selected candidate, in the order they appear in
        ``products``. The reference product's columns are renamed to
        'p_id', 'p_price', 'p_group' and 'p_man'; the candidate keeps its
        original column names plus an 'index' column holding its original
        row label in ``products``. The 'similarity_percentage' column holds
        the value returned by ``get_percentage_of_similarity`` for the row.

    Raises
    ------
    ValueError
        If ``product_id`` is not present in ``products``.
    """
    reference_rows = products.loc[products.product_id == product_id]
    if reference_rows.empty:
        raise ValueError(
            'product_id {!r} was not found in products'.format(product_id)
        )

    # reset_index() returns a new frame, so the slice of ``products`` is never
    # modified in place. The old row labels are kept in the 'index' column.
    candidates = products.loc[products.product_id.isin(product_ids_to_check)].reset_index()

    # Repeat the reference row once per candidate that was actually found,
    # not once per requested id, so both sides always have the same length.
    # If the id occurs several times in ``products`` the first row is used
    # (NOTE(review): assumes product ids are meant to be unique - confirm).
    reference = (
        reference_rows
        .rename(columns={'product_id': 'p_id', 'price': 'p_price',
                         'goods_group': 'p_group', 'manufacturer': 'p_man'})
        .iloc[[0] * len(candidates)]
        .reset_index(drop=True)
    )

    result_df = pd.concat([reference, candidates], axis=1)

    if result_df.empty:
        # Row-wise apply on an empty frame does not return a Series,
        # so create the (empty) result column explicitly.
        result_df['similarity_percentage'] = pd.Series(dtype='float64')
        return result_df

    result_df['similarity_percentage'] = result_df.apply(get_percentage_of_similarity, axis=1)
    return result_df

### Test it

In [5]:
number_of_samples = 150
random_seed = 42  # fixed seed so a fresh "Restart & Run All" draws the same sample

# Never ask for more rows than the catalogue holds: sampling without
# replacement raises when the requested size exceeds the population.
product_ids_to_check = (
    products.product_id
    .sample(min(number_of_samples, len(products)), random_state=random_seed)
    .tolist()
)

result_df = get_df_with_similarities(product_id=518018, product_ids_to_check=product_ids_to_check)

# Most similar products first.
result_df.sort_values('similarity_percentage', ascending=False)

Unnamed: 0,p_id,p_price,p_group,p_man,index,product_id,price,goods_group,manufacturer,similarity_percentage
67,518018,899.0,Дитячі машинки,MZ,1843,540747,1799.0,Дитячі машинки,MZ,66.7
120,518018,899.0,Дитячі машинки,MZ,3178,523208,897.0,Дитячі машинки,Sulong Toys,66.6
37,518018,899.0,Дитячі машинки,MZ,1119,429383,579.0,Дитячі машинки,Big Motors,54.8
66,518018,899.0,Дитячі машинки,MZ,1817,526911,529.0,Дитячі машинки,Blaze & The monster machines,52.9
94,518018,899.0,Дитячі машинки,MZ,2500,539453,492.0,Дитячі машинки,Maisto,51.6
2,518018,899.0,Дитячі машинки,MZ,49,446152,349.0,Дитячі машинки,CAT,46.3
15,518018,899.0,Дитячі машинки,MZ,434,466959,297.0,Дитячі машинки,WADER,44.3
149,518018,899.0,Дитячі машинки,MZ,3723,538895,285.0,Дитячі машинки,Технопарк,43.9
143,518018,899.0,Дитячі машинки,MZ,3639,542112,269.0,Дитячі машинки,Автопром,43.3
41,518018,899.0,Дитячі машинки,MZ,1171,363250,179.0,Дитячі машинки,Hot Wheels,40.0
