In [2]:
import pandas as pd
import pickle

In [3]:
path = "dataset/"

outfit_data = pd.read_csv(path + "/outfit_data.csv")
product_data = pd.read_csv(path + "/product_data.csv")

In [4]:
non_interseting = product_data[product_data['des_product_family'].isin(['Fragances', 'Deco Accessories', 'Intimate', 'Deco Textiles', 'Bedding']) | product_data["des_sex"].isin(["Male","Unisex"]) | product_data["des_age"].isin(["Kids"])]
non_interseting_codes = non_interseting['cod_modelo_color']
outfit_data = outfit_data[~outfit_data['cod_modelo_color'].isin(non_interseting_codes)]

In [5]:
# outfit_data['cod_modelo_color'] = outfit_data['cod_modelo_color'].str.slice(stop=-3)
outfit_data = outfit_data.drop_duplicates()
product_counts = outfit_data.groupby('cod_modelo_color').size().reset_index(name='counts')

merged_df = pd.merge(outfit_data, outfit_data, how='left', on='cod_outfit')
merged_df = merged_df[merged_df['cod_modelo_color_x'] > merged_df['cod_modelo_color_y']]
merged_df = merged_df.groupby(['cod_modelo_color_x', 'cod_modelo_color_y']).size().reset_index(name='weight')

# normalize the weights by the number of times the product appears in an outfit
merged_df = pd.merge(merged_df, product_counts, how='left', left_on='cod_modelo_color_x', right_on='cod_modelo_color')
# the set is analysed over the condition merged_df['cod_modelo_color_x'] > merged_df['cod_modelo_color_y']
# it means analysing only the superior diagonal matrix associated to the graph  
merged_df['weight'] = merged_df['weight'] / merged_df['counts']

merged_df['edge'] = merged_df.apply(lambda x: (x['cod_modelo_color_y'], x['weight']), axis=1)
merged_df = merged_df.groupby('cod_modelo_color_x').agg({'edge': lambda x: list(x)})
merged_df

Unnamed: 0_level_0,edge
cod_modelo_color_x,Unnamed: 1_level_1
41041002-02,"[(41039067-05, 0.5)]"
41045021-02,"[(41025020-02, 0.14285714285714285)]"
41045815-45,"[(41039067-05, 0.25)]"
41065822-50,"[(41055822-08, 1.0)]"
41083009-08,"[(41069066-05, 0.14285714285714285)]"
...,...
67091003-99,"[(57029209-99, 1.0), (57095923-99, 1.0), (6700..."
67106705-99,"[(57025921-OR, 1.0), (57037880-OR, 1.0), (5708..."
87040069-OR,"[(57000438-OR, 1.0), (57001503-02, 1.0), (5701..."
87062013-OR,"[(57040459-99, 1.0), (57063817-99, 1.0), (5707..."


In [35]:
pruning_table = pd.merge(product_data, product_data, how='left', on='des_product_aggregated_family')
pruning_table = pruning_table[pruning_table['cod_modelo_color_x'] > pruning_table['cod_modelo_color_y']]
pruning_table = pruning_table.groupby('cod_modelo_color_x').agg({'cod_modelo_color_y': lambda x: list(x)})
pruning_table_dict = pruning_table.to_dict()['cod_modelo_color_y']

pruning_table_dict

{'37000577-30': ['37000577-10'],
 '37010681-99': ['37000577-30', '37000577-10'],
 '37010684-99': ['37010681-99', '37000577-30', '37000577-10'],
 '37010684-CU': ['37010684-99', '37010681-99', '37000577-30', '37000577-10'],
 '37010741-37': ['37010684-CU',
  '37010684-99',
  '37010681-99',
  '37000577-30',
  '37000577-10'],
 '37040047-02': ['37010684-CU',
  '37010684-99',
  '37010681-99',
  '37000577-30',
  '37000577-10',
  '37010741-37'],
 '37050118-PL': ['37010684-CU',
  '37010684-99',
  '37010681-99',
  '37040047-02',
  '37000577-30',
  '37000577-10',
  '37010741-37'],
 '37064382-99': ['37010684-CU',
  '37010684-99',
  '37010681-99',
  '37040047-02',
  '37050118-PL',
  '37000577-30',
  '37000577-10',
  '37010741-37'],
 '41017020-08': ['41005828-05'],
 '41025822-05': ['37095905-56'],
 '41027798-50': ['41025782-23'],
 '41030739-56': ['41017020-08', '41005828-05'],
 '41035760-07': ['41017020-08', '41030739-56', '41005828-05'],
 '41039066-43': ['41025822-05', '37095905-56'],
 '41039067-05'

In [36]:
import networkx as nx

G = nx.Graph()

for row in merged_df.iterrows():
    nodei = row[0]
    for edge in row[1]['edge']:
        nodej = edge[0]
        if not nodei in pruning_table_dict or not nodej in pruning_table_dict[nodei]:
            G.add_edge(nodei, nodej, weight=edge[1])


In [60]:
communities = nx.community.louvain_communities(G, weight='weight', resolution=4)

In [61]:
# create a list of tuples with id and community

communities = list(communities)


data = []

for i, community in enumerate(communities):
    data += [(i, item) for item in community]

df = pd.DataFrame(data, columns=['community_id', 'cod_modelo_color'])
df["community_id"].value_counts()

1      305
33     192
94     184
53     167
4      163
      ... 
88       8
40       3
24       3
32       3
115      2
Name: community_id, Length: 117, dtype: int64

In [66]:
communities = list(communities)
print(len(communities))

{'57085800-01', '57040085-65', '57001168-01', '57077736-43', '57083795-99', '57073816-32', '57033804-DI', '57027717-99', '57094406-43', '57094781-99', '57093801-30', '57083795-32', '57024775-01', '57094024-30', '57084414-02', '57097728-99', '57036728-92', '57024397-05', '57093795-99', '57096016-92', '57063819-02', '57042510-99', '57053799-43', '57055795-70', '57094763-TN', '57042524-OR', '57073808-TM', '57000187-01', '57084388-99', '57045139-99', '57074028-01', '57042003-TC', '57063801-90', '57075801-01', '57064404-08', '57072001-92', '57084764-DI', '57021179-TM', '57007761-06', '57013803-TM', '57056028-43', '57055802-05', '57034407-09', '57096711-08', '57064396-99', '57004401-TM', '57050181-02', '57054419-37', '57044398-05', '57005834-43', '57014778-99', '47052881-10', '57074401-50', '57093802-TN', '57095795-37', '57054415-37', '57073812-01', '57064779-06', '57054386-05', '57013813-05', '57001061-70', '57083801-30', '57096722-TN', '57044407-09', '57084424-06', '57053804-TM', '57040290

In [58]:
compatibilities = dict(zip(merged_df.cod_modelo_color_x, merged_df.cod_modelo_color_y))

with open(path + 'compatibilities.pickle', 'wb') as handle:
    pickle.dump(compatibilities, handle, protocol=pickle.HIGHEST_PROTOCOL)

AttributeError: 'DataFrame' object has no attribute 'cod_modelo_color_x'