In [2]:
!pip install --upgrade networkx

import json
import numpy as np
import pandas as pd
import networkx as nx
import os
from google.colab import drive





In [3]:
# Paths
project_dir = "Learning from Networks/"
meta_path = 'clean_meta_Software.json'
ratings_path = 'ratings_Software.csv'

In [4]:
# Mounting google drive and change dir
drive.mount('/content/drive', force_remount=True)
os.chdir(os.path.join("/content/drive/MyDrive/",project_dir))
!ls

Mounted at /content/drive
clean_meta_Software.json  ratings_Software.csv	relations.csv


In [5]:
try:
    meta_df = pd.read_json(meta_path, lines=True)
    meta_df = meta_df[["asin", "categories", "related", "price", "salesRank"]]
    meta_df['price'] = meta_df['price'].fillna(0)
    # meta_df["related"] = meta_df["related"].fillna({'also_bought':[], 'also_viewed':[], 'bought_together':[]})

    def fill_related(row):
      row = {'also_bought': [], 'also_viewed': [], 'bought_together': []} if pd.isna(row) else row
      row.setdefault('also_bought', [])
      row.setdefault('also_viewed', [])
      row.setdefault('bought_together', [])
      # 'also_bought'] = [] if pd.isna(row['also_bought']) else row['also_bought']
      # row['also_viewed'] = [] if pd.isna(row['also_viewed']) else row['also_viewed']
      # row['bought_together'] = [] if pd.isna(row['bought_together']) else row['bought_together']

      return row

    def fix_categories(row):
        return row[0]


    meta_df["related"] = meta_df["related"].apply(fill_related)
    meta_df["categories"] = meta_df["categories"].apply(fix_categories)


    print(meta_df)
except ValueError as e:
    print(f"Error reading JSON: {e}")


             asin                                         categories  \
0      0030429137  [Software, Business & Office, Visualization & ...   
1      0077338030   [Software, Education & Reference, Encyclopedias]   
2      0078798019                  [Software, Education & Reference]   
3      0080964184                            [Software, Photography]   
4      0131433172                                  [Movies & TV, TV]   
...           ...                                                ...   
18464  B00LEZEY4S                      [Software, Utilities, Backup]   
18465  B00LK2K692  [Software, Programming & Web Development, Prog...   
18466  B00LK2WEWY  [Software, Programming & Web Development, Prog...   
18467  B00LK1W8O4  [Software, Accounting & Finance, Personal Fina...   
18468  B00LNVBOG4   [Software, Education & Reference, Encyclopedias]   

                                                 related  price  \
0      {'also_bought': ['0030429145'], 'also_viewed':...   0.00   
1

In [38]:
# Prepare co-purchase relationships
def prepare_relations():
    relations_df = pd.DataFrame([], columns=["source", "target"])

    for index, row in meta_df.iterrows():
        bought = set(row["related"]['also_bought']+row["related"]['bought_together'])

        for target in bought:
            relations_df = pd.concat([relations_df, pd.DataFrame([{"source": row["asin"], "target": target}])], ignore_index=True)

    return relations_df

relations_df = prepare_relations()

In [40]:
relations_df.to_csv('relations.csv', index=False)
relations_df

Unnamed: 0,source,target
0,0030429137,0030429145
1,0077338030,0078111048
2,0077338030,0471479519
3,0078798019,0078619718
4,0321636864,0321624785
...,...,...
74190,B00KQV8RNK,B00CTTEKJW
74191,B00KQV8RNK,B00KMECBVA
74192,B00KQV8RNK,B00KQV8DJ8
74193,B00KQV8RNK,B00JKKMTX2


In [6]:
# section associated to the rating fields: number_of_ratings, avg_rating
def aggregate(group):
    return pd.Series({
        "number_of_ratings": len(group['Rate'].tolist()),
        "avg_rating": np.average(group['Rate'])
    })


rating_df = pd.read_csv(ratings_path, names=["User", "asin", "Rate", "Time"])

rating_df = rating_df.groupby("asin").apply(aggregate).reset_index()

print(rating_df)


             asin  number_of_ratings  avg_rating
0      0030429137                2.0         4.0
1      0077338030                1.0         1.0
2      0078798019                1.0         5.0
3      0080964184                5.0         2.4
4      0131433172                1.0         5.0
...           ...                ...         ...
18182  B00LBETECY                2.0         5.0
18183  B00LEZEY4S                3.0         5.0
18184  B00LK1W8O4                5.0         4.8
18185  B00LK2K692                1.0         5.0
18186  B00LK2WEWY                1.0         5.0

[18187 rows x 3 columns]


In [36]:
df = meta_df.copy()

input_dataset = pd.DataFrame(columns=['source', 'target', 'rel'])

lastIndex = df.last_valid_index()
positive_results = 0
total_count = 0
for i in range(0, lastIndex):
  node1 = df.loc[i].to_dict()
  if len(node1['related']['also_viewed']) == 0 : continue

  cond = df['asin'].isin(node1['related']['also_viewed'])
  for i, node2 in df.query('@cond').iterrows():
    # node2 = df.loc[j].to_dict()

    if node2['asin'] in node1['related']['also_bought']+node1['related']['bought_together'] \
      or node1['asin'] in node2['related']['also_bought']+node2['related']['bought_together']:
      new_row = pd.DataFrame([{"source": node1["asin"], "target": node2['asin'], "rel": 1}])
      positive_results+=1
    else:
      new_row = pd.DataFrame([{"source": node1["asin"], "target": node2['asin'], "rel": 0}])
    input_dataset = pd.concat([input_dataset, new_row], ignore_index=True)

    total_count += 1

    if total_count % 5000 == 0:
      print('positive results:', positive_results, 'of', total_count, f'({positive_results/total_count}%)')




positive results: 118 of 5000 (0.0236%)
positive results: 211 of 10000 (0.0211%)
positive results: 299 of 15000 (0.019933333333333334%)
positive results: 374 of 20000 (0.0187%)
positive results: 491 of 25000 (0.01964%)
positive results: 590 of 30000 (0.019666666666666666%)
positive results: 692 of 35000 (0.019771428571428572%)
positive results: 787 of 40000 (0.019675%)
positive results: 869 of 45000 (0.019311111111111112%)


In [37]:
input_dataset

Unnamed: 0,source,target,rel
0,0080964184,B002CIP12U,0
1,0080964184,B00CH6ATMO,0
2,0080964184,B00CH6AWOY,0
3,0080964184,B00CH6AXPW,0
4,0080964184,B00EOI2SR2,0
...,...,...,...
45404,B00LEZEY4S,B00GLQXJFK,0
45405,B00LEZEY4S,B00GOUBT6E,0
45406,B00LEZEY4S,B00HME9UWQ,0
45407,B00LEZEY4S,B00HP034BA,0




In [44]:
df = pd.merge(meta_df, rating_df, how='inner', on='asin')

df = df.dropna(axis=1)


# one hot encoding on the category field
categories_one_hot = pd.get_dummies(df['categories'].apply(pd.Series).stack()).sum(level=0)
df = pd.concat([df, categories_one_hot], axis=1)

df = df.drop('categories', axis=1)
# df = df.drop('Apps for Android', axis=1)

df['number_of_ratings'] = df['number_of_ratings'].fillna(0)
df['avg_rating'] = df['avg_rating'].fillna(0)

# Print the resulting DataFrame
print(df)

  categories_one_hot = pd.get_dummies(df['categories'].apply(pd.Series).stack()).sum(level=0)


             asin                                            related  price  \
0      0030429137  {'also_bought': ['0030429145'], 'also_viewed':...   0.00   
1      0077338030  {'also_bought': ['0078111048', '0471479519'], ...   0.00   
2      0078798019  {'also_bought': ['0078619718'], 'also_viewed':...   0.00   
3      0080964184  {'also_viewed': ['B002CIP12U', 'B00CH6ATMO', '...  99.00   
4      0131433172  {'also_bought': [], 'also_viewed': [], 'bought...   0.00   
...           ...                                                ...    ...   
18182  B00LBETECY  {'also_bought': [], 'also_viewed': [], 'bought...   0.00   
18183  B00LEZEY4S  {'also_viewed': ['B00L4ED7Y2', 'B00HP034BA', '...  45.51   
18184  B00LK2K692  {'also_bought': [], 'also_viewed': [], 'bought...   0.00   
18185  B00LK2WEWY  {'also_bought': [], 'also_viewed': [], 'bought...   0.00   
18186  B00LK1W8O4  {'also_bought': [], 'also_viewed': [], 'bought...   0.00   

       number_of_ratings  avg_rating  Accessories  

In [None]:
G = nx.Graph()

G.add_nodes_from(df['asin'])

In [None]:
for i in range(df.shape[0]):
    value = df.iloc[i]
    also_bought = set(value["related"].get("also_bought", []))
    bought_together = set(value["related"].get("bought_together", []))

    valid_also_bought = also_bought.intersection(df['asin'].values)
    valid_bought_together = bought_together.intersection(df['asin'].values)

    G.add_edges_from((value['asin'], j) for j in valid_also_bought)
    G.add_edges_from((value['asin'], j) for j in valid_bought_together)



In [None]:
# nx.write_g

nx.write_gml(G, 'graph.gz')

In [None]:
G = nx.read_gml("graph.gz")

In [None]:
print("The edge size of graph G: ",len(G.edges))
print("The node size of graph G: ",len(G.nodes))

In [None]:
# nodes_to_remove = [node for node, degree in dict(G_purchase.degree()).items() if degree == 0]
# G_purchase.remove_nodes_from(nodes_to_remove)


In [None]:
# import matplotlib.pyplot as plt

# nx.draw(G_purchase, with_labels=False, )
