In [1]:
!pip install --upgrade networkx

import json
import numpy as np
import pandas as pd
import networkx as nx
import os
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
os.chdir("/content/drive/MyDrive/Learning from Networks/")
!ls

In [None]:
meta_path = 'clean_meta_Apps_for_Android.json'

try:
    meta_df = pd.read_json(meta_path, lines=True)
    meta_df = meta_df[["asin", "categories", "related", "price", "salesRank"]]
    meta_df['price'] = meta_df['price'].fillna(0)
    # meta_df["related"] = meta_df["related"].fillna({'also_bought':[], 'also_viewed':[], 'bought_together':[]})

    def fill_related(row):
        return {'also_bought': [], 'also_viewed': [], 'bought_together': []} if pd.isna(row) else row

    def fix_categories(row):
        return row[0]


    meta_df["related"] = meta_df["related"].apply(fill_related)
    meta_df["categories"] = meta_df["categories"].apply(fix_categories)


    print(meta_df.iloc[61547])
except ValueError as e:
    print(f"Error reading JSON: {e}")


asin                                                 B00M0DLUIQ
categories                      [Apps for Android, Photography]
related       {'also_bought': [], 'also_viewed': [], 'bought...
price                                                       0.0
salesRank                                                   NaN
Name: 61547, dtype: object


In [None]:
# section associated to the rating fields: number_of_ratings, avg_rating
def aggregate(group):
    return pd.Series({
        "number_of_ratings": len(group['Rate'].tolist()),
        "avg_rating": np.average(group['Rate'])
    })


rating_path = './ratings_Apps_for_Android.csv'

rating_df = pd.read_csv(rating_path, names=["User", "asin", "Rate", "Time"])

rating_df = rating_df.groupby("asin").apply(aggregate).reset_index()

print(rating_df)


             asin  number_of_ratings  avg_rating
0      B004A9SDD8               61.0    3.868852
1      B004AFQAUA              231.0    4.004329
2      B004AGCR1K                3.0    1.000000
3      B004AHBBPW              377.0    4.713528
4      B004ALFHV2               12.0    4.583333
...           ...                ...         ...
61270  B00LY9GOLU                1.0    5.000000
61271  B00LY9HZ3G                1.0    5.000000
61272  B00LZ9DVEM                1.0    5.000000
61273  B00M07JQQK                1.0    5.000000
61274  B00M0DLUIQ                1.0    5.000000

[61275 rows x 3 columns]


In [None]:
df = pd.merge(meta_df, rating_df, how='inner', on='asin')

df = df.dropna(axis=1)


# one hot encoding on the category field
categories_one_hot = pd.get_dummies(df['categories'].apply(pd.Series).stack()).sum(level=0)
df = pd.concat([df, categories_one_hot], axis=1)

df = df.drop('categories', axis=1)
df = df.drop('Apps for Android', axis=1)

df['number_of_ratings'] = df['number_of_ratings'].fillna(0)
df['avg_rating'] = df['avg_rating'].fillna(0)

# Print the resulting DataFrame
print(df)

  categories_one_hot = pd.get_dummies(df['categories'].apply(pd.Series).stack()).sum(level=0)


             asin                                            related  price  \
0      B004A9SDD8  {'also_bought': ['B006M3K874', 'B00F85SMOI', '...    0.0   
1      B004AFQAUA  {'also_bought': ['B005UDD1R6', 'B00LBH2UMM', '...    0.0   
2      B004AGCR1K  {'also_viewed': ['B00A7W29BE', 'B00I5PB9UM', '...    0.0   
3      B004AHBBPW  {'also_bought': ['B006T2U1V6', 'B00FBCIYZU', '...    0.0   
4      B004ALFHV2  {'also_bought': ['B00CA6LZAG', 'B00FE5ZF4W', '...    0.0   
...           ...                                                ...    ...   
61270  B00LWU2FKU  {'also_bought': ['B00LJJWBSA', 'B00LUEJYE8', '...    0.0   
61271  B00M0DLUIQ  {'also_bought': [], 'also_viewed': [], 'bought...    0.0   
61272  B00LXGPGAY  {'also_viewed': ['B00LY6KB94', 'B00LWVE1WO', '...    0.0   
61273  B00LY8XFOK  {'also_bought': ['B00LBH2UMM', 'B00KNWYDU8', '...    0.0   
61274  B00M07JQQK  {'also_bought': ['B00KPM74NY', 'B00JTE929I', '...    0.0   

       number_of_ratings  avg_rating  Alarms & Cloc

In [None]:
G = nx.Graph()

G.add_nodes_from(df['asin'])

In [None]:
for i in range(df.shape[0]):
    value = df.iloc[i]
    also_bought = set(value["related"].get("also_bought", []))
    bought_together = set(value["related"].get("bought_together", []))

    valid_also_bought = also_bought.intersection(df['asin'].values)
    valid_bought_together = bought_together.intersection(df['asin'].values)

    G.add_edges_from((value['asin'], j) for j in valid_also_bought)
    G.add_edges_from((value['asin'], j) for j in valid_bought_together)



In [None]:
# nx.write_g

nx.write_gml(G, 'graph.gz')

In [None]:
G = nx.read_gml("graph.gz")

In [None]:
print("The edge size of graph G: ",len(G.edges))
print("The node size of graph G: ",len(G.nodes))

The edge size of graph G:  3282804
The node size of graph G:  61275


In [None]:
# nodes_to_remove = [node for node, degree in dict(G_purchase.degree()).items() if degree == 0]
# G_purchase.remove_nodes_from(nodes_to_remove)


In [None]:
# import matplotlib.pyplot as plt

# nx.draw(G_purchase, with_labels=False, )
