###### May 2022
###### Author: Martina Lorenzo Valerio 

# Data Visualisation: Amazon product co-purchasing network metadata

### Import libraries

In [None]:
import itertools
import re
import string

import networkx
from nltk.corpus import stopwords
from stemming.porter2 import stem


In [24]:
import nltk
# Fetch the NLTK stop-word corpus that `stopwords.words("english")` reads during preprocessing.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/martinamanno/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [47]:
# Open the Amazon product metadata dump for reading.
# Source: http://snap.stanford.edu/data/amazon-meta.html
# Undecodable bytes are dropped (errors='ignore') instead of aborting the read.
fhr = open(
    '/Users/martinamanno/Desktop/LUISS/CORSI da seguire/DATA VISUALIZATION/project/amazon-meta.txt',
    mode='r',
    encoding='utf-8',
    errors='ignore',
)

### Description of the dataset
The dataset provides product and review metadata on 548,552 different products. The data was collected in 2006 by crawling the Amazon website. The dataset contains information about:
- ID: product number
- ASIN: Amazon Standard Identification Number is a 10-character alphanumeric unique identifier assigned by Amazon.com for product identification
- Title: title of the product
- Group: the product groups can be Book, DVD, Video or Music
- Sales rank: the Amazon sales rank represents how a product is selling in comparison to other products in its primary category. The lower the rank, the better a product is selling
- Similar: ASINs of co-purchased products, such as people who buy X also buy Y
- Categories: location in product category hierarchy to which the product belongs (separated by |, category id in [])
- Reviews: product review information such as the total number of reviews, average rating and the individual customer review information including time, user id, rating, total number of votes on the review, total number of helpfulness votes which means how many people found the review to be helpful


 

### Dataset preprocessing

Initialize a nested product dictionary that will hold cleaned up amazon product data. 



In [48]:
# ASIN -> metadata dictionary; filled in by the parsing cell below.
amazonProducts = dict()

Before analyzing the social network, some processes are required to use ASIN as the key and the others as the metadata associated with ASIN. 
- All the categories associated with the ASIN are concatenated, and then they are subject to text preprocessing steps: lowercasing, stemming, removing digits and punctuation, removing stop words, and retaining only unique words.
- The copurchased ASINs in the “similar” field are filtered down to only those ASINs that have metadata associated with them.

In [49]:
# Parse the raw metadata dump read through `fhr` into `amazonProducts`.
#
# The file is a sequence of product blocks separated by blank lines. Each block
# is read field by field into the working variables below; when the blank line
# that ends the block is reached, the collected values are stored in
# `amazonProducts` under the block's ASIN and the working variables are reset.
#
# Changes with respect to the previous version of this cell:
#   * the file is actually closed (`fhr.close` without parentheses was a no-op),
#     and it is closed even if parsing raises;
#   * the stop-word set and the digit/punctuation regex are built once instead
#     of once per product;
#   * the `except NameError: continue` guard is gone: the only name it could
#     mask is a missing `amazonProducts`, which should fail loudly;
#   * the last product is stored even if the file does not end with a blank line;
#   * category words are stored in sorted order so reruns give identical output.

# Loop-invariant helpers.
digitsAndPunctuation = re.compile('[%s]' % re.escape(string.digits+string.punctuation))
englishStopwords = set(stopwords.words("english"))

(Id, ASIN, Title, Categories, Group, Copurchased, SalesRank, TotalReviews, AvgRating, DegreeCentrality, ClusteringCoeff) = ("", "", "", "", "", "", 0, 0, 0.0, 0, 0.0)

try:
    # The trailing "" acts as a sentinel blank line so that the final product
    # block is flushed even when the file has no terminating blank line.
    for line in itertools.chain(fhr, [""]):
        line = line.strip()
        # a product block started
        if line.startswith("Id"):
            Id = line[3:].strip()
        elif line.startswith("ASIN"):
            ASIN = line[5:].strip()
        elif line.startswith("title"):
            Title = line[6:].strip()
            # collapse runs of whitespace inside the title
            Title = ' '.join(Title.split())
        elif line.startswith("group"):
            Group = line[6:].strip()
        elif line.startswith("salesrank"):
            SalesRank = line[10:].strip()
        elif line.startswith("similar"):
            # the first two tokens are the field label and the item count;
            # the remaining tokens are the co-purchased ASINs
            ls = line.split()
            Copurchased = ' '.join(ls[2:])
        elif line.startswith("categories"):
            ls = line.split()
            # ls[1] is the number of category lines that follow; read exactly
            # that many lines straight from the file handle
            Categories = ' '.join((fhr.readline()).lower() for i in range(int(ls[1].strip())))
            # text preprocessing: drop digits/punctuation, drop stop words,
            # keep unique words, then stem
            Categories = digitsAndPunctuation.sub(' ', Categories)
            Categories = ' '.join(set(Categories.split()) - englishStopwords)
            Categories = ' '.join(stem(word) for word in Categories.split())
        elif line.startswith("reviews"):
            # token 2 is the total review count, token 7 the average rating
            ls = line.split()
            TotalReviews = ls[2].strip()
            AvgRating = ls[7].strip()
        # a product block ended
        # write out fields to amazonProducts Dictionary
        elif line == "":
            if ASIN != "":
                amazonProducts[ASIN] = {
                    'Id': Id,
                    'Title': Title,
                    # stemming can map different words to the same stem, so
                    # de-duplicate again; sorting keeps the output reproducible
                    'Categories': ' '.join(sorted(set(Categories.split()))),
                    'Group': Group,
                    'Copurchased': Copurchased,
                    'SalesRank': int(SalesRank),
                    'TotalReviews': int(TotalReviews),
                    'AvgRating': float(AvgRating),
                    'DegreeCentrality': DegreeCentrality,
                    'ClusteringCoeff': ClusteringCoeff,
                }
            (Id, ASIN, Title, Categories, Group, Copurchased, SalesRank, TotalReviews, AvgRating, DegreeCentrality, ClusteringCoeff) = ("", "", "", "", "", "", 0, 0, 0.0, 0, 0.0)
finally:
    fhr.close()

<function TextIOWrapper.close()>

Filter Amazon products dictionary down to only Group=Book and write it to Amazon books dictionary.

In [50]:
# Keep only the products whose group is 'Book'. The metadata dictionaries are
# shared with amazonProducts (no copies are made).
amazonBooks = {
    asin: metadata
    for asin, metadata in amazonProducts.items()
    if metadata['Group'] == 'Book'
}

# Drop every co-purchased ASIN for which no book metadata is available.
for metadata in amazonBooks.values():
    knownCopurchases = [cp for cp in metadata['Copurchased'].split() if cp in amazonBooks]
    metadata['Copurchased'] = ' '.join(knownCopurchases)
    

Use the copurchase data in Amazon books dictionary to create the copurchase graph structure as follows:
- NODES: are the ASINs
- EDGES: exist between two Nodes if the two ASINs were co-purchased
- EDGE WEIGHT: it is based on category similarity
- SIMILARITY: a measure between any two ASINs that were co-purchased, calculated as the number of words common to the categories of the two connected nodes divided by the total number of distinct words across both categories (the size of the intersection over the size of the union, i.e. the Jaccard index). The similarity ranges from 0 (most dissimilar) to 1 (most similar).

 

In [51]:
# Build the co-purchase graph.
# Nodes are book ASINs; an edge links two books that were co-purchased, and its
# weight is the category similarity of the two books (shared category words
# divided by all distinct category words, rounded to two decimals).
copurchaseGraph = networkx.Graph()
for asin, metadata in amazonBooks.items():
    copurchaseGraph.add_node(asin)
    ownWords = set(amazonBooks[asin]['Categories'].split())
    for other in metadata['Copurchased'].split():
        otherAsin = other.strip()
        copurchaseGraph.add_node(otherAsin)
        otherWords = set(amazonBooks[other]['Categories'].split())
        sharedWords = ownWords & otherWords  # words common to both category sets
        allWords = ownWords | otherWords     # every distinct word in either set
        # two books without any category words get similarity 0
        if len(allWords) > 0:
            similarity = round(len(sharedWords) / len(allWords), 2)
        else:
            similarity = 0
        copurchaseGraph.add_edge(asin, otherAsin, weight=similarity)

Add the graph-related measures for each ASIN node to the Amazon books dictionary: degree centrality and clustering coefficient.

In [52]:
# Attach two graph measures to every book that appears in the co-purchase graph:
#   DegreeCentrality - the node's degree (number of incident edges)
#   ClusteringCoeff  - average clustering of the node's radius-1 ego network
dc = copurchaseGraph.degree
for asin in copurchaseGraph.nodes:
    egoNetwork = networkx.ego_graph(copurchaseGraph, asin, radius=1)
    bookRecord = amazonBooks[asin]
    bookRecord['DegreeCentrality'] = int(dc[asin])
    bookRecord['ClusteringCoeff'] = round(networkx.average_clustering(egoNetwork), 2)

Now, write out Amazon books data to the amazon-books.txt file and the copurchase graph data to the amazon-books-copurchase.edgelist file.


In [53]:
# Export the book metadata as a tab-separated file (one header line, then one
# line per book) and the co-purchase graph as a weighted edge list.
# Both files are written inside `with` blocks so the handles are closed even
# if a write fails part-way through.
bookColumns = ['Id', 'ASIN', 'Title',
               'Categories', 'Group', 'Copurchased',
               'SalesRank', 'TotalReviews', 'AvgRating',
               'DegreeCentrality', 'ClusteringCoeff']

with open('./amazon-books.txt', 'w', encoding = 'utf-8', errors = 'ignore') as fhw:
    fhw.write('\t'.join(bookColumns) + '\n')
    for asin, metadata in amazonBooks.items():
        # the ASIN is the dictionary key, every other column lives in the metadata
        fields = [asin if column == 'ASIN' else str(metadata[column]) for column in bookColumns]
        fhw.write('\t'.join(fields) + '\n')

# write copurchaseGraph to file
with open('amazon-books-copurchase.edgelist', 'wb') as fhw:
    networkx.write_weighted_edgelist(copurchaseGraph, fhw)

### Plots


In [56]:
# Reload the co-purchase graph from the edge list written earlier.
# Uses the `networkx` name imported at the top of the notebook: the `nx` alias
# is only created in a later cell, so relying on it here fails on a fresh kernel.
with open('amazon-books-copurchase.edgelist', 'rb') as fhr:
    copurchaseGraph = networkx.read_weighted_edgelist(fhr)

### Network analysis


In [59]:
import networkx as nx
import seaborn as sns
# Degree centrality of every node, then the 50 highest-scoring nodes in
# descending order.
deg_cen_g_copurchase = nx.degree_centrality(copurchaseGraph)
sorted_deg_cen_g_copurchase = sorted(
    deg_cen_g_copurchase.items(),
    key=lambda pair: pair[1],
    reverse=True,
)[:50]
print(sorted_deg_cen_g_copurchase)

# Split the (ASIN, centrality) pairs into two parallel lists for plotting.
ASIN = [node for node, _ in sorted_deg_cen_g_copurchase]
degree = [centrality for _, centrality in sorted_deg_cen_g_copurchase]

#sns.set(rc={'figure.figsize':(15,10)})
#ax = sns.barplot(x = ASIN, y = degree, palette = sns.color_palette("rocket"))
#ax.set_xticklabels(ax.get_xticklabels(), rotation = 45)


[('0890420254', 0.0011984641903338685), ('1557987912', 0.0009506336324561858), ('0684801523', 0.0008507616165950301), ('0486291138', 0.0008359657623933774), ('0805047905', 0.0007989761268892457), ('0875163238', 0.0007952771633388325), ('0486280861', 0.0007767823455867666), ('0553212788', 0.0007582875278347007), ('0192833723', 0.0007249968558809822), ('0671722905', 0.0007249968558809822), ('096290497X', 0.0007249968558809822), ('0812550927', 0.0006658134390743714), ('0803606540', 0.0006621144755239582), ('0689710550', 0.0006510175848727187), ('0451527046', 0.0006288238035702397), ('0316769487', 0.0006103289858181738), ('0152026320', 0.0006029310587173474), ('0761924426', 0.0005918341680661079), ('0140157352', 0.0005844362409652816), ('0486411214', 0.0005622424596628025), ('0130336297', 0.0005585434961123893), ('0465084486', 0.0005585434961123893), ('0689714734', 0.0005474466054611498), ('0553210092', 0.0005437476419107366), ('0440404959', 0.0005363497148099102), ('014131088X', 0.0005363

In [60]:
# Look up the neighbours of ASIN '0890420254', the first entry in the
# degree-centrality ranking printed above.
# `neigh` is assigned only if at least one edge lists that ASIN as its first
# endpoint. The previous version referenced an undefined `purchaseGraph`
# (NameError) and repeated the same adjacency lookup for every matching edge.
hubAsin = '0890420254'
hubAdjacency = copurchaseGraph.adj[hubAsin]

for source, _target, _attributes in copurchaseGraph.edges(data=True):
    if source == hubAsin:
        neigh = hubAdjacency
        break
        

[('0827229534', '0804215715', {'weight': 0.7}),
 ('0827229534', '156101074X', {'weight': 0.5}),
 ('0827229534', '0687023955', {'weight': 0.8}),
 ('0827229534', '0687074231', {'weight': 0.8}),
 ('0827229534', '082721619X', {'weight': 0.7}),
 ('0827229534', '0805415505', {'weight': 0.7}),
 ('0827229534', '0802842623', {'weight': 0.7}),
 ('0827229534', '0800628411', {'weight': 0.7}),
 ('0804215715', '0687179246', {'weight': 0.7}),
 ('0804215715', '0687173094', {'weight': 0.75}),
 ('0804215715', '0805415505', {'weight': 1.0}),
 ('0804215715', '0687023955', {'weight': 0.9}),
 ('0804215715', '0687336481', {'weight': 0.78}),
 ('0804215715', '0801021979', {'weight': 0.39}),
 ('0804215715', '082720230X', {'weight': 0.5}),
 ('0804215715', '156101074X', {'weight': 0.39}),
 ('0804215715', '0802801323', {'weight': 0.6}),
 ('0804215715', '0801090598', {'weight': 0.33}),
 ('0804215715', '0687066301', {'weight': 0.78}),
 ('0804215715', '0800620968', {'weight': 1.0}),
 ('0804215715', '0825420733', {'we

In [72]:
# Average similarity weight over all edges incident to ASIN '0890420254'.
neigh = copurchaseGraph.edges('0890420254', data=True)

# each edge is (node, neighbour, attribute dict); keep only the weight
weight = [attributes['weight'] for _, _, attributes in neigh]

import numpy as np
np.mean(weight)

0.42185185185185187

### Network visualization