## Considerations

In [None]:
!sudo update-alternatives --config python3

!python3 --version

!sudo apt install python3-pip

In [None]:
# NOTE(review): `pyt` looks like an unfinished shell command — confirm what was
# intended here or remove this cell.
! pyt

Dataset extracted from https://snap.stanford.edu/data/amazon0302.html
- Amazon network https://snap.stanford.edu/data/amazon0302.html
- Metadata https://snap.stanford.edu/data/amazon-meta.html

In [7]:
# Mount Google Drive (Colab only) so the dataset files stored under MyDrive
# can be read by the loading cells further down.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
!pip install igraph


Traceback (most recent call last):
  File "/usr/local/bin/pip3", line 5, in <module>
    from pip._internal.cli.main import main
ModuleNotFoundError: No module named 'pip._internal'


## Importings

In [4]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import igraph as ig
import time
import itertools
import warnings
import pickle

warnings.filterwarnings('ignore')

ModuleNotFoundError: ignored

## Dataset

### Metadata

Parsing the raw metadata and organizing it into a pandas DataFrame

In [None]:
# # read all lines of the meta data into content list
# with open('amazon-meta.txt', encoding='utf8') as f:
#     content = f.readlines()

In [None]:
# remove the beginning and trailing white spaces
# content = [x.strip() for x in content]

In [None]:
# # Write the extracted information to testfile.txt as a comma-delimited file.
# # The columns are Id, title, group, categories, totalreviews, avgrating.
# # The code accumulates the fields extracted for a product in previouslines
# # and writes them to the file only when all six fields are available. Hence,
# # if review information for a product is not available, the product won't appear
# # in the final file.
# file = open("testfile.txt","w", encoding='utf8')
# previouslines = ['Id', 'title', 'group', 'categories', 'totalreviews', 'avgrating']
# for line in content:
#     lines = line.split(':')
#     if lines[0] == "Id":
#         if (len(previouslines) == 6):
#             for component in previouslines[0:5]:
#                 file.write(component)
#                 file.write(',')
#             file.write(previouslines[5])
#             file.write("\n")
#         previouslines = []
#         previouslines.append(lines[1].strip())
        
#     if lines[0] == "title":
#         title = ':'.join(lines[1:]).strip().replace(',', ' ').replace('\n', ' ').strip()
#         previouslines.append(title)
       
#     if lines[0] == "group":
#         previouslines.append(lines[1].strip())

#     if lines[0] == "categories":
#         previouslines.append(lines[1].strip())
    
#     if lines[0] == "reviews" and lines[1].strip() == "total":
#         previouslines.append(lines[2].split(' ')[1])
#         previouslines.append(lines[4].strip())
# file.close()

In [None]:
# Product metadata table, read from Drive.
# (presumably the comma-delimited file produced by the parsing cell above — verify)
metadata_path = 'drive/MyDrive/testfile.txt'
desc = pd.read_csv(metadata_path)

In [None]:
desc.head()

#### EDA

In [None]:
desc.nunique()

In [None]:
desc['group'].value_counts()

In [None]:
# Restrict the metadata to the four product groups analysed below.
kept_groups = ['Book', 'Music', 'DVD', 'Video']
in_kept_group = desc['group'].isin(kept_groups)
desc = desc[in_kept_group]

In [None]:
sns.countplot(data=desc, y='group');

In [None]:
desc.head(2)

In [None]:
desc[['totalreviews', 'avgrating']].corr()

In [None]:
sns.histplot(data=desc, x='totalreviews', bins=10);

In [None]:
sns.histplot(data=desc, x='avgrating');

In [None]:
desc['avgrating'].hist()

### Network

Reading dataframe and merging with metadata information

In [None]:
# Co-purchase edge list: tab-separated, no header row; the first four lines of
# the file are skipped and the two columns are named 'from' and 'to'.
edge_list_path = 'drive/MyDrive/Amazon0302.txt'
df = pd.read_csv(
    edge_list_path,
    sep='\t',
    skiprows=4,
    header=None,
    names=['from', 'to'],
)

In [None]:
# Attach the product metadata to both endpoints of every link.
# Step 1 joins on the source product, step 2 on the target product; the
# metadata columns that collide in step 2 receive the '.from' / '.to' suffixes.
# Both joins use pandas' default join type.
edges_with_source = df.merge(
    desc, left_on='from', right_on='Id', suffixes=('.1', '.from')
)
df = edges_with_source.merge(
    desc, left_on='to', right_on='Id', suffixes=('.from', '.to')
)
del edges_with_source  # intermediate frame is not needed afterwards

In [None]:
df.head(3)

## EDA

Basic exploratory analysis

In [None]:
print('Unique titles:', desc['title'].nunique())
print('Unique Categories:', desc['categories'].nunique())

In [None]:
desc['group'].value_counts()

In [None]:
df.shape

In [None]:
df.head(3)

## Creating graph

### Vertices

In [None]:
df.head(2)

In [None]:
# Candidate edge weights.
# avgrating performed better and indicates the strength of higher ratings.

# 1 when both endpoints carry exactly the same category value, 0 otherwise.
categories_match = df['categories.from'] == df['categories.to']
df['same_category'] = categories_match.astype(int)
# Product of the two endpoints' average ratings.
df['avgrating'] = df['avgrating.from'].mul(df['avgrating.to'])

In [None]:
# Build an undirected graph from the edge table: the first two columns are the
# endpoints, the remaining columns become edge attributes.
# use_vids=False makes igraph treat 'from'/'to' as vertex *names* (stored in the
# 'name' vertex attribute that later cells read) instead of 0-based vertex
# indices. The default of this flag differs between igraph releases, so it is
# set explicitly.
g = ig.Graph.DataFrame(
    df[['from', 'to', 'same_category', 'avgrating']],
    directed=False,
    use_vids=False,
)

In [None]:
g.summary()

In [None]:
g.vs.attribute_names()

In [None]:
# One-column frame holding the degree of every vertex, in vertex order.
degree = pd.Series(g.degree(), name='degree').to_frame()

In [None]:
# Degree distribution: histogram on the left, kernel density estimate on the right.
fig, (ax_hist, ax_kde) = plt.subplots(1, 2, figsize=(10, 4))

sns.histplot(data=degree, x='degree', bins=10, ax=ax_hist)
sns.kdeplot(data=degree, x='degree', ax=ax_kde)
fig.tight_layout();

## Clustering

Selecting only the largest connected component of the network

In [None]:
# Weakly connected components of the graph.
# `components` is used instead of `clusters`, which igraph has deprecated.
cc = g.components(mode='weak')
print('# connected components', len(cc))

In [None]:
# Order the components from largest to smallest and keep them in an object
# array (each entry is one component's list of vertex indices).
components_largest_first = list(cc)
components_largest_first.sort(key=len, reverse=True)
cc = np.array(components_largest_first, dtype='object')

In [None]:
# Sizes of the (up to) five largest components. Slicing instead of indexing
# range(5) avoids an IndexError when the graph has fewer than five components.
print('Top 5 connected networks')
[len(component) for component in cc[:5]]

In [None]:
# Keep only the largest connected component (cc was sorted largest-first above).
# NOTE: this rebinds g to a new, smaller graph, while cc still holds vertex
# indices of the graph it was computed from. After this line, cc must not be
# used to index into g, and re-running this cell on its own would select the
# wrong vertices.
g = g.subgraph(cc[0])

Trying different community detection algorithms without and with the weights created

### Leiden

In [None]:
# Leiden on the unweighted graph.
# igraph's community_leiden optimises CPM unless told otherwise; the objective
# is set to modularity so that the partition is comparable with the Louvain
# runs below (which optimise modularity) and the printed modularity is the
# quantity that was actually optimised.
start = time.time()
coms_leid_raw = g.community_leiden(objective_function='modularity')
print('Time elapsed:', time.time() - start)
print('Modularity:', coms_leid_raw.modularity)

In [None]:
# Leiden using the 'avgrating' edge attribute as weights.
# The objective is set to modularity explicitly (igraph's default for Leiden is
# CPM) so the result is comparable with the weighted Louvain run below.
start = time.time()
coms_leid_w = g.community_leiden(
    objective_function='modularity',
    weights=g.es['avgrating'],
)
print('Time elapsed:', time.time() - start)
print('Modularity:', coms_leid_w.modularity)

### Louvain

In [None]:
# Louvain (multilevel) on the unweighted graph, with wall-clock timing.
t0 = time.time()
coms_louv_raw = g.community_multilevel()
elapsed = time.time() - t0
print('Time elapsed:', elapsed)
print('Modularity:', coms_louv_raw.modularity)

In [None]:
# Louvain (multilevel) using the 'avgrating' edge attribute as weights,
# with wall-clock timing.
t0 = time.time()
edge_weights = g.es['avgrating']
coms_louv_w = g.community_multilevel(weights=edge_weights)
elapsed = time.time() - t0
print('Time elapsed:', elapsed)
print('Modularity:', coms_louv_w.modularity)

### Infomap

In [None]:
# start = time.time()
# coms_info_raw = g.community_infomap()
# print('Time elapsed:', time.time() - start)
# print('Modularity:', coms_info_raw.modularity)

In [None]:
# start = time.time()
# coms_info_w = g.community_infomap(edge_weights=g.es['avgrating'])
# print('Time elapsed:', time.time() - start)
# print('Modularity:', coms_info_w.modularity)

### Save community

In [None]:
# Collect every partition under a readable label; insertion order is the order
# used by the summary and comparison cells below.
coms = dict(
    leiden_raw=coms_leid_raw,
    leiden_weighted=coms_leid_w,
    louvain_raw=coms_louv_raw,
    louvain_weighted=coms_louv_w,
    # infomap_raw=coms_info_raw,
    # infomap_weighted=coms_info_w,
)

In [None]:
# with open('coms.pkl', 'wb') as f:
#     pickle.dump(coms, f)

# with open('coms.pkl', 'rb') as f:
#     coms = pickle.load(f)

## Evaluating clusters

In [None]:
# Start the per-vertex table with the product id stored in each vertex's
# 'name' attribute.
df_coms = pd.Series(g.vs['name'], name='Id').to_frame()

In [None]:
# Add one column per partition holding each vertex's community id.
for partition_label, partition in coms.items():
    df_coms[partition_label] = partition.membership

In [None]:
# df_coms = pd.DataFrame({
#     'Id': g.vs()['name'],
#     names[0]: coms[0].membership,
#     names[1]: coms[1].membership,
#     names[2]: coms[2].membership,
#     names[3]: coms[3].membership,
#     names[4]: coms[4].membership,
#     names[5]: coms[5].membership
# })

In [None]:
# Summary table: modularity and number of communities for every partition.
# The index is taken from the keys of `coms` rather than a hand-written list, so
# the labels cannot get out of step with the values (e.g. if the Infomap
# entries are re-enabled).
names = list(coms)

df_coms_summary = pd.DataFrame(
    {
        'modularity': [partition.modularity for partition in coms.values()],
        'communities': [len(set(partition.membership)) for partition in coms.values()],
    },
    index=names,
).sort_values('modularity')

In [None]:
df_coms_summary

The weighted **Louvain** partition had the highest modularity and contained 72,698 communities (TODO: verify this count).

In [None]:
# Side-by-side horizontal bars sharing the partition labels on the y axis.
fig, (ax_modularity, ax_count) = plt.subplots(1, 2, sharey=True, figsize=(12, 4))

df_coms_summary['modularity'].plot.barh(ax=ax_modularity, title='Modularity')
df_coms_summary['communities'].plot.barh(ax=ax_count, title='# communities');

We can compare the partitions pairwise using normalized mutual information (NMI) and the adjusted Rand index

In [None]:
# Pairwise agreement between every two partitions.
comb_names, comb_nmi, comb_rand = [], [], []

for (label_a, part_a), (label_b, part_b) in itertools.combinations(coms.items(), 2):
    comb_names.append(f'{label_a} & {label_b}')
    comb_nmi.append(ig.compare_communities(part_a, part_b, method='nmi'))
    comb_rand.append(ig.compare_communities(part_a, part_b, method='adjusted_rand'))

In [None]:
# Tabulate the pairwise scores, one row per pair, ordered by NMI.
df_comb = pd.DataFrame(
    data={'nmi': comb_nmi, 'adjusted_rand': comb_rand},
    index=comb_names,
)
df_comb = df_comb.sort_values(by='nmi')

df_comb

In [None]:
# Side-by-side horizontal bars sharing the pair labels on the y axis.
fig, (ax_nmi, ax_rand) = plt.subplots(1, 2, sharey=True, figsize=(12, 4))
df_comb['nmi'].plot.barh(ax=ax_nmi, title='NMI')
df_comb['adjusted_rand'].plot.barh(ax=ax_rand, title='Adjusted rand');

## Louvain weighted

Selecting the largest community of the weighted Louvain partition

In [None]:
# Graph of the largest community found by weighted Louvain.
# NOTE(review): relies on giant() returning the largest cluster as a subgraph —
# confirm against the igraph VertexClustering documentation.
g_giant_louv_w = coms['louvain_weighted'].giant()

In [None]:
# Per-vertex structural metrics for the selected community, one row per product.
giant_metrics = {
    'Id': g_giant_louv_w.vs['name'],
    'degree': g_giant_louv_w.degree(),
    'betweenness': g_giant_louv_w.betweenness(),
    'clustering_coefficient': g_giant_louv_w.transitivity_local_undirected(),
}
df_giant_louv_w = pd.DataFrame(giant_metrics)

In [None]:
# Cast the product id to int ahead of the join with the metadata on 'Id'.
df_giant_louv_w = df_giant_louv_w.astype({'Id': int})

In [None]:
# Add title, group, categories and review statistics for each product.
df_giant_louv_w = pd.merge(df_giant_louv_w, desc, on='Id')

In [None]:
df_giant_louv_w.shape

We can check the most central products of the largest community: those above the 95th percentile in both betweenness and degree

In [None]:
# Products whose betweenness AND degree both exceed the 95th percentile.
betweenness_p95 = np.quantile(df_giant_louv_w['betweenness'], 0.95)
degree_p95 = np.quantile(df_giant_louv_w['degree'], 0.95)

is_hub = (df_giant_louv_w['betweenness'] > betweenness_p95) & (
    df_giant_louv_w['degree'] > degree_p95
)
df_giant_louv_w[is_hub]

In [None]:
sns.pairplot(data=df_giant_louv_w[['degree', 'betweenness', 'clustering_coefficient', 'group']], hue='group')

## Visualization

In [None]:
# considering the top 2nd most connected network
# `g` was replaced by its largest-component subgraph in the Clustering section,
# so the vertex indices stored in `cc` no longer address vertices of `g`, and
# `g` itself now has a single component. Rebuild the full graph from the edge
# table and take its second-largest weakly connected component instead.
g_full = ig.Graph.DataFrame(
    df[['from', 'to', 'same_category', 'avgrating']], directed=False
)
full_components = sorted(g_full.components(mode='weak'), key=len, reverse=True)
g_sub = g_full.subgraph(full_components[1])

In [None]:
# Lay out the sub-network with the 'fr' layout and draw it.
layout_fr = g_sub.layout('fr')

# Plot styling. "edge_width" used to appear twice in this dict (0.5, then 1);
# only the last value took effect, so the dead first entry was removed and the
# effective value (1) is kept.
visual_style = {
    "vertex_size": 5,
    "vertex_color": 'orange',
    "edge_arrow_size": 0.5,
    "vertex_label": None,
    "layout": layout_fr,
    "bbox": (600, 600),
    "edge_width": 1
}

ig.plot(g_sub, **visual_style)