In [1]:
import pandas as pd
import glob
from sklearn.externals import joblib as jl
import os
from urllib.parse import urlparse
import numpy as np
import tldextract

%matplotlib inline

In [2]:
def _read_referring_urls(csv_path):
    """Load the referring-URL column from one backlink export file.

    The export format is recognised by a marker substring in ``csv_path``;
    each format stores its URLs under a different column name.

    Args:
        csv_path: Path of a CSV file found under a domain directory.

    Returns:
        A ``(urls, source)`` tuple, where ``urls`` is a pandas Series of URLs
        and ``source`` is 'AHREFS', 'MAJESTIC' or 'GOOGLE'; ``None`` when the
        path contains none of the known markers.
    """
    if 'backlinks-subdomains-recent' in csv_path:
        # This export is read as tab-separated UTF-16 text.
        urls = pd.read_csv(csv_path, usecols=['Referring Page URL'],
                           sep='\t', encoding='utf-16')['Referring Page URL']
        return urls, 'AHREFS'
    if 'download_fresh_' in csv_path:
        return pd.read_csv(csv_path, usecols=['Source URL'])['Source URL'], 'MAJESTIC'
    if 'ExternalLinks_SampleLinks' in csv_path:
        return pd.read_csv(csv_path, usecols=['Links'])['Links'], 'GOOGLE'
    return None


def ref_domains_extract(data_path, output_path='data/ref_domains_crawl2.jl'):
    """Collect the unique referring domains per site and source, and save them.

    Every entry of ``data_path`` is treated as one site; each ``*.csv`` file
    inside it is read according to its export format, its URLs are reduced to
    their registered domain with ``tldextract``, and the distinct values are
    recorded together with the site name and the source label.

    Args:
        data_path: Directory holding one sub-directory per site.
        output_path: Where the resulting DataFrame (columns ``domain``,
            ``ref_domain``, ``source``) is dumped with joblib.

    Returns:
        None. The result is written to ``output_path``; 'Done:' and the number
        of entries found in ``data_path`` are printed.
    """
    columns = ['domain', 'ref_domain', 'source']
    domains = os.listdir(data_path)
    frames = []
    print('Done:')
    for dom in domains:
        for f in glob.glob("{}/{}/*.csv".format(data_path, dom)):
            loaded = _read_referring_urls(f)
            if loaded is None:
                # Unrecognised CSV: skip it rather than reuse the URLs and
                # source label left over from the previously processed file.
                continue
            links, source = loaded
            # Blank cells are not URLs; drop them before parsing.
            ref_domains = links.dropna().apply(
                lambda x: tldextract.extract(x).registered_domain)
            df = pd.DataFrame({'ref_domain': ref_domains.unique()})
            df['domain'] = dom
            df['source'] = source
            frames.append(df)
    print(len(domains))

    # Concatenate once at the end instead of growing a frame inside the loop.
    if frames:
        data = pd.concat(frames, ignore_index=True)[columns]
    else:
        data = pd.DataFrame(columns=columns)
    jl.dump(data, output_path)

In [3]:
# %%time
# data_path = 'data/GSC Ahrefs Majestic'
# ref_domains_extract(data_path)
# The call above is disabled; the output recorded under this cell comes from an earlier run
# that took roughly 15 minutes of wall time. Presumably it is kept commented out so later
# cells can reuse the saved dump; uncomment it to rebuild the dump.

Done:
104
Wall time: 14min 36s


In [6]:
# Sanity check of the domain reduction: for a country-specific Blogspot host, tldextract
# returns 'blogspot.com.ee' as the registered domain, so individual blogs under that
# suffix are not distinguished from one another.
ext = tldextract.extract('3dprintingninja.blogspot.com.ee')
ext.registered_domain

'blogspot.com.ee'

In [7]:
# Load a saved extraction, drop exact duplicate rows and renumber the index.
# NOTE(review): this reads 'ref_domains_crawl.jl', while ref_domains_extract writes
# 'data/ref_domains_crawl2.jl' and a later cell reloads data0 from that file; confirm
# which dump is intended here.
# joblib dumps are pickle-based: only load files produced by a trusted run.
data0 = jl.load('data/ref_domains_crawl.jl').drop_duplicates().reset_index(drop=True)

In [None]:
import re

# Rows whose ref_domain is a bare dotted-quad (IPv4-style) address instead of a host name.
# Raw string so that \d and \. reach the regex engine without invalid-escape warnings.
pat = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
# pat.search returns a Match object or None, never the value True, so comparing the result
# with True selects nothing; test explicitly for "a match was found" instead.
data0[data0.ref_domain.apply(lambda x: pat.search(x) is not None)]

In [2]:
# Working dataset: the crawl-2 dump with exact duplicate rows removed and the index
# renumbered from zero.
data0 = (
    jl.load('data/ref_domains_crawl2.jl')
    .drop_duplicates()
    .reset_index(drop=True)
)
data0.head()

Unnamed: 0,domain,ref_domain,source
0,10edges.com,blogspot.com.ee,GOOGLE
1,10edges.com,blogspot.com,GOOGLE
2,10edges.com,blogspot.fr,GOOGLE
3,10edges.com,blogspot.in,GOOGLE
4,10edges.com,blogspot.nl,GOOGLE


### If all unique referring domains found across the 3 sources count as 100%, what percentage of them did each individual source find?

Method 1: calculate the percentage of unique referring domains found by each source for each of the 104 sites separately, then take the mean and median across all 104 sites.

In [3]:
# One indicator column per source label, placed next to the original columns;
# the textual `source` column is then no longer needed.
binarize = pd.get_dummies(data0.source)
data = pd.concat((data0, binarize), axis='columns').drop('source', axis='columns')
data.head()

Unnamed: 0,domain,ref_domain,AHREFS,GOOGLE,MAJESTIC
0,10edges.com,blogspot.com.ee,0,1,0
1,10edges.com,blogspot.com,0,1,0
2,10edges.com,blogspot.fr,0,1,0
3,10edges.com,blogspot.in,0,1,0
4,10edges.com,blogspot.nl,0,1,0


In [4]:
# Row count and dtypes of the indicator frame (still one row per domain / ref_domain / source record).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136262 entries, 0 to 136261
Data columns (total 5 columns):
domain        136262 non-null object
ref_domain    136262 non-null object
AHREFS        136262 non-null uint8
GOOGLE        136262 non-null uint8
MAJESTIC      136262 non-null uint8
dtypes: object(2), uint8(3)
memory usage: 2.5+ MB


In [5]:
# Collapse to one row per (domain, ref_domain) pair: a source flag ends up 1 when any
# of the pair's records came from that source. The 'max' string alias is used because
# newer pandas releases warn when the builtin `max` is passed to agg; the result is
# the same.
data = data.groupby(['domain', 'ref_domain'], as_index=False).agg(
    {'AHREFS': 'max', 'GOOGLE': 'max', 'MAJESTIC': 'max'})
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86831 entries, 0 to 86830
Data columns (total 5 columns):
domain        86831 non-null object
ref_domain    86831 non-null object
AHREFS        86831 non-null uint8
GOOGLE        86831 non-null uint8
MAJESTIC      86831 non-null uint8
dtypes: object(2), uint8(3)
memory usage: 2.2+ MB


In [6]:
# Each row is a (domain, ref_domain) pair with one 0/1 flag per source.
data.head()

Unnamed: 0,domain,ref_domain,AHREFS,GOOGLE,MAJESTIC
0,10edges.com,academicworks.com,1,1,1
1,10edges.com,alamatpon.com,1,0,0
2,10edges.com,allwomenstalk.com,1,1,0
3,10edges.com,americanbookreview.org,0,0,1
4,10edges.com,answers.com,0,1,0


In [7]:
# Method 1, step 1: for every site, the mean of each 0/1 flag is the share of that
# site's referring domains reported by the source. The 'mean' string alias replaces
# np.mean, which newer pandas releases warn about when passed to agg; the result is
# the same.
domain_ave = data.groupby(['domain']).agg(
    {'AHREFS': 'mean', 'GOOGLE': 'mean', 'MAJESTIC': 'mean'})
domain_ave.head()

Unnamed: 0_level_0,AHREFS,GOOGLE,MAJESTIC
domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10edges.com,0.48913,0.663043,0.467391
4quotes4me.co.uk,0.56821,0.306633,0.558198
Exploringthisrock.com,0.405556,0.538889,0.383333
aaronneo.com.au,0.378378,0.810811,0.121622
aionhill.com,0.61742,0.577729,0.326351


In [8]:
# Method 1, step 2: mean and median of the per-site shares, one row per source.
pd.concat(
    [domain_ave.mean(), domain_ave.median()],
    axis=1,
    keys=['mean', 'median'],
).sort_index()

Unnamed: 0,mean,median
AHREFS,0.518496,0.529555
GOOGLE,0.600211,0.619187
MAJESTIC,0.391123,0.400431


Method 2: treat all 104 domains as one website and calculate the totals from all the data we have.

In [9]:
# Method 2: ignore the site and keep one row per ref_domain; a flag is 1 when the source
# reported that referring domain for at least one site. The 'max' string alias replaces
# the builtin `max`, which newer pandas releases warn about when passed to agg; the
# result is the same.
data_one = data.groupby(['ref_domain']).agg(
    {'AHREFS': 'max', 'GOOGLE': 'max', 'MAJESTIC': 'max'})
data_one.head()

Unnamed: 0_level_0,AHREFS,GOOGLE,MAJESTIC
ref_domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,1,1,0
0-21.co.uk,1,0,0
0-3-0.com,1,0,0
000.nl,1,1,1
000a.biz,0,1,0


In [10]:
# Share of all distinct referring domains that each source reports.
data_one.mean().to_frame('mean').sort_index()

Unnamed: 0,mean
AHREFS,0.578351
GOOGLE,0.577634
MAJESTIC,0.533679


### Intersections for Venn

In [11]:
# Distinct (ref_domain, source) combinations, with the site column discarded.
venn = (
    data0
    .drop('domain', axis=1)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [12]:
# Number of distinct (ref_domain, source) combinations.
len(venn)

108441

In [13]:
# Distinct referring domains reported by each source, in alphabetical order of source.
venn.source.value_counts().sort_index()

AHREFS      37118
GOOGLE      37072
MAJESTIC    34251
Name: source, dtype: int64

In [14]:
# Save the (ref_domain, source) combinations. No index=False is passed, so the row index
# is written as the first, unnamed column of the file.
venn.to_csv('data/for_venn.csv')

In [15]:
# Size of every region of the three-set Venn diagram. Each entry is
# (message template, AHREFS flag, MAJESTIC flag, GOOGLE flag); a two-source label
# means "found by exactly those two sources", because the third flag must be 0.
# NOTE(review): `data` holds one row per (domain, ref_domain) pair, so these are pair
# counts rather than counts of distinct referring domains - confirm that is intended.
venn_regions = [
    ('unique to AHREFS: {}', 1, 0, 0),
    ('unique to MAJESTIC: {}', 0, 1, 0),
    ('unique to GOOGLE: {}', 0, 0, 1),
    ('AHREFS & MAJESTIC: {}', 1, 1, 0),
    ('AHREFS & GOOGLE: {}', 1, 0, 1),
    ('MAJESTIC & GOOGLE: {}', 0, 1, 1),
    ('AHREFS & MAJESTIC & GOOGLE: {}', 1, 1, 1),
]

for region_msg, has_ahrefs, has_majestic, has_google in venn_regions:
    region_rows = data[(data.AHREFS == has_ahrefs) &
                       (data.MAJESTIC == has_majestic) &
                       (data.GOOGLE == has_google)]
    print(region_msg.format(len(region_rows)))

unique to AHREFS: 16082
unique to MAJESTIC: 14227
unique to GOOGLE: 20675
AHREFS & MAJESTIC: 10004
AHREFS & GOOGLE: 8277
MAJESTIC & GOOGLE: 3982
AHREFS & MAJESTIC & GOOGLE: 13584


In [16]:
# Number of distinct referring domains reported by each source (sum of the 0/1 flags).
data_one.sum().sort_index()

AHREFS      37118
GOOGLE      37072
MAJESTIC    34251
dtype: int64

In [19]:
# (domain, ref_domain) pairs reported by GOOGLE but not by AHREFS; the MAJESTIC flag
# is not considered. Only the two identifying columns are kept.
ingoogle = data.loc[(data.AHREFS == 0) & (data.GOOGLE == 1), ['domain', 'ref_domain']]
ingoogle.head()

Unnamed: 0,domain,ref_domain
4,10edges.com,answers.com
8,10edges.com,blogspot.com.ee
9,10edges.com,blogspot.com.es
10,10edges.com,blogspot.fr
11,10edges.com,blogspot.in


In [20]:
# (domain, ref_domain) pairs reported by MAJESTIC but not by AHREFS; the GOOGLE flag
# is not considered. Only the two identifying columns are kept.
inmajestic = data.loc[(data.AHREFS == 0) & (data.MAJESTIC == 1), ['domain', 'ref_domain']]
inmajestic.head()

Unnamed: 0,domain,ref_domain
3,10edges.com,americanbookreview.org
6,10edges.com,bju.edu
17,10edges.com,carthage.edu
19,10edges.com,coolpot.com
23,10edges.com,dronestoriesetc.com


In [21]:
# Save both "not in AHREFS" selections; index=False keeps the row index out of the files.
ingoogle.to_csv('data/ingoogle.csv',index=False)
inmajestic.to_csv('data/inmajestic.csv',index=False)