# 02 WHOIS data

In [None]:
import pandas as pd
import numpy as np
from urllib.parse import urlparse
import json
import requests
from time import sleep
import random
import ast
import tldextract

In [None]:
# https://pypi.org/project/python-whois/
# https://pypi.org/project/whois21/
import whois21
import log21

# Other options for the future
# https://pypi.org/project/asyncwhois/
# https://pypi.org/project/domaintools-misp/
# https://pypi.org/project/sitesniffer/

## Functions

In [None]:
def set_link(data, lab_type):
    """Add a ``MAINURL`` column holding the main website address of each row.

    The column is inserted in place at position 1 and pre-filled with the
    string ``'None'``, which stays as the value for rows where no usable
    address is found.

    Parameters
    ----------
    data : pandas.DataFrame
        Table of labs. It is modified in place and also returned.
    lab_type : str
        When it is ``"hackerspaces"`` or ``"repaircafes"``, or when ``data``
        has no ``links`` column, the address is copied from the ``url``
        column. Otherwise it is picked from the ``links`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, now with the ``MAINURL`` column.

    Notes
    -----
    Each ``links`` cell is expected to be the text of a Python dict literal
    whose values are URLs. Every value that does not point at a social
    network, hosted page or directory listing is a candidate; when several
    candidates exist, the last one wins. Cells that are missing or blank are
    skipped instead of raising.
    """
    # Substrings marking addresses that are not the lab's own website.
    # Compared against the lower-cased link so that variants such as
    # "Facebook.com" or "FACEBOOK.COM" are excluded as well.
    excluded_hosts = (
        "facebook",
        "instagram",
        "twitter",
        "youtube",
        "vimeo",
        "tiktok",
        "fb.me",
        "fb.com",
        "meetup.com",
        "sites.google",
        "wordpress.com",
        "fablabs.io",
        "makerfaire.com",
        "hackerspaces.org",
        "repaircafe.org",
        "makerspaces.com",
    )

    data.insert(1, 'MAINURL', 'None')

    use_links = "links" in data.columns and lab_type not in ("hackerspaces", "repaircafes")
    if use_links:
        for i, row in data.iterrows():
            raw_links = row["links"]
            # Missing cells arrive as NaN (a float); literal_eval cannot parse
            # them, nor an empty string, so leave the placeholder in place.
            if not isinstance(raw_links, str) or not raw_links.strip():
                continue
            links_dict = ast.literal_eval(raw_links)
            for link in links_dict.values():
                if not isinstance(link, str) or not link:
                    continue
                lowered = link.lower()
                if not any(host in lowered for host in excluded_hosts):
                    # No break: a later acceptable link overrides an earlier one.
                    data.at[i, 'MAINURL'] = link
    else:
        for i, row in data.iterrows():
            # These sources carry a single address in the "url" column.
            url = row["url"]
            if not pd.isna(url) and len(url) > 0:
                data.at[i, 'MAINURL'] = url

    return data

In [None]:
def url_type(value):
    """Classify a URL by the number of labels in its host name.

    Parameters
    ----------
    value : str or missing
        The URL to classify. ``None``, NaN and the empty string count as
        missing.

    Returns
    -------
    str
        ``"No URL"`` for a missing value, ``"subdomain"`` when the host has
        more than two dot-separated labels and does not start with ``www``,
        and ``"domain"`` otherwise.

    Notes
    -----
    This is a label-count heuristic: a host under a multi-label suffix such
    as ``example.co.uk`` is reported as ``"subdomain"``, and a string without
    a scheme has an empty host and is reported as ``"domain"``.
    """
    # Test for missing values first: len() raises TypeError on None and NaN.
    if pd.isna(value) or len(value) == 0:
        return "No URL"

    host_labels = urlparse(value).netloc.split(".")
    if len(host_labels) > 2 and host_labels[0] != "www":
        return "subdomain"
    return "domain"

In [None]:
def domain_name(value):
    """Return the ``<domain>.<suffix>`` part of a URL, as split by tldextract.

    Any subdomain, scheme and path are dropped. When tldextract finds no
    suffix, the result ends with a trailing dot.
    """
    # Alternative kept from the original notebook:
    #new_value = tldextract.extract(value).top_domain_under_public_suffix
    parts = tldextract.extract(value)
    return ".".join((parts.domain, parts.suffix))

In [None]:
def whois_extract_text_whois21(value):
    """Query WHOIS for the ``<domain>.<suffix>`` of a URL using whois21.

    Parameters
    ----------
    value : str or None
        URL whose domain should be looked up.

    Returns
    -------
    list or None
        A one-element list holding the ``whois_data`` attribute of the
        ``whois21.WHOIS`` result, or ``None`` when ``value`` is ``None`` or
        the lookup raised an error.
    """
    # Optional throttling between queries, disabled in the original notebook:
    #sleep(random.randint(3, 6))
    if value is None:
        return None

    try:
        parts = tldextract.extract(value)
        lookup = whois21.WHOIS(parts.domain + "." + parts.suffix)
        return [lookup.whois_data]
    except Exception:
        # Best effort: a failed lookup yields no data for this row. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # and SystemExit stop a long batch of lookups.
        return None

In [None]:
def extract_whois21_date(value):
    """Extract the registration date from a whois21 result.

    Parameters
    ----------
    value : list or None
        The value produced by ``whois_extract_text_whois21``: a list whose
        first element is the WHOIS record, or ``None``.

    Returns
    -------
    pandas.Timestamp, pandas.NaT or None
        The parsed date with any timezone removed; ``NaT`` when the field is
        present but cannot be parsed; ``None`` when ``value`` is ``None``,
        when no known date field is present, or when reading the record
        fails.
    """
    # Registries label the registration date differently. Fields are tried in
    # this order and the first one present in the record is used.
    date_fields = (
        'CREATION DATE',
        'REGISTERED',
        'RECORD CREATED',
        'FIRST REGISTRATION DATE',
        'REGISTERED ON',
        'CREATED',
    )

    new_value = None
    if value is not None:
        try:
            record = value[0]
            for field in date_fields:
                if field in record:
                    new_value = pd.to_datetime(record[field], errors="coerce")
                    break
        except Exception:
            # Malformed records count as "no date". Catching Exception rather
            # than using a bare except lets KeyboardInterrupt propagate.
            new_value = None

    if new_value is not None:
        # Drop the timezone so that all dates are naive and comparable.
        new_value = new_value.tz_localize(None)

    return new_value

In [None]:
def get_whois_data(df):
    """Enrich a table of labs with WHOIS information for each ``MAINURL``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``MAINURL`` column. It is copied, not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with these extra columns: ``URL_type``,
        ``domain_name``, ``whois21_text`` (raw lookup result) and
        ``creation_date``. The columns ``whois``, ``whois_error``, ``whois21``
        and ``whois21_error`` are added as empty strings. The result is
        indexed by the extracted creation date, and the index is named
        ``time``.

    Notes
    -----
    One WHOIS lookup is made per row, and each row's index and URL are
    printed as progress output.
    """
    datag = df.copy()

    datag['URL_type'] = datag['MAINURL'].apply(url_type)
    datag['domain_name'] = datag['MAINURL'].apply(domain_name)

    # Columns that are never filled here but are part of the output table.
    for column in ('whois', 'whois_error', 'whois21', 'whois21_error'):
        datag[column] = ''

    # Look up row by row, not with apply(), so progress can be printed.
    lookups = []
    for index, url in datag['MAINURL'].items():
        print(index, url)
        lookups.append(whois_extract_text_whois21(url))
    # dtype=object keeps each cell as the list (or None) that was returned.
    datag['whois21_text'] = pd.Series(lookups, index=datag.index, dtype=object)

    datag['creation_date_whois21'] = datag['whois21_text'].apply(extract_whois21_date)

    # Copy the date for every row. This used to run once after the loop with
    # the leftover loop variable, so only the last row was filled and an empty
    # table raised NameError.
    datag['creation_date'] = datag['creation_date_whois21']

    #datagnn = datag[datag["creation_date"].notna()]
    datag = datag.set_index('creation_date_whois21')
    datag.index.name = 'time'

    return datag

## Data extraction from WHOIS

In [None]:
# Fab Labs: load the table, choose each lab's main URL (taken from the
# "links" column when the table has one), run the WHOIS lookups and save
# the enriched table.
data = pd.read_csv('data/fablabs.csv')
df = set_link(data, lab_type="fablab")
df2 = get_whois_data(df)
df2.to_csv("data/whois_fablabs.csv")

In [None]:
# Makerspaces: load the table, choose each lab's main URL (taken from the
# "links" column when the table has one), run the WHOIS lookups and save
# the enriched table.
data = pd.read_csv('data/makerspaces.csv')
df = set_link(data, lab_type="makerspaces")
df2 = get_whois_data(df)
df2.to_csv("data/whois_makerspaces.csv")

In [None]:
# Repair Cafes: load the table, copy the main URL from the "url" column
# (set_link does this for lab_type "repaircafes"), run the WHOIS lookups
# and save the enriched table.
data = pd.read_csv('data/repaircafes.csv')
df = set_link(data, lab_type="repaircafes")
df2 = get_whois_data(df)
df2.to_csv("data/whois_repaircafes.csv")

In [None]:
# Hackerspaces: load the table, copy the main URL from the "url" column
# (set_link does this for lab_type "hackerspaces"), run the WHOIS lookups
# and save the enriched table.
data = pd.read_csv('data/hackerspaces.csv')
df = set_link(data, lab_type="hackerspaces")
df2 = get_whois_data(df)
df2.to_csv("data/whois_hackerspaces.csv")