## Import Tweets from JSON

Import the Tweets that we captured using the RTweet Library.
Note: This could also be done directly in Python using a library such as Tweepy.

In [1]:
import json
from pprint import pprint

# Read the captured tweets (a JSON export) into memory as `data`.
with open('tweets.json') as tweets_file:
    data = json.loads(tweets_file.read())

## Create Histogram of Common Domains in Tweets

In [2]:
import re
from collections import defaultdict

import matplotlib
import matplotlib.pyplot as plt
import numpy as np


# Tally how often each host name appears as the first expanded URL of a tweet.
# `domains` maps host -> count and is reused by later cells.
domains = defaultdict(int)

# Anchor on the scheme and stop at the first '/', '?' or '#'.  The previous
# pattern (r'http.*\/\/(.*?)\/') had two problems: its greedy '.*' could skip
# past the real host to a later '//' embedded in the path or query string, and
# its mandatory trailing '/' silently dropped host-only URLs such as
# 'http://example.com'.
host_pattern = re.compile(r'https?://([^/?#]+)')

for d in data:
    expanded_urls = d.get("urls_expanded_url") or []
    if not expanded_urls:
        # The tweet carries no URL list at all, so there is nothing to count.
        continue

    # Only the first URL of each tweet is considered.  A null entry becomes
    # the string 'None', which simply fails to match below.
    found = host_pattern.match(str(expanded_urls[0]))
    if found:
        domains[found.group(1)] += 1

# Host names ordered from most to least frequent, plus two parallel lists
# (names and counts) in that same order for plotting.
# To inspect the long tail, iterate over `sdomains` and print domains[name].
sdomains = sorted(domains, key=domains.get, reverse=True)
sorted_domains = list(sdomains)
sorted_counts = [domains[name] for name in sdomains]

# Bar chart of the domains that show up most often in tweeted URLs.
# Only the ten most frequent are drawn; `domains` still holds every count.
x, y = sorted_domains[:10], sorted_counts[:10]

plt.bar(x, y)
plt.xticks(x, rotation='vertical')
plt.show()

<Figure size 640x480 with 1 Axes>

## Explore Characteristics of Each Domain Name

First we'll extract the following characteristics of the domain names in each Tweet:
(1) The IP addresses that each domain resolves to
(2) The DNS nameservers that are authoritative for each domain

In [3]:
# Lookup results, both keyed by domain name and filled in by the cells below.
ips = {}          # domain -> list of IP addresses
nameservers = {}  # domain -> list of nameserver names

### A Record Lookup

In [4]:
# requires: dnspython library

import dns.resolver

# Look up the A records (IP addresses) of every domain seen in the tweets.
for qname in sdomains:
    # Start from an empty list so a failed lookup leaves no addresses behind.
    ips[qname] = list()

    try:
        answers = dns.resolver.query(qname, 'A')
    except (dns.resolver.NXDOMAIN, dns.resolver.NoAnswer):
        # NXDOMAIN: the name does not exist.  NoAnswer: the response held no
        # A records.  Skip this domain; falling through would either raise
        # NameError (first iteration) or reuse the previous domain's
        # `answers` and record its addresses under this name.
        continue

    for rdata in answers:
        ips[qname].append(rdata.address)

# Spot check: first address recorded for twitter.com.
print(ips['twitter.com'][0])

104.244.42.65


In [5]:
# Full mapping of each DNS domain name to the IP addresses hosting its website.
print(ips)

{'www.wayamo.com': [u'129.121.176.228'], 'www.loyolaandnews.es': [u'91.192.110.199'], 'simplecast.com': [u'52.200.78.105', u'52.72.25.31', u'18.232.28.162', u'34.193.215.247', u'54.87.115.109', u'35.173.151.154'], 'nnd.ng': [u'54.213.66.64'], 'www.dispatchlive.co.za': [u'216.58.223.83'], 'www.thepresidency.gov.za': [u'41.0.5.139'], 'www.sabreakingnews.co.za': [u'104.199.94.19'], 'southerncourier.co.za': [u'104.24.11.33', u'104.24.10.33'], 'actionkingbest.blogspot.com': [u'216.58.223.97'], 'ift.tt': [u'67.199.248.13', u'67.199.248.12'], 'readinglist.click': [u'34.251.166.161'], 'theconversation.com': [u'151.101.2.110', u'151.101.194.110', u'151.101.66.110', u'151.101.130.110'], 'trib.al': [u'54.174.224.48', u'18.232.216.80', u'54.84.189.246'], 'blogs.worldbank.org': [u'104.17.187.11', u'104.17.188.11'], 'www.scmp.com': [u'47.89.56.101', u'103.206.40.111'], 'www.regularnews.net': [u'160.153.16.34'], 'www.flickr.com': [u'87.248.114.12', u'77.238.180.12', u'77.238.180.11', u'87.248.114.11'

### NS Record Lookup

In [8]:
# Look up the NS records of every domain seen in the tweets.
for qname in sdomains:
    # Start from an empty list so a failed lookup leaves no nameservers behind.
    nameservers[qname] = list()

    try:
        # With raise_on_no_answer=False an empty response comes back as an
        # answer whose rrset is None (checked below) instead of an exception.
        answer = dns.resolver.query(qname, 'NS', raise_on_no_answer=False)
    except (dns.resolver.NXDOMAIN, dns.resolver.NoAnswer):
        # Skip this domain; falling through would either raise NameError
        # (first iteration) or reuse the previous domain's `answer` and
        # record its nameservers under this name.
        continue

    if answer.rrset is not None:
        for ns in answer.rrset.items:
            # Names are stored upper-cased.
            fqdn = ns.to_text().upper()
            nameservers[qname].append(fqdn)

In [9]:
# Full mapping of each DNS domain name to its DNS nameservers.
print(nameservers)

{'www.wayamo.com': ['NS1.ASMALLORANGE.COM.', 'NS2.ASMALLORANGE.COM.'], 'www.loyolaandnews.es': ['DNS3.HISPALISDNS.COM.', 'DNS2.HISPALISDNS.COM.', 'DNS1.HISPALISDNS.COM.'], 'simplecast.com': ['NS-1055.AWSDNS-03.ORG.', 'NS-435.AWSDNS-54.COM.', 'NS-1649.AWSDNS-14.CO.UK.', 'NS-714.AWSDNS-25.NET.'], 'nnd.ng': ['ASA.NS.CLOUDFLARE.COM.', 'AMIT.NS.CLOUDFLARE.COM.'], 'www.dispatchlive.co.za': [], 'www.thepresidency.gov.za': [], 'www.sabreakingnews.co.za': [], 'southerncourier.co.za': ['LISA.NS.CLOUDFLARE.COM.', 'THOMAS.NS.CLOUDFLARE.COM.'], 'actionkingbest.blogspot.com': [], 'ift.tt': ['NS-60.AWSDNS-07.COM.', 'NS-1876.AWSDNS-42.CO.UK.', 'NS-1195.AWSDNS-21.ORG.', 'NS-901.AWSDNS-48.NET.'], 'readinglist.click': ['NS06.DOMAINCONTROL.COM.', 'NS05.DOMAINCONTROL.COM.'], 'theconversation.com': ['NS1.DNSUNLIMITED.COM.', 'NS4.DNSIMPLE.COM.', 'NS1.DNSIMPLE.COM.', 'NS2.DNSIMPLE.COM.', 'NS5.DNSUNLIMITED.COM.', 'NS3.DNSIMPLE.COM.'], 'trib.al': ['NS-637.AWSDNS-15.NET.', 'NS-1190.AWSDNS-20.ORG.', 'NS-466.AWSDN

### Map Each IP Address to Its Corresponding ISP (AS Number) and Country

In [19]:
import pyasn

import os

# Location of the pyasn IP-to-ASN database file.  Defaults to the original
# path but can be overridden with the IPASN_DB environment variable, so the
# notebook is not tied to one user's home directory.
IPASN_DB_PATH = os.environ.get('IPASN_DB', '/Users/feamster/anaconda2/ipasn.dat')

asndb = pyasn.pyasn(IPASN_DB_PATH)
# Sanity check: display the lookup result for a well-known address.
asndb.lookup('8.8.8.8')

(15169, '8.8.8.0/24')

In [38]:
from geoip import geolite2

# Sanity check: geolocate one fixed address and show its country.
match = geolite2.lookup('17.0.0.1')
print(match.country)

US


In [49]:
#for qname in ips.keys():
#   print qname,domains[qname], ips[qname][0]     

for qname in ips.keys():
    for i in range(len(ips[qname])):
        ip_match = geolite2.lookup(ips[qname][i])
        if ip_match is None:
            continue
            
        ip_country = ip_match.country
        print qname, domains[qname], asndb.lookup(ips[qname][i]), ip_country


www.wayamo.com 1 (36444, '129.121.176.0/20') US
www.loyolaandnews.es 1 (39020, '91.192.110.0/24') ES
simplecast.com 1 (14618, '52.200.0.0/13') US
simplecast.com 1 (14618, '52.72.0.0/15') US
simplecast.com 1 (14618, '18.232.0.0/14') US
simplecast.com 1 (14618, '34.192.0.0/12') US
simplecast.com 1 (14618, '54.87.0.0/16') US
simplecast.com 1 (14618, '35.168.0.0/13') US
nnd.ng 3 (16509, '54.213.0.0/16') US
www.dispatchlive.co.za 1 (15169, '216.58.223.0/24') US
www.thepresidency.gov.za 7 (36994, '41.0.5.0/24') ZA
www.sabreakingnews.co.za 1 (15169, '104.199.64.0/19') US
southerncourier.co.za 2 (13335, '104.24.0.0/20') US
southerncourier.co.za 2 (13335, '104.24.0.0/20') US
actionkingbest.blogspot.com 1 (15169, '216.58.223.0/24') US
ift.tt 26 (395224, '67.199.248.0/24') US
ift.tt 26 (395224, '67.199.248.0/24') US
readinglist.click 4 (16509, '34.248.0.0/13') US
theconversation.com 9 (54113, '151.101.0.0/22') US
theconversation.com 9 (54113, '151.101.192.0/22') US
theconversation.com 9 (54113, '

za.trendwiki.co 5 (63949, '178.79.128.0/18') GB
buff.ly 52 (395224, '67.199.248.0/24') US
buff.ly 52 (395224, '67.199.248.0/24') US
www.dailysun.co.za 1 (10474, '41.86.96.0/19') ZA
www.moneyweb.co.za 1 (13335, '104.25.192.0/20') US
www.moneyweb.co.za 1 (13335, '104.25.192.0/20') US
www.kooderadio.com 1 (20013, '50.116.80.0/20') US
thecolonialchronicle.com 2 (26496, '107.180.0.0/18') US
bit.ly 174 (395224, '67.199.248.0/24') US
bit.ly 174 (395224, '67.199.248.0/24') US
www.salabournews.co.za 2 (36943, '41.185.0.0/16') ZA
www.africantimesnews.co.za 1 (49453, '134.19.190.0/23') NL
news365.co.za 3 (13335, '104.27.160.0/20') US
news365.co.za 3 (13335, '104.27.160.0/20') US
www.thedailyvox.co.za 1 (3741, '197.96.0.0/13') ZA
www.timeslive.co.za 33 (15169, '216.58.223.0/24') US
zimsinsa.com 1 (36943, '41.185.0.0/16') ZA
albertonrecord.co.za 2 (13335, '104.24.0.0/20') US
albertonrecord.co.za 2 (13335, '104.24.0.0/20') US
lnkd.in 2 (14413, '108.174.10.0/24') US
www.businesslive.co.za 37 (15169, 

### Map Each DNS Nameserver to Its Corresponding IP Address and Country