In [64]:
import socket
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
import urllib.request
from multiprocessing.dummy import Pool

In [22]:
# Load the raw host list. The file has no header row, so the single
# column is given the name 'host' explicitly.
df = pd.read_csv(
    'host.csv',
    names=['host'],
    header=None,
)

In [23]:
# Peek at the first three rows.
df.head(3)

Unnamed: 0,host
0,api.youla.io
1,favicon.yandex.net
2,w-74721.fp.kaspersky-labs.com


In [24]:
# (total rows, number of distinct hosts) -- shows how many duplicates there are.
(df.shape[0], df['host'].unique().size)

(1000000, 199944)

In [25]:
# Keep only the first occurrence of every row; the original index labels are kept.
df = df.drop_duplicates()

In [32]:
# (total rows, number of distinct hosts) -- both numbers should now agree.
(df.shape[0], df['host'].unique().size)

(199944, 199944)

In [33]:
def test_tcp(host, port):
    s = socket.socket()
    s.settimeout(1)
    try:
        s.connect((host, port))
        return 1
    except:
        s.close()
        return 0
    finally:
        s.close()

In [34]:
def get_data(url, timeout=2, max_chars=50000):
    """Download ``url`` and return the beginning of its body as text.

    The body is decoded with the charset declared in the response's
    Content-Type header; when none is declared (or the declared name is
    unknown to Python) UTF-8 is used, which is what was always assumed
    before.

    Args:
        url: URL to fetch.
        timeout: Timeout in seconds passed to ``urlopen`` (default 2).
        max_chars: Maximum number of characters to return (default 50000).

    Returns:
        Up to ``max_chars`` characters of the decoded body, or ``''`` if the
        request or the decoding fails for any reason.
    """
    try:
        with urllib.request.urlopen(url, timeout=timeout) as r:
            body = r.read()
            charset = r.headers.get_content_charset() or 'utf-8'
        try:
            text = body.decode(charset)
        except LookupError:
            # Unknown charset label in the header: fall back to UTF-8.
            text = body.decode('utf-8')
        return text[:max_chars]
    except Exception:
        # Best-effort fetch: network, HTTP, TLS and decoding errors all map
        # to an empty result. KeyboardInterrupt / SystemExit still propagate.
        return ''

In [35]:
def _test_tcp(host):
    """Probe ``host`` on port 443 and, only if that fails, on port 80.

    Returns the first truthy probe result, otherwise the result of the
    port-80 probe.
    """
    https_result = test_tcp(host, 443)
    if https_result:
        return https_result
    return test_tcp(host, 80)

In [36]:
def _get_data(url, tcp):
    return get_data(url) if tcp else ''

In [37]:
# Plain Python lists are handed to the worker pool below.
hosts = df['host'].to_list()
# NOTE(review): every URL uses https, although _test_tcp also accepts hosts
# that only answer on port 80 -- confirm that this is intended.
urls = ['https://' + hostname for hostname in hosts]

In [None]:
# Probe all hosts concurrently with 20 worker threads. The pool is used as a
# context manager so its threads are released once every result has been
# collected (previously the pool was never closed).
with Pool(20) as pool:
    futures = [pool.apply_async(_test_tcp, [host]) for host in hosts]
    # Results are gathered in submission order, so they line up with df's rows.
    df['tcp'] = [future.get() for future in tqdm(futures)]

In [43]:
# Per-host reachability flags as a plain list, aligned with `urls`.
tcps = df['tcp'].to_list()

In [None]:
# Download the page of every reachable host with 20 worker threads. The pool
# is used as a context manager so its threads are released once every result
# has been collected (previously the pool was never closed).
with Pool(20) as pool:
    futures = [pool.apply_async(_get_data, [url, tcp]) for url, tcp in zip(urls, tcps)]
    # Results are gathered in submission order, so they line up with df's rows.
    df['text'] = [future.get() for future in tqdm(futures)]

In [50]:
# Load the top-1m list: no header row, the first column becomes the index
# and the remaining column is named 'host'.
df1m = pd.read_csv(
    'top-1m.csv',
    names=['host'],
    index_col=0,
    header=None,
)

In [51]:
# Peek at the first three rows.
df1m.head(3)

Unnamed: 0,host
1,google.com
2,youtube.com
3,facebook.com


In [53]:
# Mark every host of the top-1m list with 1, left-join that flag onto the
# scanned hosts, and turn the missing flags of unmatched hosts into 0.
df1m['1m'] = 1
df_merged = (
    df.merge(df1m, how='left', on='host')
      .assign(**{'1m': lambda merged: merged['1m'].fillna(0).astype(int)})
)

In [56]:
# Peek at the first five merged rows.
df_merged.head(5)

In [None]:
# Persist the merged frame; the index is not written to the file.
df_merged.to_csv(path_or_buf='50k.csv', index=False)

In [71]:
# Reload the saved scan results. Empty page texts come back from CSV as NaN,
# so they are replaced with '' again.
df = pd.read_csv('50k.csv')
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated and does not
# update `df` when pandas Copy-on-Write is active.
df['text'] = df['text'].fillna('')

In [73]:
# Peek at the first twenty rows.
df.head(20)

Unnamed: 0,host,tcp,1m,text
0,api.youla.io,1,0,"<!doctype html>\n<html lang=""en"">\n<head>\n ..."
1,favicon.yandex.net,1,0,
2,w-74721.fp.kaspersky-labs.com,1,0,
3,questtime.net,1,1,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""ru-ru"" ..."
4,passport-authproxy.taxi.yandex.net,1,0,
5,m.aliexpress.com,1,0,
6,www.gismeteo.ru,1,0,
7,r8---sn-jvhnu5g-n8me.googlevideo.com,1,0,
8,sonar-gmp1-1.xx.fbcdn.net,1,0,
9,4wll4l5gec.execute-api.us-west-2.amazonaws.com,1,0,


In [79]:
# Heuristic label. A host gets target = 1 when either
#   * its downloaded text is longer than 10000 characters and starts like
#     neither a JSON object nor an XML document, or
#   * it answered the TCP probe and is part of the top-1m list.
page_text = df['text']
is_long = page_text.str.len() > 10000
# str.match anchors at the start of the string; na=False keeps missing
# texts from producing NaN, which would break the `~` negation below.
looks_like_json = page_text.str.match(r'{[\s\S]*}', na=False)
# The '?' must be escaped: unescaped it makes '<' optional, so the pattern
# never matched a real '<?xml ...' declaration.
looks_like_xml = page_text.str.match(r'<\?xml[\s\S]', na=False)
reachable_top_site = (df['tcp'] & df['1m']).astype(bool)

df['target'] = (
    (is_long & ~looks_like_json & ~looks_like_xml) | reachable_top_site
).astype(int)

In [80]:
# Peek at the first twenty rows, now including the target column.
df.head(20)

Unnamed: 0,host,tcp,1m,text,target
0,api.youla.io,1,0,"<!doctype html>\n<html lang=""en"">\n<head>\n ...",1
1,favicon.yandex.net,1,0,,0
2,w-74721.fp.kaspersky-labs.com,1,0,,0
3,questtime.net,1,1,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""ru-ru"" ...",1
4,passport-authproxy.taxi.yandex.net,1,0,,0
5,m.aliexpress.com,1,0,,0
6,www.gismeteo.ru,1,0,,0
7,r8---sn-jvhnu5g-n8me.googlevideo.com,1,0,,0
8,sonar-gmp1-1.xx.fbcdn.net,1,0,,0
9,4wll4l5gec.execute-api.us-west-2.amazonaws.com,1,0,,0
