In [4]:
#from advertools import robotstxt_to_df, robotstxt_test
import os
from itertools import product

import pandas as pd
import requests
from protego import Protego
from tqdm import tqdm, tqdm_notebook
from warcio.archiveiterator import ArchiveIterator

In [2]:
def robotstxt_to_df_modified(robots_read):
    """Convert the text of a robots.txt file into a two-column DataFrame.

    Parameters
    ----------
    robots_read : str
        Full text of a robots.txt file.

    Returns
    -------
    pandas.DataFrame
        One row per recognised line, in file order, with columns
        ``directive`` and ``content``. Comment lines are stored with the
        directive ``'comment'``. Blank lines and lines without a ``:``
        separator are skipped.
    """
    rows = []
    for raw_line in robots_read.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue
        if stripped.startswith('#'):
            # Drop only the leading comment marker(s); a '#' inside the
            # comment text (e.g. "issue #12") is part of the content.
            rows.append(['comment', stripped.lstrip('#').strip()])
            continue
        directive, separator, content = stripped.partition(':')
        if not separator:
            # Not a "field: value" line (stray HTML, garbage, ...). Skipping
            # it avoids an IndexError that would discard the whole file.
            continue
        rows.append([directive.strip(), content.strip()])
    return pd.DataFrame(rows, columns=['directive', 'content'])

In [3]:
def robotstxt_test_modified(robots_text, robotstxt_url, user_agents, urls):
    """Test every (user-agent, URL) combination against a robots.txt body.

    Parameters
    ----------
    robots_text : str
        Content of the robots.txt file; parsed with ``Protego.parse``.
    robotstxt_url : str
        URL the file came from. Must end with ``/robots.txt``.
    user_agents : str or list of str
        One user-agent or several.
    urls : str or list of str
        One URL/path or several.

    Returns
    -------
    pandas.DataFrame
        Columns ``robotstxt_url``, ``user_agent``, ``url_path`` and
        ``can_fetch``, sorted by ``user_agent`` then ``url_path``. The frame
        is empty (but keeps these columns) when there is nothing to test.

    Raises
    ------
    ValueError
        If ``robotstxt_url`` does not end with ``/robots.txt``.
    """
    if not robotstxt_url.endswith('/robots.txt'):
        raise ValueError('Please make sure you enter a valid robots.txt URL')
    if isinstance(user_agents, str):
        user_agents = [user_agents]
    if isinstance(urls, str):
        urls = [urls]

    rp = Protego.parse(robots_text)
    records = [
        {
            'robotstxt_url': robotstxt_url,
            'user_agent': agent,
            'url_path': path,
            'can_fetch': rp.can_fetch(path, agent),
        }
        for path, agent in product(urls, user_agents)
    ]
    # Explicit columns keep the schema stable when `records` is empty;
    # without them the sort below fails with a KeyError.
    df = pd.DataFrame(
        records,
        columns=['robotstxt_url', 'user_agent', 'url_path', 'can_fetch'],
    )
    return df.sort_values(['user_agent', 'url_path']).reset_index(drop=True)

In [4]:
def get_bias_from_robotstxt_file(robots_text, robotstxt_url):
    '''
    Classify every named user-agent of a robots.txt file relative to ``*``.

    For each user-agent declared in the file, count how many of the paths
    that appear in ``Allow``/``Disallow`` lines it may fetch, and compare
    that count with the count obtained for the wildcard agent ``*``.

    input:
        robots_text   : content of the robots.txt file (str)
        robotstxt_url : URL the file was fetched from; must end with
                        '/robots.txt' (checked by robotstxt_test_modified)

    output: a tuple of 5 lists
        no_bias           : agents that can fetch as many paths as '*'
        fav               : agents that can fetch more paths than '*'
        disfav            : agents that can fetch fewer paths than '*'
        fav_count_diff    : for each agent in fav, (its count - count of '*')
        disfav_count_diff : for each agent in disfav, (its count - count of '*')

    '*' itself is never listed. Agents appear in sorted order. All five
    lists are empty when the file declares no user-agent or only '*'.

    Illustrative example: if Twitterbot may fetch 3 more of the listed paths
    than '*' and ia_archiver 2 fewer, the result is
        ([], ['Twitterbot'], ['ia_archiver'], [3], [-2])
    '''
    # robots.txt file to pd dataframe
    df = robotstxt_to_df_modified(robots_text)
    directives = df['directive'].str.lower()

    # get all user agents in a list
    user_agents = df.loc[directives == 'user-agent', 'content'].drop_duplicates().to_list()

    # initiate return lists
    no_bias, fav, disfav, fav_count_diff, disfav_count_diff = [], [], [], [], []

    # Nothing to compare when no agent, or only the wildcard, is declared
    if not user_agents or user_agents == ['*']:
        return no_bias, fav, disfav, fav_count_diff, disfav_count_diff

    # get all paths that appear in the robots.txt file
    urls = df.loc[directives.isin(['disallow', 'allow']), 'content'].drop_duplicates().to_list()

    # one row per (user-agent, URL) pair with a boolean 'can_fetch' column
    test = robotstxt_test_modified(robots_text, robotstxt_url, user_agents, urls)

    # number of fetchable paths per user agent (index is sorted by agent name)
    count_allow_by_user_agent = test.groupby('user_agent')['can_fetch'].sum()

    # Reference count: paths allowed for '*', or 0 when the file has no '*' group.
    # NOTE(review): with no '*' group an unlisted crawler is normally allowed
    # everything, so 0 may understate the baseline and make every listed agent
    # look 'favored' - confirm this is the intended reference.
    if '*' in count_allow_by_user_agent.index:
        ref = int(count_allow_by_user_agent['*'])
    else:
        ref = 0

    for agent, allowed_count in count_allow_by_user_agent.items():
        if agent == '*':
            continue
        # plain Python ints keep the lists' text form stable when written to CSV
        diff = int(allowed_count) - ref
        if diff == 0:
            no_bias.append(agent)
        elif diff > 0:
            fav.append(agent)
            fav_count_diff.append(diff)
        else:
            disfav.append(agent)
            disfav_count_diff.append(diff)

    # return the final output
    return no_bias, fav, disfav, fav_count_diff, disfav_count_diff

In [8]:
def append_to_csv(data, csv_file_path):
    """Append a batch of result rows to a CSV file (best effort).

    Parameters
    ----------
    data : list of list
        Rows of six values: robots.txt URL, no_bias, favored, disfavored,
        fav_count_diff, disfav_count_diff.
    csv_file_path : str
        File to append to. Rows are written without a header line and with
        the DataFrame index as the first column.

    Returns
    -------
    None
        Failures are reported on stdout and never raised, so one bad batch
        does not stop a long run.
    """
    try:
        if len(data) == 0:
            # Nothing to write; skip the misleading "Saving" message.
            return
        df = pd.DataFrame(data, columns=['robotstxt_url', 'no_bias', 'favored', 'disfavored', 'fav_count_diff', 'disfav_count_diff'])
        print('Saving data to: ' + csv_file_path)
        df.to_csv(csv_file_path, mode='a', header=False)
    except Exception as exc:
        # Catch Exception rather than everything so Ctrl-C still interrupts,
        # and report the problem instead of losing the batch silently.
        print('Failed to save data to: ' + str(csv_file_path) + ' (' + repr(exc) + ')')
    return

In [6]:
def process_robotstxt_file(file_path, batch_size=1024):
    """Extract user-agent bias from every robots.txt response in one archive.

    Iterates the records of the archive at ``file_path`` with
    ``ArchiveIterator``, keeps HTTP 200 responses, analyses each body with
    ``get_bias_from_robotstxt_file`` and appends the results, in batches, to
    ``output.csv`` in the same directory as the archive.

    Parameters
    ----------
    file_path : str
        Path of the archive file to read.
    batch_size : int, default 1024
        Number of result rows to buffer before they are appended to the CSV.

    Returns
    -------
    None
    """
    data = []
    skipped = 0
    # os.path keeps the output next to the archive even when file_path has no
    # directory component (plain string splitting produced '/output.csv').
    csv_output_path = os.path.join(os.path.dirname(file_path), 'output.csv')
    with open(file_path, 'rb') as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type != 'response' or record.http_headers.get_statuscode() != '200':
                continue
            txt = record.content_stream().read().decode('ISO-8859-1').replace('¡', ' ')
            robotstxt_url = record.rec_headers.get_header('WARC-Target-URI')
            try:
                no_bias, fav, disfav, fav_count_diff, disfav_count_diff = get_bias_from_robotstxt_file(txt, robotstxt_url)
            except Exception:
                # Best effort: one unparsable robots.txt must not abort the
                # whole archive. Exception (not a bare except) lets Ctrl-C through.
                skipped += 1
                continue
            data.append([robotstxt_url, no_bias, fav, disfav, fav_count_diff, disfav_count_diff])
            # Flush inside the loop so at most batch_size rows are buffered.
            if len(data) >= batch_size:
                append_to_csv(data, csv_output_path)
                data = []

    # Write whatever is left over after the last full batch.
    if data:
        append_to_csv(data, csv_output_path)
    if skipped:
        print('Skipped %d robots.txt record(s) that could not be analysed in: %s' % (skipped, file_path))
    return

In [2]:
import glob

# Root directory that is searched for archives.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
my_path = '/home/osama/CC-MAIN-2021-43-robotstxt/1634323583423.96'

# Every '.gz' file at any depth below the root ('**' matches nested folders).
gz_pattern = my_path + '/**/*.gz'
files = list(glob.iglob(gz_pattern, recursive=True))

In [None]:
# Analyse each archive found above, one at a time, showing a notebook progress bar.
archive_progress = tqdm_notebook(files)
for file in archive_progress:
    process_robotstxt_file(file)