In [None]:
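# The mighty cleaner
# Reads the per-ID segment CSVs from 'Segmented By ID', removes repeated
# segments and over-frequent runs of segments, drops pages that are largely
# contained in a later page, filters out single-word or symbol-heavy segments,
# and writes the result to 'Cleaned By ID'.
# TODO: document this properly some day.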

In [None]:
import os
import ast
import time
import hashlib
import pandas as pd

In [None]:
# Minimum number of segments a contiguous run must have to be counted by
# fill_contigous_subsequences; also the initial scan window (and the minimum
# list length that gets processed) in clean_duplicate_segments.
MIN_SUBS_LEN = 2
# sequence_is_allowed requires a run's count to be strictly below this
# fraction of the total number of URLs.
MAX_ALLOWED_RATE = 0.3
# sequence_is_allowed requires a run to occur at most this many times.
MAX_ALLOWED_OCCURRENCES = 5


def generate_id():
    """Yield the decimal strings '1', '2', '3', ... without end."""
    counter = 0
    while True:
        counter += 1
        yield str(counter)
# Shared module-level generator: each next(ID) hands out the next id string.
ID = generate_id()


def sequence_is_allowed(seq_count, total_urls):
    """Tell whether a run that was seen `seq_count` times may be kept.

    A run is allowed when it occurs at most MAX_ALLOWED_OCCURRENCES times
    and strictly fewer than MAX_ALLOWED_RATE * total_urls times.
    """
    within_count_cap = seq_count <= MAX_ALLOWED_OCCURRENCES
    if not within_count_cap:
        return within_count_cap
    return seq_count < MAX_ALLOWED_RATE * total_urls


def fill_segments_hashes(structure, segments_hash_values):
    """Record md5-hex-digest -> segment text for every segment in `structure`.

    `structure` is iterated as a list of segment lists. The mapping
    `segments_hash_values` is updated in place; nothing is returned.
    """
    for segments in structure:
        segments_hash_values.update(
            (hashlib.md5(text.encode('utf-8')).hexdigest(), text)
            for text in segments
        )


def fill_segments_two_way_map(structure, two_way_map):
    """Give every distinct segment an id and store both lookup directions.

    After the call, `two_way_map[segment] == seg_id` and
    `two_way_map[seg_id] == segment` for each segment found in `structure`
    (iterated as a list of segment lists). Ids are drawn from the
    module-level ID generator. The map is updated in place and keys that are
    already present are left untouched.

    Segments and ids live in the same dict, so an id must never equal a
    segment text. Otherwise, e.g. with segments "2", "1", "x" and a fresh
    generator, the id "2" handed to "x" overwrites the entry of segment "2"
    and two segments end up encoded as the same id. Ids that equal a segment
    text (or a key already in the map) are therefore skipped.
    """
    # Every key a freshly drawn id must not collide with.
    reserved = set(two_way_map)
    for segments_list in structure:
        reserved.update(segments_list)

    for segments_list in structure:
        for seg in segments_list:
            if seg in two_way_map:
                # Already mapped (earlier in this call or before it).
                continue
            seg_id = next(ID)
            while seg_id in reserved:
                seg_id = next(ID)
            two_way_map[seg] = seg_id
            two_way_map[seg_id] = seg


def get_hashed_structure(structure):
    """Return a new list of lists with each segment replaced by its md5 hex digest."""
    hashed = []
    for segment_list in structure:
        digests = []
        for seg in segment_list:
            digests.append(hashlib.md5(seg.encode('utf-8')).hexdigest())
        hashed.append(digests)
    return hashed


def get_original_struct_from_hashed(hashed_structure, segments_hash_values):
    """Look every hashed segment up in `segments_hash_values`, keeping the nesting.

    Raises KeyError when a hash is not present in the mapping.
    """
    restored = []
    for hashed_list in hashed_structure:
        originals = []
        for digest in hashed_list:
            originals.append(segments_hash_values[digest])
        restored.append(originals)
    return restored


def swap_structure_state(encoded_structure, two_way_map):
    """Replace every item by its counterpart in `two_way_map`, keeping the nesting.

    Because the map holds both directions, the same call turns segments into
    ids and ids back into segments.
    """
    swapped = []
    for segment_list in encoded_structure:
        swapped.append([two_way_map[item] for item in segment_list])
    return swapped


def get_contigous_subsequences(segment_list):
    """Return every non-empty contiguous slice of `segment_list`.

    Slices are ordered by start index, then by increasing length.
    """
    total = len(segment_list)
    slices = []
    for start in range(total):
        for stop in range(start + 1, total + 1):
            slices.append(segment_list[start:stop])
    return slices


def fill_contigous_subsequences(contigous_sequences, structure):
    """Count how often each contiguous run of segments occurs in `structure`.

    Every run of at least MIN_SUBS_LEN segments is turned into a token by
    joining its segments with ':' and the token's counter in
    `contigous_sequences` is incremented. The dict is updated in place.
    """
    for segments_list in structure:
        long_enough_runs = (
            run for run in get_contigous_subsequences(segments_list)
            if len(run) >= MIN_SUBS_LEN
        )
        for run in long_enough_runs:
            token = ':'.join(run)
            if token in contigous_sequences:
                contigous_sequences[token] += 1
            else:
                contigous_sequences[token] = 1


def clean_duplicate_segments(list_of_segments, contigous_sequences, total_urls):
    """Return `list_of_segments` without the runs that occur too often.

    The list is scanned with a window of MIN_SUBS_LEN segments. A window is
    judged by looking up its ':'-joined token in `contigous_sequences`
    (missing tokens count as 0) and passing the count to sequence_is_allowed.
    When the window is not allowed it is extended one segment at a time for
    as long as the extended token is still not allowed (or until the list
    ends); the resulting span is dropped and scanning resumes right after it.

    A list shorter than MIN_SUBS_LEN is returned as is (same object);
    otherwise a new list is returned.
    """
    size = len(list_of_segments)
    if size < MIN_SUBS_LEN:
        return list_of_segments

    def is_allowed(token):
        return sequence_is_allowed(contigous_sequences.get(token, 0), total_urls)

    # Inclusive (first, last) index pairs, collected in increasing order.
    spans_to_drop = []
    begin = 0
    end = begin + MIN_SUBS_LEN - 1

    while begin < size and end < size:
        token = ':'.join(list_of_segments[begin:end + 1])
        if is_allowed(token):
            # Rare enough: slide the window one segment forward.
            begin += 1
            end = begin + MIN_SUBS_LEN - 1
            continue

        # Too frequent: grow the span until one more segment would make the
        # run allowed, or until the last segment has been swallowed.
        while end < size - 1:
            token += ':' + list_of_segments[end + 1]
            if is_allowed(token):
                break
            end += 1
        spans_to_drop.append((begin, end))
        begin = end + 1
        end = begin + MIN_SUBS_LEN - 1

    # Stitch together everything that lies between the dropped spans.
    kept = []
    cursor = 0
    for first, last in spans_to_drop:
        kept.extend(list_of_segments[cursor:first])
        cursor = last + 1
    kept.extend(list_of_segments[cursor:])
    return kept
            

def clean_website_duplicate_sequences(structure):
    """Remove over-frequent runs of segments from every list in `structure`.

    Steps: encode each segment as an id (fill_segments_two_way_map +
    swap_structure_state), count all contiguous runs over the whole structure
    (fill_contigous_subsequences), clean each list with
    clean_duplicate_segments using len(structure) as the URL total, then
    decode the ids back to the segment texts.

    The md5-based helpers (fill_segments_hashes, get_hashed_structure,
    get_original_struct_from_hashed) are an alternative encoding that is not
    used here.
    """
    url_total = len(structure)

    id_map = {}
    fill_segments_two_way_map(structure, id_map)
    encoded_pages = swap_structure_state(structure, id_map)

    run_counts = {}
    fill_contigous_subsequences(run_counts, encoded_pages)

    cleaned_encoded_pages = []
    for encoded_segments in encoded_pages:
        cleaned_encoded_pages.append(
            clean_duplicate_segments(encoded_segments, run_counts, url_total)
        )

    return swap_structure_state(cleaned_encoded_pages, id_map)

In [None]:
def remove_similar(arrays, urls, thresh = 0.95):
    """Drop every array that is largely contained in a later array.

    For the array at position i and a later array at position j the
    similarity is len(set(i) & set(j)) / len(set(i)), i.e. the share of i's
    distinct strings that also appear in j (0 when i is empty). Array i and
    its URL are dropped as soon as one later array reaches `thresh`; only the
    earlier array of such a pair is dropped, the later one is judged on its
    own against the arrays after it.

    Parameters:
        arrays: sequence of lists of hashable items (one list per page).
        urls: sequence parallel to `arrays`.
        thresh: minimum similarity at which an array is dropped.

    Returns:
        (kept_arrays, kept_urls) in their original order.
    """
    # Build each set once instead of once per compared pair.
    string_sets = [set(array) for array in arrays]
    total = len(string_sets)

    non_duplicate_arrays = []
    non_duplicate_urls = []
    for i, (array1, url1) in enumerate(zip(arrays, urls)):
        unique_strings1 = string_sets[i]
        size1 = len(unique_strings1)
        covered_by_later = any(
            (len(unique_strings1 & string_sets[j]) / size1 if size1 else 0) >= thresh
            for j in range(i + 1, total)
        )
        if not covered_by_later:
            non_duplicate_arrays.append(array1)
            non_duplicate_urls.append(url1)
    return non_duplicate_arrays, non_duplicate_urls


def remove_duplicates(array_of_arrays):
    """Lower-case every element and drop repeats within each inner array.

    Comparison is case-insensitive because elements are lower-cased first;
    the first occurrence wins and order is preserved. Returns a new list of
    lists; the input is not modified.
    """
    result = []
    for inner_array in array_of_arrays:
        seen = set()  # O(1) membership instead of scanning the output list
        unique_elements = []
        for element in inner_array:
            lowered = element.lower()
            if lowered not in seen:
                seen.add(lowered)
                unique_elements.append(lowered)
        result.append(unique_elements)
    return result

def clean_lists(lists, thresh = 3):
    """Remove from every inner list the strings that occur too often overall.

    Occurrences are counted across all inner lists together. A string is
    removed everywhere once a repeated occurrence brings its count to
    `thresh` or more (a first occurrence on its own never triggers removal).
    Returns a new list of lists.
    """
    occurrences = {}
    too_frequent = set()
    for inner in lists:
        for text in inner:
            seen_before = text in occurrences
            occurrences[text] = occurrences.get(text, 0) + 1
            if seen_before and occurrences[text] >= thresh:
                too_frequent.add(text)

    return [[text for text in inner if text not in too_frequent] for inner in lists]

def remove_words(array_of_arrays):
    """Split every inner array into kept and removed elements.

    An element is kept when it has more than one whitespace-separated word
    and its non_letter_ratio is below 0.4; everything else is removed.

    Returns (result, removed): two lists parallel to `array_of_arrays`
    holding the kept and the removed elements of each inner array.
    """
    result, removed = [], []
    for inner_array in array_of_arrays:
        kept, dropped = [], []
        for element in inner_array:
            is_multiword = len(element.split()) > 1
            # non_letter_ratio is only evaluated for multi-word elements.
            if is_multiword and non_letter_ratio(element) < 0.4:
                kept.append(element)
            else:
                dropped.append(element)
        result.append(kept)
        removed.append(dropped)
    return result, removed
    
def non_letter_ratio(string):
    """Return the share of characters that are digits or listed symbols.

    Whitespace is removed before counting. A character counts when it is in
    `special_characters` or when str.isdigit() is true for it.

    Returns 0.0 for a string without any non-whitespace character (this used
    to raise ZeroDivisionError).
    """
    text = ''.join(string.split())
    if not text:
        return 0.0
    special_characters = '"\'!@#$%^&*()-+?_=,<>/"\\|'
    flagged = sum(1 for char in text if char in special_characters or char.isdigit())
    return flagged / len(text)

In [None]:
# Single cleaner: the two former variants (with / without remove_similar) are
# selected with `drop_similar_pages`; the default matches the definition that
# used to win (the one calling remove_similar).
def cleaner(input_df, drop_similar_pages=True):
    """Return a copy of `input_df` with a 'Cleaned' column of cleaned segments.

    Pipeline: remove_duplicates -> clean_website_duplicate_sequences ->
    remove_similar (optional) -> remove_words.

    Parameters:
        input_df: DataFrame with a 'Segments' column (one list of strings per
            row) and a 'Page' column; 'ID' is also required when
            `drop_similar_pages` is true. The frame is not modified.
        drop_similar_pages: when true (default), pages dropped by
            remove_similar keep 'Cleaned' = None, results are matched to rows
            through their 'Page' value, and only the columns
            ID, Page, Segments, Cleaned are returned. When false,
            remove_similar is skipped, results are written row by row in
            order, and all original columns are kept (with a fresh 0..n-1
            index).

    Returns:
        A new DataFrame carrying the 'Cleaned' column.
    """
    # Work on a copy so the caller's frame does not silently gain a column.
    working_df = input_df.copy()
    working_df['Cleaned'] = None

    segments = working_df['Segments'].values.tolist()
    deduped = remove_duplicates(segments)
    cleaned = clean_website_duplicate_sequences(deduped)

    if not drop_similar_pages:
        results, _removed = remove_words(cleaned)
        working_df = working_df.reset_index(drop=True)
        for position, kept_segments in enumerate(results):
            working_df.at[position, 'Cleaned'] = kept_segments
        return working_df

    pages_list = working_df['Page'].values.tolist()
    kept_texts, kept_urls = remove_similar(cleaned, pages_list)
    results, _removed = remove_words(kept_texts)

    # Match by page so that rows dropped by remove_similar stay None.
    working_df = working_df.set_index('Page')
    for url, kept_segments in zip(kept_urls, results):
        working_df.at[url, 'Cleaned'] = kept_segments
    return working_df.reset_index()[['ID', 'Page', 'Segments', 'Cleaned']]

In [None]:
# Clean every per-ID CSV found in 'Segmented By ID' and write the result under
# the same file name to 'Cleaned By ID'. One status line is printed per file.

# NOTE(review): the reason these ids are excluded is not recorded here.
SKIPPED_IDS = {'1183', '25997', '29935', '284418'}

# Listed once up front; every file name is visited only once, so files written
# during this run never need to be re-checked.
already_cleaned = set(os.listdir('Cleaned By ID'))

for i, f in enumerate(sorted(os.listdir('Segmented By ID'), key=len)):
    # Drop only a trailing '.csv'. str.rstrip('.csv') would strip any trailing
    # '.', 'c', 's' or 'v' characters and mangle names such as 'docs.csv'.
    file_id = f[:-len('.csv')] if f.endswith('.csv') else f

    print(i+1, end = '\t\t')
    print('ID: ', file_id.ljust(10), end = '')
    start = time.time()
    new_df = pd.read_csv(f'Segmented By ID/{f}', index_col=False)
    new_df.drop_duplicates(subset='Page', keep='first', inplace=True)
    # 'Segments' is stored as the text of a Python list literal.
    new_df['Segments'] = new_df['Segments'].apply(ast.literal_eval)

    page_count = len(new_df)
    print(str(page_count).ljust(10), end = '')
    seg_count = sum(len(row) for row in new_df['Segments'])
    print(str(seg_count).ljust(15), end = '')

    # Checked after loading so the page/segment counts are still printed.
    if f in already_cleaned:
        print('ALREADY CLEANED')
        continue

    # Sites with fewer than 3 or at least 500 pages are not cleaned.
    if page_count < 3 or page_count >= 500 or file_id in SKIPPED_IDS:
        print('SKIPPED')
        continue

    new_df = cleaner(new_df)
    new_df.to_csv(f'Cleaned By ID/{f}', index = False)
    print(time.time() - start, 'seconds')