<a href="https://colab.research.google.com/github/ohsolus/MapReduceThread/blob/main/MapReduceMultithreading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup

def fetch_content(url, timeout=10):
    """Download a web page and return its visible text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests.get`` may wait indefinitely on a server
            that never answers.

    Returns:
        The text extracted from the page's HTML, or an empty string when
        the request fails or the server answers with a status code other
        than 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Report network failures the same way as a non-200 answer so the
        # caller only has to handle the empty-string case.
        return ""
    if response.status_code != 200:
        return ""
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text()


In [None]:
from collections import defaultdict
import re

def map_function(content):
    """Count how often each word occurs in a piece of text.

    The text is lower-cased first, so the count is case-insensitive. A
    word is a run of letters delimited by word boundaries; tokens that mix
    letters with digits or underscores are ignored.

    Args:
        content: The text to analyse.

    Returns:
        A tuple ``(word_count, total_words)`` where ``word_count`` is a
        ``defaultdict(int)`` mapping each word to its number of
        occurrences and ``total_words`` is the number of words found.
    """
    # [^\W\d_] means "a word character that is neither a digit nor an
    # underscore", i.e. a letter of any alphabet. An ASCII-only class
    # such as [a-zA-Z] would silently drop every word that contains an
    # accented letter, because \b never falls between two letters.
    words = re.findall(r'\b[^\W\d_]+\b', content.lower())
    word_count = defaultdict(int)
    for word in words:
        word_count[word] += 1
    return word_count, len(words)


In [None]:
def reduce_function(mapped_data_list):
    """Merge several word-count mappings into a single one.

    Args:
        mapped_data_list: Iterable of mappings from word to count.

    Returns:
        A ``defaultdict(int)`` in which each word maps to the sum of its
        counts across all the input mappings.
    """
    totals = defaultdict(int)
    # Flatten every (word, count) pair from every partial result into one
    # stream, then accumulate.
    all_pairs = (
        pair
        for partial in mapped_data_list
        for pair in partial.items()
    )
    for term, occurrences in all_pairs:
        totals[term] = totals[term] + occurrences
    return totals


In [None]:
import threading
import time

class MapReduce:
    """Count words across several URLs, fetching each one in its own thread."""

    def __init__(self, urls):
        """Store the URLs to process and create empty result containers.

        Args:
            urls: Iterable of page addresses; ``execute`` starts one thread
                per entry.
        """
        self.urls = urls
        # Not written by any method of this class; kept so existing code
        # that reads the attribute keeps working.
        self.contents = []
        # One word-count mapping per URL that returned non-empty content.
        self.mapped_data = []
        # Guards every list the worker threads append to.
        self.lock = threading.Lock()
        # (url, elapsed_seconds) pairs, appended as each thread finishes.
        self.times = []
        # (url, total_words) pairs for URLs that returned non-empty content.
        self.total = []

    def fetch_and_map(self, url):
        """Fetch one URL, count its words and record how long that took.

        Intended to run as a thread target. Results are appended to
        ``mapped_data``, ``total`` and ``times`` while holding ``lock``.

        Args:
            url: Address of the page to process.
        """
        # perf_counter is a monotonic clock meant for measuring intervals.
        start_time = time.perf_counter()
        try:
            content = fetch_content(url)
            if content:
                mapped_data, total = map_function(content)
                # Update both result lists under the same lock so they stay
                # consistent with each other.
                with self.lock:
                    self.mapped_data.append(mapped_data)
                    self.total.append((url, total))
        finally:
            # Record the elapsed time even if fetching or mapping raised.
            elapsed_time = time.perf_counter() - start_time
            with self.lock:
                self.times.append((url, elapsed_time))

    def execute(self):
        """Process every URL in parallel and return the merged word counts.

        Returns:
            The result of ``reduce_function`` applied to the per-URL word
            counts collected by the worker threads.
        """
        # Create one thread per URL
        threads = [threading.Thread(target=self.fetch_and_map, args=(url,)) for url in self.urls]

        for thread in threads:
            thread.start()

        # Wait for every worker before reducing
        for thread in threads:
            thread.join()

        reduced_data = reduce_function(self.mapped_data)
        return reduced_data

# Example run of the MapReduce word count
urls = ['https://en.wikipedia.org/wiki/Ebenezer_Place,_Wick', 'https://es.wikipedia.org/wiki/Ydalir']
map_reduce = MapReduce(urls)

# The worker threads run concurrently, so the real duration of the run has
# to be measured around execute(); adding up the per-URL times overstates it.
run_start = time.perf_counter()
result = map_reduce.execute()
wall_time = time.perf_counter() - run_start

# Print the merged word counts
for word, count in result.items():
    print(f"{word}: {count}")

# Print how long each URL took
print("\nTiempos de procesamiento por URL: \n")

for url, elapsed_time in map_reduce.times:
    print(f"{url}: {elapsed_time:.2f} segundos")

# Sum of the individual thread durations (not the wall-clock time)
sum_time = sum(elapsed for _, elapsed in map_reduce.times)
print("\nTiempo acumulado de los hilos: ", sum_time)

print("\nTiempo total de ejecución: ", wall_time)


print("\n ")

# Print the word total per URL and overall
for url, total_words in map_reduce.total:
    print(f"{url}: {total_words}")

sum_words = sum(words for _, words in map_reduce.total)
print("\nTotal de palabras: ", sum_words)




ebenezer: 8
place: 8
wick: 7
wikipedia: 13
jump: 1
to: 13
content: 2
main: 4
menu: 2
move: 3
sidebar: 3
hide: 3
navigation: 1
pagecontentscurrent: 1
eventsrandom: 1
articleabout: 1
wikipediacontact: 1
usdonate: 1
contribute: 1
helplearn: 1
editcommunity: 1
portalrecent: 1
changesupload: 2
file: 1
search: 2
create: 2
account: 2
log: 2
in: 10
personal: 1
tools: 3
pages: 1
for: 2
logged: 1
out: 1
editors: 1
learn: 1
more: 1
contributionstalk: 1
contents: 2
top: 1
toggle: 2
the: 27
table: 1
of: 11
languages: 1
edit: 2
links: 2
articletalk: 1
english: 2
readeditview: 2
history: 2
actions: 1
general: 2
what: 1
hererelated: 1
filespecial: 1
pagespermanent: 1
linkpage: 1
informationcite: 1
this: 5
pageget: 1
shortened: 1
urldownload: 1
qr: 1
codewikidata: 1
item: 1
print: 1
export: 1
download: 1
as: 3
pdfprintable: 1
version: 1
other: 2
projects: 1
wikimedia: 5
commons: 4
coordinates: 1
n: 4
w: 6
from: 8
free: 2
encyclopedia: 1
street: 13
credited: 2
world: 4
s: 3
shortest: 5
with: 3
union: 3
