Páginas web de la Facultad de Matemáticas obtenidas el día 23 de junio de 2024

Instalar la librería necesaria

In [None]:
#!pip install scrapy

Crear nuevo proyecto

In [None]:
# Generate a new Scrapy project skeleton named "webcrawler" in the current directory.
!scrapy startproject webcrawler


## Obtener direcciones web a partir de URL inicial

Crear archivo con el código del algoritmo para encontrar las direcciones web

In [None]:
%%writefile /content/webcrawler/webcrawler/spiders/url_spider.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from urllib.parse import urljoin

class UrlSpider(CrawlSpider):
    """Crawl matematicas.ucm.es and record every link target it encounters.

    Starting from ``start_urls`` the spider follows the links selected by
    ``rules`` up to ``DEPTH_LIMIT`` hops. Each distinct absolute URL found in
    an ``<a href>`` attribute is appended to ``extracted_urls.txt`` (relative
    to the current working directory), one URL per line.
    """

    name = 'url_spider'
    allowed_domains = ['matematicas.ucm.es']
    start_urls = ['https://matematicas.ucm.es/']  # Crawl entry point

    custom_settings = {
        'DEPTH_LIMIT': 4  # Maximum crawl depth
    }

    rules = (
        Rule(LinkExtractor(allow=()), callback='parse_item', follow=True),
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.urls_seen = set()

        # The output file is opened in append mode, so seed the de-duplication
        # set with what an earlier run already wrote; otherwise re-running the
        # crawl would write every URL a second time.
        try:
            with open('extracted_urls.txt', 'r') as f:
                self.urls_seen.update(f.read().splitlines())
        except FileNotFoundError:
            # First run: nothing has been written yet.
            pass

    def parse_start_url(self, response, **kwargs):
        """Record the links found on the start page(s).

        CrawlSpider passes responses for ``start_urls`` to this hook rather
        than to the rule callback, so without it the links on the entry page
        would be followed but never written to the output file.
        """
        self.parse_item(response)
        return []

    def parse_item(self, response):
        """Append the not-yet-seen absolute link targets of *response* to the output file."""
        # Binary documents (PDF, images, ...) arrive as a plain Response whose
        # selector helpers raise; they contain no <a> elements to extract.
        if not isinstance(response, scrapy.http.TextResponse):
            return

        new_urls = []
        for href in response.css('a::attr(href)').getall():
            try:
                url = urljoin(response.url, href)
            except ValueError:
                # A malformed href must not discard the page's other links.
                continue
            if url not in self.urls_seen:
                self.urls_seen.add(url)
                new_urls.append(url)

        with open('extracted_urls.txt', 'a') as f:
            f.writelines(f"{url}\n" for url in new_urls)


Ejecutar algoritmo de búsqueda

In [None]:
# Work from the project root: the spider writes extracted_urls.txt using a
# relative path, so the file is created in this directory.
%cd /content/webcrawler
# Run the spider whose `name` attribute is 'url_spider'.
!scrapy crawl url_spider


Analizar las direcciones obtenidas

In [None]:
# Load the extracted URLs (one per line) into a list
with open('extracted_urls.txt', 'r') as url_file:
    file_text = url_file.read()
urls = file_text.splitlines()

# Show how many URLs were read, followed by the full list
print(len(urls))
print(urls)

## Obtener matriz de adyacencia de webs

Crear archivo con el código del algoritmo para encontrar las direcciones web y crear la matriz de adyacencia

In [None]:
%%writefile /content/webcrawler/webcrawler/spiders/link_spider.py
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import numpy as np
import pandas as pd

import warnings

# Load the previously extracted URLs (one per line).
# Scrapy imports every spider module of the project for any command, so a
# missing file must not raise here: that would also break unrelated commands
# such as `scrapy crawl url_spider`, which is what creates the file.
try:
    with open('extracted_urls.txt', 'r') as f:
        initial_urls = f.read().splitlines()
except FileNotFoundError:
    warnings.warn(
        "extracted_urls.txt not found; link_spider has no start URLs. "
        "Run url_spider first."
    )
    initial_urls = []


class LinkSpider(CrawlSpider):
    """Build a link adjacency matrix over the URLs listed in ``initial_urls``.

    Every distinct URL in ``start_urls`` becomes one node (row and column).
    For each response passed to ``parse_item`` whose URL is a node, the links
    on that page pointing to other nodes are recorded. When the spider closes,
    the matrix is written to ``adjacency_matrix.csv`` in the current working
    directory, with the URLs as row and column labels.
    """

    name = 'link_spider'
    allowed_domains = ['matematicas.ucm.es']

    # URL list: crawl seeds and matrix nodes at the same time
    start_urls = initial_urls

    custom_settings = {
        #'DEPTH_LIMIT': 1,  # Limit the crawl depth
        #'CLOSESPIDER_PAGECOUNT': 100  # Limit the number of crawled pages
    }

    rules = (
        Rule(LinkExtractor(allow=()), callback='parse_item', follow=False),
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.url_to_index = {}    # URL -> row/column position
        self.index_to_url = []    # row/column position -> URL
        self.adjacency_list = []  # per source index: list of target indices

        # Built once and reused for every response instead of per page.
        self._link_extractor = LinkExtractor(allow=())

        # Assign one index per distinct start URL, in first-seen order
        for url in self.start_urls:
            if url not in self.url_to_index:
                self.url_to_index[url] = len(self.index_to_url)
                self.index_to_url.append(url)
                self.adjacency_list.append([])

    def parse_item(self, response):
        """Record which indexed URLs the page at ``response.url`` links to."""
        page_index = self.url_to_index.get(response.url)
        if page_index is None:
            # Not one of the matrix nodes; nothing to record.
            return

        # Indexed URLs may point to binary documents (PDF, images, ...).
        # Link extraction raises on such non-text responses, and they have
        # no outgoing links anyway.
        if not isinstance(response, scrapy.http.TextResponse):
            return

        outgoing = self.adjacency_list[page_index]
        for link in self._link_extractor.extract_links(response):
            target_index = self.url_to_index.get(link.url)
            if target_index is not None:
                outgoing.append(target_index)

    def closed(self, reason):
        """Convert the collected adjacency lists into a labelled 0/1 matrix and save it as CSV."""
        size = len(self.index_to_url)
        adjacency_matrix = np.zeros((size, size))
        for source, targets in enumerate(self.adjacency_list):
            if targets:
                # Set all of this row's target columns at once.
                adjacency_matrix[source, targets] = 1

        df = pd.DataFrame(adjacency_matrix, index=self.index_to_url, columns=self.index_to_url)
        df.to_csv('adjacency_matrix.csv')


Ejecutar algoritmo de búsqueda

In [None]:
# Work from the project root: link_spider reads extracted_urls.txt and writes
# adjacency_matrix.csv using relative paths.
%cd /content/webcrawler
# Run the spider whose `name` attribute is 'link_spider'.
!scrapy crawl link_spider

Analizar matriz de adyacencia obtenida

In [None]:
import pandas as pd

# Load the adjacency matrix; the first CSV column holds the row labels
matrix_path = '/content/webcrawler/adjacency_matrix.csv'
df = pd.read_csv(matrix_path, index_col=0)
df


In [None]:
# Show the row labels (the DataFrame index) of the loaded adjacency matrix
print(df.index)