In [1]:
%%time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
import multiprocessing
from tqdm import tqdm

with open("jugadores_sofascore.csv","r") as file:
    lista_jugadores = file.readlines()

CPU times: total: 93.8 ms
Wall time: 2.31 s


In [2]:
lista_jugadores

['https://www.sofascore.com/football/player/dusan-stoiljkovic/547776\n',
 'https://www.sofascore.com/football/player/viktor-gyokeres/804508\n',
 'https://www.sofascore.com/football/player/amando-lapage/1396102\n',
 'https://www.sofascore.com/football/player/marios-pourzitidis/962704\n',
 'https://www.sofascore.com/football/player/florian-grillitsch/186719\n',
 'https://www.sofascore.com/football/player/vitalii-katrych/1648299\n',
 'https://www.sofascore.com/football/player/antonin-kinsky/1031251\n',
 'https://www.sofascore.com/football/player/dylan-tait/988688\n',
 'https://www.sofascore.com/football/player/colin-rosler/907462\n',
 'https://www.sofascore.com/football/player/stephane-keller/1082963\n',
 'https://www.sofascore.com/football/player/vladimir-miletic/998786\n',
 'https://www.sofascore.com/football/player/olakunle-olusegun/1002552\n',
 'https://www.sofascore.com/football/player/yildirim-metehan/1516786\n',
 'https://www.sofascore.com/football/player/cambiaso-andrea/917048\n',

In [3]:
import psutil
import time
import threading
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# --- Clase de monitoreo simplificada ---
class MonitorSimple:
    def __init__(self):
        self.inicio_tiempo = None
        self.max_cpu = 0
        self.max_memoria = 0
        self.muestras_cpu = []
        self.muestras_memoria = []
        self._monitoring = False

    def iniciar(self):
        self.inicio_tiempo = time.time()
        self._monitoring = True
        self.thread = threading.Thread(target=self._monitorear)
        self.thread.start()
        print("🔍 Monitoreo de recursos iniciado...")

    def _monitorear(self):
        while self._monitoring:
            cpu_actual = psutil.cpu_percent(interval=0.5)
            memoria_actual = psutil.virtual_memory().percent
            self.max_cpu = max(self.max_cpu, cpu_actual)
            self.max_memoria = max(self.max_memoria, memoria_actual)
            self.muestras_cpu.append(cpu_actual)
            self.muestras_memoria.append(memoria_actual)

    def finalizar(self):
        self._monitoring = False
        self.thread.join()
        tiempo_total = time.time() - self.inicio_tiempo
        cpu_promedio = sum(self.muestras_cpu) / len(self.muestras_cpu) if self.muestras_cpu else 0
        memoria_promedio = sum(self.muestras_memoria) / len(self.muestras_memoria) if self.muestras_memoria else 0

        print("\n" + "="*60)
        print("📊 MÉTRICAS FINALES DE RECURSOS")
        print("="*60)
        print(f"⏱️  Tiempo total: {tiempo_total:.1f} segundos")
        print(f"🖥️  CPU - Promedio: {cpu_promedio:.1f}% | Máximo: {self.max_cpu:.1f}%")
        print(f"💾 RAM - Promedio: {memoria_promedio:.1f}% | Máximo: {self.max_memoria:.1f}%")
        print(f"📈 Muestras tomadas: {len(self.muestras_cpu)}")
        print("="*60)

# --- Scraper de jugador ---
def scrape(jugador_url):
    datos_jugador = {'url': jugador_url}
    try:
        options = Options()
        options.add_argument('--headless')
        driver = webdriver.Chrome(options=options)
        driver.get(jugador_url)

        try:
            el = driver.find_element(By.XPATH, "//h1[contains(@class, 'Text')]")
            datos_jugador['nombre'] = el.text.strip()
        except: pass

        # 2. Posición
        try:
            el = driver.find_element(By.XPATH, "//span[contains(@class, 'Text') and contains(@class, 'fzFbgV')]")
            datos_jugador['posicion'] = el.text.strip()
        except: pass

        # 3. Equipo actual
        try:
            el = driver.find_element(By.XPATH, "//a[contains(@href, '/team/football')]")
            datos_jugador['equipo'] = el.text.strip().split("\n")[0]
        except: pass

        # 4. Nacionalidad
        try:
            el = driver.find_element(By.XPATH, "//img[contains(@alt, 'country flag')]")
            datos_jugador['nacionalidad'] = el.get_attribute('alt').replace(' country flag', '')
        except: pass

        # 5. Edad
        try:
            el = driver.find_element(By.XPATH, "//span[contains(text(), 'years old')]")
            datos_jugador['edad'] = el.text.replace(' years old', '').strip()
        except: pass

        # 6. Altura
        try:
            el = driver.find_element(By.XPATH, "//span[contains(text(), 'cm')]")
            datos_jugador['altura'] = el.text.strip()
        except: pass

        # 7. Valor de mercado
        try:
            el = driver.find_element(By.XPATH, "//div[contains(@class, 'Text imGAlA')]")
            datos_jugador['valor_mercado'] = el.text.strip()
        except: pass

        # 8–12. Ataque, técnica, velocidad, defensa, creatividad
        try:
            stats = driver.find_elements(By.XPATH, "//span[contains(@class, 'textStyle_table.small c_surface.s1 lh_20px d_block ta_center')]")
            etiquetas = ['ataque', 'tecnica', 'velocidad', 'defensa', 'creatividad']
            for i, etiq in enumerate(etiquetas):
                if i < len(stats):
                    datos_jugador[etiq] = stats[i].text.strip()
        except: pass

        # 13. Liga actual
        try:
            el = driver.find_element(By.XPATH, "//a[contains(@href, '/tournament/football')]")
            datos_jugador['liga'] = el.text.strip()
        except: pass

        # 14. Partidos jugados
        try:
            el = driver.find_element(By.XPATH, "//span[contains(@class, 'Text fftNCK')]")
            datos_jugador['partidos'] = el.text.strip()
        except: pass

        # 15. Valoración media
        try:
            el = driver.find_element(By.XPATH, "//span[@aria-valuenow and @role='meter']")
            datos_jugador['valoracion_media'] = el.get_attribute('aria-valuenow')
        except: pass

    except Exception as e:
        datos_jugador['error'] = str(e)
    finally:
        try: driver.quit()
        except: pass
    return datos_jugador

# --- Código principal ---
if __name__ == '__main__':


    monitor = MonitorSimple()
    monitor.iniciar()

    resultados = []
    errores = []

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(scrape, url) for url in lista_jugadores[-2000:]]
        for future in tqdm(as_completed(futures), total=len(futures)):
            result = future.result()
            if 'error' in result:
                errores.append(result)
            else:
                resultados.append(result)

    pd.DataFrame(resultados).to_csv('jugadores_datos.csv', index=False)
    pd.DataFrame(errores).to_csv('jugadores_errores.csv', index=False)

    monitor.finalizar()


🔍 Monitoreo de recursos iniciado...


100%|████████████████████████████████████████████████████████████████████████████| 2000/2000 [1:24:21<00:00,  2.53s/it]



📊 MÉTRICAS FINALES DE RECURSOS
⏱️  Tiempo total: 5061.6 segundos
🖥️  CPU - Promedio: 16.4% | Máximo: 100.0%
💾 RAM - Promedio: 23.1% | Máximo: 32.5%
📈 Muestras tomadas: 10081


In [14]:
pd.concat([pd.read_csv("primeros.csv"),pd.read_csv("jugadores_datos.csv")]).to_csv("datos_finales.csv")