In [None]:
import os
import sys

import requests

# Packages can be installed directly from the notebook, e.g.: %pip install requests-html

# Connectivity smoke test: fetch a known page and show the HTTP status code.
# A timeout is passed explicitly because requests otherwise waits indefinitely
# when the server never answers, which would hang the notebook.
r = requests.get("https://example.com", timeout=10)
print(r.status_code)

# Add the parent of the current working directory to sys.path so that modules
# located one level above the notebook folder can be imported.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))


In [None]:
# Install the playwright package into the environment of the running kernel
# (%pip, unlike !pip, targets the kernel's own environment).
%pip install playwright

In [None]:
import pandas as pd
from collections import Counter
import re

def most_frequent_terms(df, column, top_n=10):
    """
    Return the most frequent terms from a text column in a DataFrame.

    All non-missing values of the column are converted to strings, lowercased
    and split into alphabetic tokens (a-z plus the Swedish letters å, ä, ö).
    Missing values (NaN/None) are skipped so that their string forms
    ("nan"/"none") are not counted as terms.

    Args:
        df (pd.DataFrame): Input dataframe.
        column (str): Column name containing text.
        top_n (int): Number of most frequent terms to return.

    Returns:
        pd.DataFrame: DataFrame with the columns "term" and "count", ordered
            from most to least frequent and holding at most ``top_n`` rows.
            Empty (but with both columns) when no term is found.
    """
    # Drop missing values first: astype(str) would otherwise turn NaN/None into
    # the literal words "nan"/"None", which would then be counted as terms.
    values = df[column].dropna().astype(str)

    # Join all text in the column into one big string
    text = " ".join(values.tolist())

    # Tokenize the lowercased text: runs of letters (a-z, å, ä, ö) delimited by
    # word boundaries; tokens touching digits or underscores are not matched.
    tokens = re.findall(r"\b[a-zA-ZåäöÅÄÖ]+\b", text.lower())

    # Count terms
    counter = Counter(tokens)

    # Convert the top_n (term, count) pairs to a DataFrame
    most_common = counter.most_common(top_n)
    return pd.DataFrame(most_common, columns=["term", "count"])



In [None]:
# Read the bronze job table; the first CSV column is used as the row index.
bronze_data = pd.read_csv('../data/bronze/jobs.csv', index_col=0)

# Per site, count the non-null values in every other column and display the result.
bronze_group = (
    bronze_data
    .groupby(by=['site'])
    .count()
    .reset_index()
)
bronze_group

In [None]:
# Extract the 100 most common terms from the job titles and store them as CSV.
df = most_frequent_terms(df=bronze_data, column="job_title", top_n=100)
df.to_csv(path_or_buf='../data/most_frequent.csv')

# Run all scrapers

In [1]:
import inspect
import sys, os
import pandas as pd
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), ".."))) # add the parent directory to sys.path so that `src` can be imported
from src.data_loader import load_local_data, unload_local_data, load_local_dict, unload_local_dict
from src.scrapers.abstract_scraper import AbstractScraper
from src.scrapers.afry_scraper import AfryScraper
from src.scrapers.aliant_scraper import AliantScraper
from src.scrapers.asociety_scraper import ASocietyScraper
from src.scrapers.combitech_scraper import CombitechScraper
from src.scrapers.emagine_scraper import EmagineScraper
from src.scrapers.ework_scraper import EworkScraper
from src.scrapers.nikita_scraper import NikitaScraper
from src.scrapers.regent_scraper import RegentScraper
from src.scrapers.upgraded_scraper import UpgradedScraper

nr_payload_pre = len(load_local_dict())
nr_ads_pre = len(pd.read_csv('../data/bronze/jobs.csv'))

scrapers = [AfryScraper(), AliantScraper(), ASocietyScraper(), CombitechScraper(), EmagineScraper(), EworkScraper(), NikitaScraper(), RegentScraper(), UpgradedScraper()]
#scrapers = [AfryScraper()]
for s in scrapers:
    if s.site == 'Upgraded': 
        response = await s.request_status()
    else: 
        response = s.request_status()

    scraped_payload_dict = s.return_raw_job_posts_data(response)
    old_payload_dict = load_local_dict()
    old_bronze_data = load_local_data()
    
    new_payload_dict = s.return_new_ads(new_dict=scraped_payload_dict, old_dict=old_payload_dict)
    new_bronze_data = s.parse_bronze_data(new_payload_dict)

    updated_payload_dict = s.concat_dicts(new_payload_dict, old_payload_dict)
    updated_bronze_data = s.concat_new_rows(new_bronze_data, old_bronze_data)
    unload_local_dict(updated_payload_dict)
    unload_local_data(updated_bronze_data)


nr_payload_post = len(load_local_dict())
nr_ads_post = len(pd.read_csv('../data/bronze/jobs.csv'))
print('Total number of new added jobs:', nr_ads_post-nr_ads_pre)
print('Total number of new added jobs_payloads:', nr_payload_post-nr_payload_pre)


Afry > Response: 200
Afry > Nmr of scraped adds: 81
Afry > Parsing bronze data: 0
Aliant > Response: 200
Aliant > Nmr of scraped adds: 10
Aliant > Parsing bronze data: 0
A Society > Response: 200
A Society > Nmr of scraped adds: 113
A Society > Parsing bronze data: 0
Combitech > Response: 200
Combitech > Nmr of scraped adds: 34
Combitech > Parsing bronze data: 0
Emagine > Response: 200
Emagine > Nmr of scraped adds: 44
Emagine > Parsing bronze data: 0
Ework > Response: 200
Ework > Nmr of scraped adds: 87
Ework > Parsing bronze data: 0
Nikita > Response: 200
Nikita > Nmr of scraped adds: 20
Nikita > Parsing bronze data: 0
Regent > Response: 200
Regent > Nmr of scraped adds: 32
Regent > Parsing bronze data: 0
Upgraded > Status code: 200
Upgraded > Nmr of scraped adds: 75
Upgraded > Parsing bronze data: 0
Total number of new added jobs: 0
Total number of new added jobs_payloads: 0


# Bronze table 

In [None]:
# NOTE: importing from `src` only works if an earlier cell has already added
# the parent directory to sys.path; this cell does not do so itself.
import pandas as pd
from src.data_loader import load_local_data, unload_local_data
# Load the bronze table through the project's data loader.
bronze_data = load_local_data()


