# Skip to content

# Daily scraping and submitting #342

# Daily scraping and submitting

# Daily scraping and submitting #342

# Daily pipeline in two jobs:
#   1. scraper   - scrapes the previous day's publication, commits it, then
#                  rebuilds and commits ./full_dataset/full.csv.
#   2. submitter - sends the previous day's data to Netcraft, CRDF,
#                  AlienVault, the Wayback Machine and Spamhaus.
name: Daily scraping and submitting

on:
  schedule:
    # Every day at 00:30 UTC.
    - cron: '30 00 * * *'

# Read-only token by default; only the job that pushes asks for write access.
permissions:
  contents: read

defaults:
  run:
    # Naming the shell explicitly makes GitHub run every step with
    # `bash --noprofile --norc -eo pipefail {0}`. Without it the runner uses
    # `bash -e {0}` (no pipefail), so a failing csvcut/csvsql/csvstack on the
    # left-hand side of a pipe would go unnoticed and an empty file would be
    # submitted or committed.
    shell: bash

jobs:
  scraper:
    runs-on: ubuntu-latest
    # This job commits and pushes to the repository.
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          # Quoted on purpose: an unquoted 3.10 is read as the float 3.1.
          python-version: "3.10"

      - name: Install dependencies
        run: |
          # Print the current time in UTC and in the Europe/Paris time zone.
          date -u
          TZ="Europe/Paris" date
          python -m pip install --upgrade pip
          pip install -r "$GITHUB_WORKSPACE/_resources/scraper_requirements.txt"

      - name: Create year/month dirs if not existing
        run: |
          cd "$GITHUB_WORKSPACE"
          mkdir -p "$(date --date="yesterday" '+%Y/%m')"

      - name: Scrape the previous day publication and push to the repository
        run: |
          cd "$GITHUB_WORKSPACE"
          python "$GITHUB_WORKSPACE/_resources/scraper.py" -d yesterday
          git config user.name scraping-bot
          git config user.email github-actions@github.com
          git pull
          git add -A
          # `git commit` exits non-zero when nothing is staged, which fails
          # this step and, through `needs`, skips the submitter job.
          git commit -m "scraping bot - $(date --date="yesterday" '+%Y/%m/%d')"
          git push

      - name: Consolidate the full dataset
        run: |
          cd "$GITHUB_WORKSPACE"
          # Stack every matching daily CSV into one file; the glob is left
          # unquoted on purpose so the shell expands it.
          csvstack -n datefile --filenames 20**/**/**-**-****.csv \
            | csvformat -U 1 > ./full_dataset/full.csv
          git config user.name scraping-bot
          git config user.email github-actions@github.com
          git pull
          git add -A
          git commit -m "fulldataset bot - $(date --date="yesterday" '+%Y/%m/%d')"
          git push

  submitter:
    # Runs only after the scraper job has succeeded. It inherits the
    # read-only token: it pulls from the repository but never pushes.
    needs: scraper
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Install Netcraft dependencies
        run: |
          # Pick up the commits pushed by the scraper job.
          git pull
          python -m pip install --upgrade pip
          pip install -r "$GITHUB_WORKSPACE/_resources/netcraft_requirements.txt"

      - name: Submit the previous day publication to Netcraft
        env:
          SECRET_NETCRAFT_REPORT_MAIL: ${{ secrets.SECRET_NETCRAFT_REPORT_MAIL }}
        run: |
          cd "$GITHUB_WORKSPACE"
          yesterday_csv="$(date --date="yesterday" '+%Y-%m-%d.csv')"
          source_csv="./$(date --date="yesterday" '+%Y/%m')/${yesterday_csv}"
          # Keep only the fqdn column, drop the header row and store the list
          # in /tmp (tee also echoes it to the job log). Later steps of this
          # job read the same /tmp file.
          csvcut -x -c 'fqdn' "${source_csv}" | sed 1d | tee "/tmp/${yesterday_csv}"
          python "$GITHUB_WORKSPACE/_resources/netcraft.py" -a submit -i "/tmp/${yesterday_csv}"

      - name: Install CRDF submitter dependencies
        run: |
          pip install -r "$GITHUB_WORKSPACE/_resources/crdf_requirements.txt"

      - name: Submit the previous day publication to CRDF
        env:
          SECRET_CRDF_API_KEY: ${{ secrets.SECRET_CRDF_API_KEY }}
        run: |
          cd "$GITHUB_WORKSPACE"
          yesterday_csv="$(date --date="yesterday" '+%Y-%m-%d.csv')"
          source_csv="./$(date --date="yesterday" '+%Y/%m')/${yesterday_csv}"
          # Regenerate the fqdn list, overwriting the /tmp file written by the
          # Netcraft step.
          csvcut -x -c 'fqdn' "${source_csv}" | sed 1d | tee "/tmp/${yesterday_csv}"
          ls -al "/tmp/${yesterday_csv}"
          python "$GITHUB_WORKSPACE/_resources/crdf.py" -a submit -i "/tmp/${yesterday_csv}"

      - name: Push to AlienVault pulse
        # A failure here does not fail the job; the remaining steps still run.
        continue-on-error: true
        env:
          SECRET_ALIENVAULT_API_KEY: ${{ secrets.SECRET_ALIENVAULT_API_KEY }}
        run: |
          cd "$GITHUB_WORKSPACE"
          pip install -r "$GITHUB_WORKSPACE/_resources/alienvault_requirements.txt"
          # Reads the fqdn list left in /tmp by the previous steps.
          python "$GITHUB_WORKSPACE/_resources/alienvault.py" -p "6544b0acf1c1eb2efe9e34e0" -i "/tmp/$(date --date="yesterday" '+%Y-%m-%d.csv')"

      - name: Install Waybacksave dependencies
        run: |
          pip install -r "$GITHUB_WORKSPACE/_resources/waybacksave_requirements.txt"

      - name: Submit the previous day publication to the Wayback Machine for archive
        run: |
          cd "$GITHUB_WORKSPACE"
          source_csv="./$(date --date="yesterday" '+%Y/%m')/$(date --date="yesterday" '+%Y-%m-%d.csv')"
          # Select the fqdn of every row that has an IP, without the header.
          csvsql -u 1 "${source_csv}" --tables db --query 'SELECT fqdn FROM db WHERE ip IS NOT NULL' \
            | sed 1d | tee /tmp/previous_day_with_ip
          # Four passes: each run takes as input the file the previous run was
          # given through -f (presumably the entries that failed - confirm in
          # waybacksave.py). The last pass records no failure file.
          python "$GITHUB_WORKSPACE/_resources/waybacksave.py" -i /tmp/previous_day_with_ip -f /tmp/previous_day_with_ip_failed
          python "$GITHUB_WORKSPACE/_resources/waybacksave.py" -i /tmp/previous_day_with_ip_failed -f /tmp/previous_day_with_ip_failed_again -q
          python "$GITHUB_WORKSPACE/_resources/waybacksave.py" -i /tmp/previous_day_with_ip_failed_again -f /tmp/previous_day_with_ip_failed_again_again -q
          python "$GITHUB_WORKSPACE/_resources/waybacksave.py" -i /tmp/previous_day_with_ip_failed_again_again

      - name: Submit to Spamhaus
        env:
          SECRET_SPAMHAUS_API_KEY: ${{ secrets.SECRET_SPAMHAUS_API_KEY }}
        run: |
          cd "$GITHUB_WORKSPACE"
          pip install -r "$GITHUB_WORKSPACE/_resources/spamhaus_requirements.txt"
          # Reads the fqdn list left in /tmp by the earlier steps; -r is given
          # the URL of that day's post as the report reason.
          python "$GITHUB_WORKSPACE/_resources/spamhaus.py" -i "/tmp/$(date --date="yesterday" '+%Y-%m-%d.csv')" -r "See https://red.flag.domains/posts/$(date --date="yesterday" '+%Y-%m-%d')/"