This notebook is an attempt to clustering the various links extracted from the Lumen API on passage 2.

In [34]:
import json

# Load the data
with open('data/data.json', 'r') as f:
    data = json.load(f)

## 1. FROM MESSY JSON TO TIDY DATA

In [53]:
print(data[9]['dmca']['works'][0].keys())

dict_keys(['description', 'infringing_urls', 'copyrighted_urls'])


In [65]:
flattened_data = []

for notice in data:
    if 'dmca' in notice:
        if 'works' in notice['dmca']: 
            dmca = notice['dmca']
            for work in dmca['works']:
                for infringing_url in work['infringing_urls']:
                    # Move this line outside the loop for infringing_url
                    copyrighted_url = work['copyrighted_urls'][0]['url'] if work['copyrighted_urls'] else None
                    flat_notice = {
                        'id': dmca['id'],
                        'type': dmca['type'],
                        'title': dmca['title'],
                        'body': dmca['body'],
                        'date_sent': dmca['date_sent'],
                        'date_received': dmca['date_received'],
                        'topics': ','.join(dmca['topics']),
                        'sender_name': dmca['sender_name'],
                        'principal_name': dmca['principal_name'],
                        'recipient_name': dmca['recipient_name'],
                        'tags': ','.join(dmca['tags']),
                        'jurisdictions': ','.join(dmca['jurisdictions']),
                        'action_taken': dmca['action_taken'],
                        'language': dmca['language'],
                        'description': work['description'],
                        'infringing_url': infringing_url['url'],
                        'copyrighted_url': copyrighted_url,
                    }
                    flattened_data.append(flat_notice)

df = pd.DataFrame(flattened_data)

In [68]:
df.head()

Unnamed: 0,id,type,title,body,date_sent,date_received,topics,sender_name,principal_name,recipient_name,tags,jurisdictions,action_taken,language,description,infringing_url,copyrighted_url
0,28811860,DMCA,DMCA (Copyright) Complaint to Google,,2022-09-15T00:00:00.000Z,2022-09-15T00:00:00.000Z,"DMCA Notices,Copyright",Pdl Backup,,Google LLC,,US,,,Dnf Council Shirt,https://pvtboutique.com/product/minecraft-dnf-...,https://twitter.com/pdlbackup/status/156964685...
1,28811860,DMCA,DMCA (Copyright) Complaint to Google,,2022-09-15T00:00:00.000Z,2022-09-15T00:00:00.000Z,"DMCA Notices,Copyright",Pdl Backup,,Google LLC,,US,,,Dnf Council Shirt,https://hottrendclothing.com/product/minecraft...,https://twitter.com/pdlbackup/status/156964685...
2,28811860,DMCA,DMCA (Copyright) Complaint to Google,,2022-09-15T00:00:00.000Z,2022-09-15T00:00:00.000Z,"DMCA Notices,Copyright",Pdl Backup,,Google LLC,,US,,,Dnf Council Shirt,https://t-shirtat.com/shop/minecraft-dnf-counc...,https://twitter.com/pdlbackup/status/156964685...
3,28811860,DMCA,DMCA (Copyright) Complaint to Google,,2022-09-15T00:00:00.000Z,2022-09-15T00:00:00.000Z,"DMCA Notices,Copyright",Pdl Backup,,Google LLC,,US,,,Dnf Council Shirt,https://t-shirtat.com/shop/dnf-council-shirt-h...,https://twitter.com/pdlbackup/status/156964685...
4,28811860,DMCA,DMCA (Copyright) Complaint to Google,,2022-09-15T00:00:00.000Z,2022-09-15T00:00:00.000Z,"DMCA Notices,Copyright",Pdl Backup,,Google LLC,,US,,,Dnf Council Shirt,https://uspremiumgift.com/product/minecraft-dn...,https://twitter.com/pdlbackup/status/156964685...


In [70]:
df.dtypes

id                  int64
type               object
title              object
body               object
date_sent          object
date_received      object
topics             object
sender_name        object
principal_name     object
recipient_name     object
tags               object
jurisdictions      object
action_taken       object
language           object
description        object
infringing_url     object
copyrighted_url    object
dtype: object

In [71]:
df['date_sent'] = pd.to_datetime(df['date_sent'])
df['date_received'] = pd.to_datetime(df['date_received'])

In [72]:
df.dtypes

id                               int64
type                            object
title                           object
body                            object
date_sent          datetime64[ns, UTC]
date_received      datetime64[ns, UTC]
topics                          object
sender_name                     object
principal_name                  object
recipient_name                  object
tags                            object
jurisdictions                   object
action_taken                    object
language                        object
description                     object
infringing_url                  object
copyrighted_url                 object
dtype: object

## ADDING SOURCE URLS (COPYRIGHTED AND INFRINGING)

In [75]:
from urllib.parse import urlparse

def get_base_url(url):
    return urlparse(url)._replace(path="", params="", query="", fragment="").geturl()

# Using apply to apply the function to each element in the series
df['source_infringing'] = df['infringing_url'].apply(get_base_url)

df['source_copyrighted'] = df['copyrighted_url'].apply(get_base_url)

df.head()


Unnamed: 0,id,type,title,body,date_sent,date_received,topics,sender_name,principal_name,recipient_name,tags,jurisdictions,action_taken,language,description,infringing_url,copyrighted_url,source_infringing,source_copyrighted
0,28811860,DMCA,DMCA (Copyright) Complaint to Google,,2022-09-15 00:00:00+00:00,2022-09-15 00:00:00+00:00,"DMCA Notices,Copyright",Pdl Backup,,Google LLC,,US,,,Dnf Council Shirt,https://pvtboutique.com/product/minecraft-dnf-...,https://twitter.com/pdlbackup/status/156964685...,https://pvtboutique.com,https://twitter.com
1,28811860,DMCA,DMCA (Copyright) Complaint to Google,,2022-09-15 00:00:00+00:00,2022-09-15 00:00:00+00:00,"DMCA Notices,Copyright",Pdl Backup,,Google LLC,,US,,,Dnf Council Shirt,https://hottrendclothing.com/product/minecraft...,https://twitter.com/pdlbackup/status/156964685...,https://hottrendclothing.com,https://twitter.com
2,28811860,DMCA,DMCA (Copyright) Complaint to Google,,2022-09-15 00:00:00+00:00,2022-09-15 00:00:00+00:00,"DMCA Notices,Copyright",Pdl Backup,,Google LLC,,US,,,Dnf Council Shirt,https://t-shirtat.com/shop/minecraft-dnf-counc...,https://twitter.com/pdlbackup/status/156964685...,https://t-shirtat.com,https://twitter.com
3,28811860,DMCA,DMCA (Copyright) Complaint to Google,,2022-09-15 00:00:00+00:00,2022-09-15 00:00:00+00:00,"DMCA Notices,Copyright",Pdl Backup,,Google LLC,,US,,,Dnf Council Shirt,https://t-shirtat.com/shop/dnf-council-shirt-h...,https://twitter.com/pdlbackup/status/156964685...,https://t-shirtat.com,https://twitter.com
4,28811860,DMCA,DMCA (Copyright) Complaint to Google,,2022-09-15 00:00:00+00:00,2022-09-15 00:00:00+00:00,"DMCA Notices,Copyright",Pdl Backup,,Google LLC,,US,,,Dnf Council Shirt,https://uspremiumgift.com/product/minecraft-dn...,https://twitter.com/pdlbackup/status/156964685...,https://uspremiumgift.com,https://twitter.com


In [76]:
sources = df[['id', 'source_infringing', 'source_copyrighted']]

## ISOLATING RELEVANT NOTICES

In [94]:
sources.head()

Unnamed: 0,id,source_infringing,source_copyrighted
0,28811860,https://pvtboutique.com,https://twitter.com
1,28811860,https://hottrendclothing.com,https://twitter.com
2,28811860,https://t-shirtat.com,https://twitter.com
3,28811860,https://t-shirtat.com,https://twitter.com
4,28811860,https://uspremiumgift.com,https://twitter.com


In [96]:
copyrighted_counts = sources['source_copyrighted'].value_counts()
top_50_copyrighted = copyrighted_counts.nlargest(50).index
top_50_copyrighted

Index(['https://chaturbate.com', 'https://nubilesporn.com',
       'https://onlyfans.com', '', 'https://www.lacoste.com',
       'https://themeforest.net', 'https://yithemes.com',
       'https://codecanyon.net', 'https://www.brazzers.com',
       'https://twitter.com', 'https://taaghche.com',
       'https://woocommerce.com', 'https://www.youtube.com',
       'https://www.dramaclub.one', 'https://profiles.myfreecams.com',
       'https://mydramalist.com', 'https://www.imdb.com',
       'http://onlyfans.com', 'http://www.toei-anim.co.jp',
       'https://www.naughtyamerica.com', 'http://profiles.myfreecams.com',
       'http://www.chicdownload.ir', 'http://toomics.com',
       'https://www.cdreader.com', 'https://comics.shogakukan.co.jp',
       'https://vieon.vn', 'https://www.cmoa.jp',
       'https://ver.movistarplus.es', 'https://doramasqueen.com',
       'https://www.watchit.com', 'https://bookstore.1roman.ir',
       'https://www.goodnovel.com', 'https://watchit.com',
       'htt

In [99]:
# Filter the original DataFrame, keeping only rows where the source_infringing is not in the top 50
filtered_df = sources[~sources['source_copyrighted'].isin(top_50_copyrighted)]
filtered_df

Unnamed: 0,id,source_infringing,source_copyrighted
53,29362386,https://aniclube.app,https://www.tv-tokyo.co.jp
54,29362386,https://animesonlinen.com,https://www.tv-tokyo.co.jp
55,29362386,https://goyabu.com,https://www.tv-tokyo.co.jp
56,29362386,https://animesrubro.net,https://www.tv-tokyo.co.jp
57,29362386,https://meusanimes.net,https://www.tv-tokyo.co.jp
...,...,...,...
3635055,35213103,https://rakshiri.com,https://www.shopfktoys.com
3635056,35216480,https://daihongphat.vn,https://www.giaanproperty.vn
3635068,35213242,https://chotistories.com,https://www.banglachotistories.com
3635069,35224211,https://thammylinhanh.vn,https://tamanhhospital.vn


In [100]:
infringing_counts = sources['source_infringing'].value_counts()
top_50_infringing = infringing_counts.nlargest(50).index
top_50_infringing

Index(['https://femefun.com', 'https://pervertslut.com',
       'https://www.camwhores.tv', 'https://area51.porn',
       'https://erothots.co', 'https://www.camwhores.video',
       'https://sharewordpress.net', 'https://inoxnhanvuong.com',
       'https://dienmaybigstar.com', 'https://inoxmanhhung.com',
       'https://www.camwhorez.tv', 'https://sxyprn.com',
       'https://pifansub.net', 'https://fapello.com',
       'https://archivebate.com', 'https://xfantazy.com',
       'https://pifansub.org', 'https://debouchage-bruxsel.be',
       'https://camstreams.tv', 'https://pelisflix3.top',
       'https://fapodrop.com', 'https://cokhiviendong.com',
       'https://www.camwhores.in', 'http://www.masajescort.com',
       'https://romanbook.ir', 'https://www.torrentdownloads.me',
       'https://codegood.net', 'https://nullphpscript.com',
       'https://camslib.com', 'https://www.cambro.tv', 'https://fxggxt.com',
       'http://nokuo.blog.jp', 'https://faponic.com',
       'https://www.

In [101]:
# Filter the original DataFrame, keeping only rows where the source_infringing is not in the top 50
filtered_df = filtered_df[~filtered_df['source_infringing'].isin(top_50_infringing)]
filtered_df

Unnamed: 0,id,source_infringing,source_copyrighted
53,29362386,https://aniclube.app,https://www.tv-tokyo.co.jp
54,29362386,https://animesonlinen.com,https://www.tv-tokyo.co.jp
55,29362386,https://goyabu.com,https://www.tv-tokyo.co.jp
56,29362386,https://animesrubro.net,https://www.tv-tokyo.co.jp
57,29362386,https://meusanimes.net,https://www.tv-tokyo.co.jp
...,...,...,...
3635055,35213103,https://rakshiri.com,https://www.shopfktoys.com
3635056,35216480,https://daihongphat.vn,https://www.giaanproperty.vn
3635068,35213242,https://chotistories.com,https://www.banglachotistories.com
3635069,35224211,https://thammylinhanh.vn,https://tamanhhospital.vn


In [102]:
forbidden_words = ['porn',
             'xxx',
             'forum',
             'sex',
             'escort',
             'film',
             'pirate',
             'chaturbate',
             'girl',
             'ringtones',
             'manga',
            'anime',
            'naked',
            '.xyz',
            'football',
            'nude']

In [104]:
import re

# Create a pattern that matches any of the forbidden words
pattern = '|'.join(map(re.escape, forbidden_words))

# Filter the DataFrame, keeping only the rows where the source_infringing does not match the pattern
filtered_df = filtered_df[~filtered_df['source_infringing'].str.contains(pattern, case=False)]
filtered_df = filtered_df[~filtered_df['source_copyrighted'].str.contains(pattern, case=False)]

filtered_df

Unnamed: 0,id,source_infringing,source_copyrighted
53,29362386,https://aniclube.app,https://www.tv-tokyo.co.jp
55,29362386,https://goyabu.com,https://www.tv-tokyo.co.jp
63,29362386,https://www.anitube.site,https://www.tv-tokyo.co.jp
69,29362386,https://aniclube.app,https://www.tv-tokyo.co.jp
76,29362386,https://www.anitube.site,https://www.tv-tokyo.co.jp
...,...,...,...
3635055,35213103,https://rakshiri.com,https://www.shopfktoys.com
3635056,35216480,https://daihongphat.vn,https://www.giaanproperty.vn
3635068,35213242,https://chotistories.com,https://www.banglachotistories.com
3635069,35224211,https://thammylinhanh.vn,https://tamanhhospital.vn


### So far we've done the following:

1. Identified top-50 source_infringing, checked if they were of interest (they were not) and removed them from our filtered_df
2. Identified top-50 source_copyrighted, checked if they were of interest (they were not) and removed them from our filtered_df
3. Removed all rows containing common words that are not of interest (mostly related to porn)

In [105]:
unique_infringing = filtered_df['source_infringing'].unique()
unique_copyrighted = filtered_df['source_copyrighted'].unique()

In [107]:
len(unique_infringing)

63803

In [108]:
len(unique_copyrighted)

10069

In [114]:
# Get the counts for each unique source_infringing value
unique_infringing_counts = filtered_df['source_infringing'].value_counts().reset_index()

# Rename the columns to meaningful names
unique_infringing_counts.columns = ['source_infringing', 'count']

# Save the DataFrame to a CSV file
unique_infringing_counts.to_csv('unique_infringing.csv', index=False)


In [113]:
# Get the counts for each unique source_infringing value
unique_copyrighted_counts = filtered_df['source_copyrighted'].value_counts().reset_index()

# Rename the columns to meaningful names
unique_copyrighted_counts.columns = ['source_copyrighted', 'count']

# Save the DataFrame to a CSV file
unique_copyrighted_counts.to_csv('unique_copyrighted.csv', index=False)

# USING THE GOOGLE SAFETY API TO SCORE URLS

In [128]:
key ='dJWxcic07BVvrZHCp24J'
url = 'https://champion-casino-wds.buzz'

In [116]:
import requests

In [124]:
endpoint = 'https://api.webshrinker.com/categories/v3/'

In [129]:
url = f'{endpoint}{url}{key}'
print(url)

https://api.webshrinker.com/categories/v3/https://champion-casino-wds.buzzdJWxcic07BVvrZHCp24J


In [92]:
top50_copyrighted = df['source_copyrighted'].value_counts().head(50)
top50_copyrighted


https://chaturbate.com               414593
https://nubilesporn.com              295992
https://onlyfans.com                 275612
                                     178743
https://www.lacoste.com              134414
https://themeforest.net               83694
https://yithemes.com                  67275
https://codecanyon.net                52813
https://www.brazzers.com              52747
https://twitter.com                   47806
https://taaghche.com                  44244
https://woocommerce.com               35715
https://www.youtube.com               33011
https://www.dramaclub.one             31652
https://profiles.myfreecams.com       26115
https://mydramalist.com               25756
https://www.imdb.com                  25058
http://onlyfans.com                   24885
http://www.toei-anim.co.jp            23638
https://www.naughtyamerica.com        20943
http://profiles.myfreecams.com        20912
http://www.chicdownload.ir            20874
http://toomics.com              

In [93]:
# Get the top 50 infringing URLs
top_50_infringing = infringing_counts.nlargest(50).index
top_50_infringing

NameError: name 'infringing_counts' is not defined

In [89]:
# Filtering out most common copyrighted (not of interest)
filtered_df = df[df['source_copyrighted'].isin(top50_copyrighted)]
filtered_df.shape

(0, 19)

In [90]:
filtered_df

Unnamed: 0,id,type,title,body,date_sent,date_received,topics,sender_name,principal_name,recipient_name,tags,jurisdictions,action_taken,language,description,infringing_url,copyrighted_url,source_infringing,source_copyrighted


In [2]:
# Extracting all infringing_urls

urls = []

for item in data:
    if 'dmca' in item:
        if 'works' in item['dmca']:
            for url in item['dmca']['works'][0]['infringing_urls']:
                url = url['url']
                urls.append(url)

In [4]:
len(urls)

2921645

I probably don't want to extract text from almost three million urls. How can I do this?

In [5]:
urls

['https://pvtboutique.com/product/minecraft-dnf-council-shirt/',
 'https://hottrendclothing.com/product/minecraft-dnf-council-shirt/',
 'https://t-shirtat.com/shop/minecraft-dnf-council-t-shirt/',
 'https://t-shirtat.com/shop/dnf-council-shirt-hoodie-sweater-and-tank-top/',
 'https://uspremiumgift.com/product/minecraft-dnf-council-funny-t-shirt/',
 'https://t-shirtat.com/shop/dnf-council-minecraft-shirt/',
 'https://usateelowprice.com/product/dnf-council-minecraft-shirt/',
 'https://phanavatees.com/product/nice-meriteri03-minecraft-dnf-council-shirt/',
 'https://hostingrockett-shirt.com/product/dnf-council-shirt/',
 'https://aniviashirt.com/product/meriteri03-minecraft-dnf-council-shirt/',
 'https://duancanhovinhomesgoldenriverbason.com/us-premium-gift-store-minecraft-dnf-council-funny-t-shirt/',
 'https://florakikyo.com.vn/fashion-news/top-trending-official-mud-slut-hitting-every-hole-2022-shirt/',
 'https://qiyanashirt.com/2022/09/15/almashirt-timthetatman-season-opener-helmet-shirt/

Extract unique source domain first.

In [6]:
from urllib.parse import urlparse

def get_base_url(url):
    return urlparse(url)._replace(path="", params="", query="", fragment="").geturl()

# Using a set to store unique base URLs
unique_base_urls = set(get_base_url(url) for url in urls)

# Converting the set to a list if needed
unique_base_urls = list(unique_base_urls)

unique_base_urls

['',
 'https://55hanman.org',
 'https://rintor.org',
 'https://w.laroza.one',
 'https://megatronicsconsulting.com',
 'https://numije.byninoune.fr',
 'https://sexloanluan1.com',
 'https://bdmusic23.lat',
 'http://serv2810.piratbit.fun',
 'https://www.hdfilmcanavari.org',
 'https://www.qijishow.com',
 'https://rykece.superfitonline.sk',
 'http://www.kcd.com.tr',
 'https://www.ciyuan03.com',
 'https://m.caimoge.net',
 'https://betongthanhduy.com',
 'https://pet.idkuu.com',
 'https://www115.newsofturkiye.com',
 'https://letscometoplay.com',
 'https://torrent.game4you.top',
 'https://www.av01.tv',
 'https://tkr183.com',
 'https://motphim3z.com',
 'https://nodumejy.ivexlibrary.sk',
 'https://kinozal-cam.kinoteatr.live',
 'https://enjoy-series.co',
 'https://www.bingoporno.com',
 'https://proua.org',
 'https://coracuj.point-pizza.fr',
 'https://goqinyv.floatingbb.sk',
 'https://mcracked.com',
 'https://behtaraneh.ir',
 'https://www.ipsacademy.org',
 'https://escortregio.com',
 'https://x1080h

In [7]:
len(unique_base_urls)

85231

85,231 urls. Maybe we can categorize them all?

In [8]:
import csv

# Open a new CSV file for writing
with open('source_urls.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)

    # Write the header
    writer.writerow(['source_url'])

    # Write the unique source URLs
    for url in unique_base_urls:
        writer.writerow([url])

Uploaded to : https://docs.google.com/spreadsheets/d/1FwrM80v0lqw1RtyXxU5deSRdxkFUGvFvH-hTzzaIwDc/edit?usp=sharing

In [9]:
import pandas as pd

# Create a DataFrame with the unique source URLs and an empty 'include' column
df = pd.DataFrame(list(unique_base_urls), columns=['source_url'])
df['include'] = ''

# Drop empty rows
df = df[df['source_url'] != '']

# Print the DataFrame to verify the content
df

Unnamed: 0,source_url,include
1,https://55hanman.org,
2,https://rintor.org,
3,https://w.laroza.one,
4,https://megatronicsconsulting.com,
5,https://numije.byninoune.fr,
...,...,...
85226,https://arthursaguezartiste.fr,
85227,https://xemomapaq.agitated.dev,
85228,https://gr2.sportplus.live,
85229,https://vfxlrnk.fizjo-line.pl,


Maybe we can decide a set of words that we can all agree that should not be included.

In [10]:
# Set 'include' to False if 'source_url' contains any of the specified words
for word in ['porn',
             'xxx',
             'forum',
             'sex',
             'escort',
             'film',
             'pirate',
             'chaturbate',
             'girl',
             'ringtones',
             'manga',
            'naked',
            '.xyz',
            'football',
            'nude']:
    df.loc[df['source_url'].str.contains(word, case=False), 'include'] = False

df[df['include'] == False]

Unnamed: 0,source_url,include
6,https://sexloanluan1.com,False
9,https://www.hdfilmcanavari.org,False
26,https://www.bingoporno.com,False
33,https://escortregio.com,False
38,https://filmfaa.top,False
...,...,...
85188,https://porn300.top,False
85189,https://www.bayanescortistanbul.com,False
85197,https://livepornchat.webcam,False
85206,https://kino.gudfilm.org,False


In [11]:
df[df['include'] == ""].sample(20)

Unnamed: 0,source_url,include
23262,https://pussy-hub.com,
65841,https://bumavizy.lasertagx.si,
14610,https://kinovod392.cc,
20920,https://themerecords.com,
26914,http://oklive6-3.vkuser.net,
80115,https://v1.simp3s.biz,
80313,https://poscitech.com,
81103,https://a1465.ordo-wratislawitz.cz,
67919,https://www.jacketmakers.com,
49183,https://foxhunters.net,


## COPYRIGHTED URLS

In [13]:
# Extracting all copyrighted_urls

urls = []

for item in data:
    if 'dmca' in item:
        if 'works' in item['dmca']:
            for url in item['dmca']['works'][0]['copyrighted_urls']:
                url = url['url']
                urls.append(url)
                
# Using a set to store unique base URLs
unique_base_urls_copyright = set(get_base_url(url) for url in urls)

# Converting the set to a list if needed
unique_base_urls_copyright = list(unique_base_urls)

unique_base_urls_copyright

['',
 'https://55hanman.org',
 'https://rintor.org',
 'https://w.laroza.one',
 'https://megatronicsconsulting.com',
 'https://numije.byninoune.fr',
 'https://sexloanluan1.com',
 'https://bdmusic23.lat',
 'http://serv2810.piratbit.fun',
 'https://www.hdfilmcanavari.org',
 'https://www.qijishow.com',
 'https://rykece.superfitonline.sk',
 'http://www.kcd.com.tr',
 'https://www.ciyuan03.com',
 'https://m.caimoge.net',
 'https://betongthanhduy.com',
 'https://pet.idkuu.com',
 'https://www115.newsofturkiye.com',
 'https://letscometoplay.com',
 'https://torrent.game4you.top',
 'https://www.av01.tv',
 'https://tkr183.com',
 'https://motphim3z.com',
 'https://nodumejy.ivexlibrary.sk',
 'https://kinozal-cam.kinoteatr.live',
 'https://enjoy-series.co',
 'https://www.bingoporno.com',
 'https://proua.org',
 'https://coracuj.point-pizza.fr',
 'https://goqinyv.floatingbb.sk',
 'https://mcracked.com',
 'https://behtaraneh.ir',
 'https://www.ipsacademy.org',
 'https://escortregio.com',
 'https://x1080h

In [17]:
len(unique_base_urls_copyright)

85231

In [19]:
# Create a DataFrame with the unique source URLs and an empty 'include' column
df_copyright = pd.DataFrame(list(unique_base_urls_copyright), columns=['source_url_copyright'])
df_copyright['include'] = ''

# Drop empty rows
df_copyright = df_copyright[df_copyright['source_url_copyright'] != '']

# Print the DataFrame to verify the content
df_copyright

Unnamed: 0,source_url_copyright,include
1,https://55hanman.org,
2,https://rintor.org,
3,https://w.laroza.one,
4,https://megatronicsconsulting.com,
5,https://numije.byninoune.fr,
...,...,...
85226,https://arthursaguezartiste.fr,
85227,https://xemomapaq.agitated.dev,
85228,https://gr2.sportplus.live,
85229,https://vfxlrnk.fizjo-line.pl,


In [23]:
# Set 'include' to False if 'source_url' contains any of the specified words
for word in ['tumblr',
            'blogspot',
            'wordpress',
            'issuu',
            'over-blog',
            'livejournal']:
    
    df_copyright.loc[df_copyright['source_url_copyright'].str.contains(word, case=False), 'include'] = True

df_copyright[df_copyright['include'] == True]

Unnamed: 0,source_url_copyright,include
218,https://crazymovieupdates.blogspot.com,True
223,https://lgbrpcv.wordpress.com,True
549,https://sebastienaurillaclivrepdfgratuit.blogs...,True
701,https://crackwordpress.com,True
826,http://the-hot-test.blogspot.com,True
...,...,...
84248,https://melaniemartinezdz.blogspot.com,True
84282,https://flashstreamsnew.blogspot.com,True
84732,https://blogspot.com,True
84913,https://www.wordpressthemesall.com,True


In [31]:
df_copyright[df_copyright['include'] == ""].sample(20)

Unnamed: 0,source_url_copyright,include
33632,https://fanserials-19.kinoteatr.life,
43311,https://www.foxdisco.info,
66043,https://www.africatopsports.com,
48437,https://prod.privvbuy.site,
28162,https://uptomega.me,
65387,https://sc.arabseed.news,
31464,http://beta.ibethel.tv,
61,https://pzzej.datlngllfe.net,
77923,https://goforporn.com,
39726,http://livetv562.me,
