## Python basics
<br>

In [None]:
# the basics: assigning a value (here a string) to a variable
corpus = "This is a very tiny corpus. But at least this tiny corpus has a second sentence."

In [None]:
# check the content of a variable with the built-in print() function
print(corpus)

In [None]:
# in Jupyter notebooks, naming a variable is enough to display it -
# as long as it is the last line of a cell
corpus

In [None]:
# type() reports the data type of any Python object (everything is an object in Python)
type(corpus)

In [None]:
# objects / data types have their own methods:
# split() without arguments splits a string on whitespace
tokens = corpus.split()
tokens

In [None]:
# split() returns a list (as indicated by the square brackets in its output)
type(tokens)

In [None]:
# sequence data types (such as lists) support selection and operations based on
# index positions; counting starts at 0, so index 1 selects the second item
tokens[1]

In [None]:
# expressions can be nested and combined: here a list literal holds
# the first token together with its data type
[tokens[0], type(tokens[0])]

In [None]:
# strings are sequences too: index 0 selects the first character
corpus[0]

In [None]:
# use the "replace" method to put a space in front of every full stop,
# which improves our tokenization on whitespace
corpus = corpus.replace(".", " .")
corpus

In [None]:
# now the punctuation gets split off correctly
tokens = corpus.split()
tokens

In [None]:
# the built-in function len() returns the number of items in a list (or another sized container)
token_count = len(tokens)
token_count

In [None]:
# token_count, the value returned by len(), is an integer object
type(token_count)

In [None]:
# len() works on strings as well, where it counts characters:
# now we know that our corpus consists of 18 tokens and 82 characters
char_count = len(corpus)
char_count

In [None]:
# with a for-loop we can access every item in a list or in other iterable data types
for token in tokens:
    print("token:\t" + token) # strings can be concatenated with +
    #print(f"token:\t{token}") # alternative: the f-string syntax produces the same output

In [None]:
# let's get the sentences of our corpus by splitting on the full stop
# (the separator itself is not part of the resulting strings)
sentences = corpus.split(".")
sentences

In [None]:
# two things are left to do: drop the empty string and
# put the punctuation back at the end of each sentence
sentences_stripped = []
for sentence in sentences:
    sentence = sentence.strip()
    # with conditions we can check if a statement is true or false:
    # an empty string is skipped, everything else is kept
    if sentence == "":
        continue
    sentence += " ."
    sentences_stripped.append(sentence)

sentences_stripped

In [None]:
# dicts provide a mapping between keys and values; {} creates an empty dict
token_lengths = {}
type(token_lengths)

In [None]:
# we can assign values to keys with dict_variable[key] = value;
# a token that occurs more than once simply overwrites its own entry
for token in tokens:
    token_lengths[token] = len(token)

token_lengths

In [None]:
# let's write a simple counter for the token frequencies of our corpus:
# the keys are the tokens, the values count how often each token occurred
counter = {}
for token in tokens:
    # check if we have already seen this token
    if token in counter:
        counter[token] += 1
    # if not (we see this token for the first time):
    else:
        counter[token] = 1

counter

In [None]:
# often programming tasks can be solved in more than one way:
# here an unseen token first gets a count of 0, then every token is counted
counter = {}
for token in tokens:
    if token not in counter:
        counter[token] = 0
    counter[token] += 1

counter

In [None]:
# for many problems code already exists, which can be imported
# from built-in or external Python modules
from collections import Counter

In [None]:
# well, that's shorter: Counter does the counting for us
Counter(tokens)

In [None]:
# check the data type of the object that Counter() returns
type(Counter(tokens))

In [None]:
# we can use the most_common() method of Counter
# to sort our token frequencies (most frequent tokens first)
Counter(tokens).most_common()

<br><br>
## spaCy
<br>

In [None]:
# first we have to import the previously installed library
import spacy

In [None]:
# let's create a tiny corpus for spaCy: a short text with names, places and a quotation
spacy_corpus = "Mary Lou McDonald grew up in a republican household in Dublin to a backdrop of the Troubles in Northern Ireland. \"My family's connections with the IRA would have been in the 1920s,\" she says."

In [None]:
# before spaCy can annotate anything, we have to load a trained statistical model
nlp = spacy.load("en_core_web_sm")

In [None]:
# calling nlp() runs the loaded model's annotation pipeline on our example corpus;
# 'doc' now holds the annotated spaCy data object
doc = nlp(spacy_corpus)

In [None]:
# count the tokens of our corpus document: len() works on spaCy docs too
len(doc)

In [None]:
# spaCy docs are composed of Token objects, which can be selected by index
print(type(doc[0]))
doc[0]

In [None]:
# Token objects have different attributes containing their linguistic annotations
for token in doc:
    print(token.text)  # the token itself
    print(token.pos_)  # part of speech
    # NOTE(review): pos_ is printed a second time on the next line - possibly a leftover
    print(token.pos_, token.tag_)  # part of speech next to the tag annotation
    print(token.lemma_)  # lemma
    print(token.morph)  # morphological annotation
    print(token.ent_iob_, token.ent_type_, "\n")  # entity annotation, followed by an empty line as separator

In [None]:
# ask spaCy for an explanation of the annotation label "NORP"
spacy.explain("NORP")

In [None]:
# doc.sents lets us iterate over the sentences of the document
for sentence in doc.sents:
    print(sentence)

In [None]:
# doc.ents lets us iterate over the entities annotated in the document
for entity in doc.ents:
    print(entity)

In [None]:
# check the data type of a single entity
type(doc.ents[0])

In [None]:
# build a vertical ("one token per line") representation of the document:
# every token line holds tab-separated annotations, and sentences and
# entities are wrapped in <s> and <entity> elements
vrt = "<text>\n"
for sentence in doc.sents:
    vrt += "<s>\n"
    entity = False  # True while an <entity> element is open
    for token in sentence:
        # a token that does not continue the entity (IOB code "I") ends it
        if entity and token.ent_iob_ != "I":
            entity = False
            vrt += "</entity>\n"
        # "B" marks the first token of an entity - this also covers
        # an entity that starts directly after another one
        if token.ent_iob_ == "B":
            entity = True
            vrt += f"<entity type=\"{token.ent_type_}\">\n"
        vrt += "\t".join((token.text, token.pos_, token.lemma_, str(token.morph), token.ent_type_, token.ent_iob_)) + "\n"
    # close an entity that runs up to the last token of the sentence
    if entity:
        vrt += "</entity>\n"
    vrt += "</s>\n"
vrt += "</text>\n"

print(vrt)

In [None]:
# a reduced version of the vertical format: only the token texts,
# one per line, with every sentence wrapped in a <sentence> element
vrt_lines = ["<text>"]
for sentence in doc.sents:
    vrt_lines.append("<sentence>")
    vrt_lines.extend(token.text for token in sentence)
    vrt_lines.append("</sentence>")
vrt_lines.append("</text>")
vrt = "\n".join(vrt_lines) + "\n"

print(vrt)

In [None]:
# a hand-written mini example: the three tokens of a name,
# one per line, wrapped in a <person> element
vrt = "\n".join(("<person>", "Mary", "Lou", "McDonald", "</person>"))
print(vrt)

<br><br>
## data formats
<br>

In [None]:
import json
import csv

In [None]:
# read a JSON Lines file: every non-empty line holds one tweet as a JSON object
tweets = []

with open("tweets.jsonl", encoding="utf8") as jsonl:
    for line in jsonl:
        # skip blank lines (e.g. an empty line at the end of the file),
        # because json.loads() raises an error on them
        if line.strip():
            tweets.append(json.loads(line))

In [None]:
# export the tweets as a semicolon-separated CSV file, one row per tweet;
# nested fields (metrics, referenced tweets) are flattened into plain columns
with open("tweets.csv", "w", encoding="utf8", newline="") as csvfile:
    fieldnames = [
        "author_id",
        "conversation_id",
        "created_at",
        "id",
        "in_reply_to_user_id",
        "like_count",
        "quote_count",
        "reply_count",
        "retweet_count",
        "referenced_tweet_ids",
        "reference_types",
        "text",
    ]
    writer = csv.DictWriter(
        csvfile,
        dialect="excel",
        delimiter=";",
        fieldnames=fieldnames,
        extrasaction="ignore",
    )
    writer.writeheader()

    for tweet in tweets:
        # optional parts of a tweet: fall back to an empty container when missing
        metrics = tweet.get("public_metrics", {})
        references = tweet.get("referenced_tweets", [])
        row = {
            "author_id": tweet["author_id"],
            "conversation_id": tweet["conversation_id"],
            "created_at": tweet["created_at"],
            "id": tweet["id"],
            "in_reply_to_user_id": tweet.get("in_reply_to_user_id"),
            "like_count": metrics.get("like_count"),
            "quote_count": metrics.get("quote_count"),
            "reply_count": metrics.get("reply_count"),
            "retweet_count": metrics.get("retweet_count"),
            # several referenced tweets are joined into one comma-separated cell
            "referenced_tweet_ids": ", ".join(ref.get("id") for ref in references),
            "reference_types": ", ".join(ref.get("type") for ref in references),
            # newlines in the tweet text are replaced with spaces
            "text": tweet["text"].replace("\n", " "),
        }
        writer.writerow(row)

<br><br>
## trafilatura
<br>

In [None]:
import trafilatura

In [None]:
# fetch a single web page with trafilatura
url = "https://europa.eu/citizens-initiative-forum/blog/europeans-safe-connections-call-stronger-regulation-wireless-internet-schools_en"
downloaded = trafilatura.fetch_url(url)

In [None]:
# run the extraction on the downloaded page with the default settings and display the result
result = trafilatura.extract(downloaded)
result

In [None]:
# the same extraction with XML output and the source URL passed along;
# the commented-out keyword arguments are further options that can be switched on
result = trafilatura.extract(
    downloaded,
    output_format="xml",
    url=url,
    #include_comments=True,
    #include_formatting=True,
    #include_links=True,
    #include_images=True,
    #include_tables=True,
    #favor_precision=True,
    #favor_recall=True,
    #target_language="en"
)

In [None]:
# print the XML result as it was returned
print(result)

In [None]:
from xml.dom import minidom

In [None]:
# parse the XML string with minidom and print it again, pretty-printed with four spaces of indentation
print(minidom.parseString(result).toprettyxml(indent="    "))

In [None]:
# inject a CSS rule so that preformatted cell output keeps its lines unwrapped;
# display and HTML come from the public IPython.display module, because
# importing display from IPython.core.display is deprecated
from IPython.display import display, HTML
display(HTML("<style>div.output_area pre {white-space: pre;}</style>"))

In [None]:
# the extraction result can also be requested in CSV format
print(trafilatura.extract(downloaded, output_format="csv"))

In [None]:
# ... or in JSON format
print(trafilatura.extract(downloaded, output_format="json"))

In [None]:
from trafilatura.spider import focused_crawler

In [None]:
# the page at which the crawl starts
homepage = "https://europa.eu/citizens-initiative-forum/blog_en"

In [None]:
# starting a crawl at the homepage; the call returns two collections of URLs,
# stored as to_visit and known_urls
# (max_seen_urls / max_known_urls presumably limit the size of the crawl - see the trafilatura docs)
to_visit, known_urls = focused_crawler(homepage, max_seen_urls=10, max_known_urls=100000)

In [None]:
# display the to_visit collection as a list
list(to_visit)

In [None]:
# display the known URLs in sorted order
sorted(known_urls)

In [None]:
# resuming a crawl: the to_visit and known_urls collections from the previous call
# are handed back in via the todo and known_links arguments
to_visit, known_urls = focused_crawler(homepage, max_seen_urls=10, max_known_urls=100000, todo=to_visit, known_links=known_urls)

In [None]:
# filter the crawl links: keep only the URLs below the blog path and sort them
blog_prefix = "https://europa.eu/citizens-initiative-forum/blog/"
sorted(link for link in known_urls if link.startswith(blog_prefix))