## Python basics
<br>

In [1]:
# the basics: variable assignment
# (adjacent string literals are joined into one single string)
corpus = (
    "This is a very tiny corpus. "
    "But at least this tiny corpus has a second sentence."
)

In [2]:
# check the content of a variable: print() writes it to the cell's output
print(corpus)

This is a very tiny corpus. But at least this tiny corpus has a second sentence.


In [3]:
# in Jupyter notebooks a bare expression is enough to display a value -
# as long as it is the last line of the cell
# (the value is shown as its repr, hence the quotes around the string)
corpus

'This is a very tiny corpus. But at least this tiny corpus has a second sentence.'

In [4]:
# type() reports the data type of any Python object
# (everything is an object in Python)
type(corpus)

str

In [5]:
# objects / data types have their own methods:
# str.split() without arguments splits a string on whitespace
tokens = corpus.split()
tokens

['This',
 'is',
 'a',
 'very',
 'tiny',
 'corpus.',
 'But',
 'at',
 'least',
 'this',
 'tiny',
 'corpus',
 'has',
 'a',
 'second',
 'sentence.']

In [6]:
# split() returns a list (as indicated by the square brackets in the output)
type(tokens)

list

In [7]:
# sequence data types (list, str, ...) support selection and operations
# based on index positions; indexing is zero-based, so 1 is the second item
tokens[1]

'is'

In [8]:
# operations can be nested and/or combined:
# a list literal holding the first token and the type of that token
[tokens[0], type(tokens[0])]

['This', str]

In [9]:
# strings are sequences too: indexing yields a single character
corpus[0]

'T'

In [10]:
# use the "replace" method to improve our tokenization on whitespace:
# put a space in front of every full stop so that split() separates it.
# Removing existing padding first keeps this cell idempotent - running it
# again does not insert another space (which would change len(corpus)).
corpus = corpus.replace(" .", ".").replace(".", " .")
corpus

'This is a very tiny corpus . But at least this tiny corpus has a second sentence .'

In [11]:
# now the punctuation gets split off correctly
tokens = corpus.split()
tokens

['This',
 'is',
 'a',
 'very',
 'tiny',
 'corpus',
 '.',
 'But',
 'at',
 'least',
 'this',
 'tiny',
 'corpus',
 'has',
 'a',
 'second',
 'sentence',
 '.']

In [12]:
# the built-in function len() returns the number of items in a container
# (here: the number of tokens in our list)
token_count = len(tokens)
token_count

18

In [13]:
# token_count is an integer (int) object
type(token_count)

int

In [14]:
# len() works on strings as well and counts every character, including
# spaces and punctuation;
# now we know that our corpus consists of 18 tokens and 82 characters
char_count = len(corpus)
char_count

82

In [15]:
# a for-loop visits every item of a list (or of any other iterable) in turn
for token in tokens:
    print(f"token:\t{token}") # f-strings embed values directly in a string
    #print("token:\t" + token) # plain string concatenation does the trick as well

token:	This
token:	is
token:	a
token:	very
token:	tiny
token:	corpus
token:	.
token:	But
token:	at
token:	least
token:	this
token:	tiny
token:	corpus
token:	has
token:	a
token:	second
token:	sentence
token:	.


In [16]:
# let's get the sentences of our corpus: split(".") cuts at every full stop;
# the full stops themselves are dropped and the text after the last one
# ends up as an empty string
sentences = corpus.split(".")
sentences

['This is a very tiny corpus ',
 ' But at least this tiny corpus has a second sentence ',
 '']

In [17]:
# we have two tasks: get rid of the empty string and
# append the punctuation back to the sentences
sentences_stripped = []
for sentence in sentences:
    # strip() removes leading and trailing whitespace
    sentence = sentence.strip()
    # with conditions we can check if a statement is true or false
    # ("!=" means "is not equal to")
    if sentence != "":
        sentence += " ."
        sentences_stripped.append(sentence)

sentences_stripped

['This is a very tiny corpus .',
 'But at least this tiny corpus has a second sentence .']

In [18]:
# dicts provide a mapping between keys and values;
# a pair of empty curly braces creates an empty dict
token_lengths = {}
type(token_lengths)

dict

In [19]:
# we can assign values to keys with dict_variable[key] = value;
# keys are unique, so a repeated token simply overwrites its earlier entry
for token in tokens:
    token_lengths[token] = len(token)

token_lengths

{'This': 4,
 'is': 2,
 'a': 1,
 'very': 4,
 'tiny': 4,
 'corpus': 6,
 '.': 1,
 'But': 3,
 'at': 2,
 'least': 5,
 'this': 4,
 'has': 3,
 'second': 6,
 'sentence': 8}

In [20]:
# let's write a simple counter for the token frequencies of our corpus:
# a dict that maps every token to the number of times we have seen it
counter = {}
for token in tokens:
    # check if we have already seen this token ("in" tests the dict's keys)
    if token in counter:
        counter[token] += 1
    # if not (we see this token for the first time):
    else:
        counter[token] = 1

counter

{'This': 1,
 'is': 1,
 'a': 2,
 'very': 1,
 'tiny': 2,
 'corpus': 2,
 '.': 2,
 'But': 1,
 'at': 1,
 'least': 1,
 'this': 1,
 'has': 1,
 'second': 1,
 'sentence': 1}

In [21]:
# often programming tasks can be solved in more than one way:
# here every unseen token starts at 0, so one increment covers both cases
counter = {}
for token in tokens:
    if token not in counter:
        counter[token] = 0
    counter[token] += 1

counter

{'This': 1,
 'is': 1,
 'a': 2,
 'very': 1,
 'tiny': 2,
 'corpus': 2,
 '.': 2,
 'But': 1,
 'at': 1,
 'least': 1,
 'this': 1,
 'has': 1,
 'second': 1,
 'sentence': 1}

In [22]:
# for many problems code already exists, which can be imported
# from built-in or external Python modules
from collections import Counter

In [23]:
# well, that's shorter... Counter tallies the items of an iterable in one call
Counter(tokens)

Counter({'This': 1,
         'is': 1,
         'a': 2,
         'very': 1,
         'tiny': 2,
         'corpus': 2,
         '.': 2,
         'But': 1,
         'at': 1,
         'least': 1,
         'this': 1,
         'has': 1,
         'second': 1,
         'sentence': 1})

In [24]:
# the result is a Counter object (a dict subclass from the collections module)
type(Counter(tokens))

collections.Counter

In [25]:
# we can use the most_common() method of Counter
# to sort our token frequencies in descending order;
# it returns a list of (token, count) tuples, and tokens with equal
# counts keep the order in which they were first seen
Counter(tokens).most_common()

[('a', 2),
 ('tiny', 2),
 ('corpus', 2),
 ('.', 2),
 ('This', 1),
 ('is', 1),
 ('very', 1),
 ('But', 1),
 ('at', 1),
 ('least', 1),
 ('this', 1),
 ('has', 1),
 ('second', 1),
 ('sentence', 1)]