# Basic Data Types and Operators

### Comments, Variables, and `print()`

In [2]:
# This is a comment as it starts with #

'''
This is a 
multiple-line 
comment
'''

"""
This is another 
multiple-line 
comment
"""

# Python is a strongly-typed language, so the type matters when performing operations.
# On the other hand, Python is a dynamically-typed language, so the variable type
# is determined based on the data it holds in the runtime

# python has a function called "print()" that prints variables based on their types

# Integer variable
x = 3
print(x)

# Float variable
y = 3.5
print(y)

# print() can print multiple objects, and it takes a parameter called "sep" that is by default a space
print(x, y)

# in the next line, we set the "sep" parameter as a new line "\n"
print(x, y, sep="\n")


3
3.5
3 3.5
3
3.5


### Function `type()`
you can use `type()` function to determine the type of the variable

In [3]:
print(type(x), type(y))

<class 'int'> <class 'float'>


### Strings

In [4]:
# strings - both single or double quotes can be used to define strings
z = "String with double quotes"
w = 'String with single quotes'
a = "String with 'single quotes' in it"
b = 'String with "double quotes" in it'
print(z, w, a, b, sep='\n')
print (type(z))

String with double quotes
String with single quotes
String with 'single quotes' in it
String with "double quotes" in it
<class 'str'>


### Casting

In [5]:
x = str(3)
y = int("3")
z = float(3)
print(x, y, z, sep='\n')

3
3
3.0


### Boolean Variables

In [6]:
a = True
b = False
print(a, b, int(a), int(b))
print(type(a))

True False 1 0
<class 'bool'>


### Lists

In [7]:
# a list can contain items of different types
l = [1, "NLP", 2.3]
print('This is the original list:                               ', l)

# add item to list
l.append("4")
print('This is the list after adding "4" to it:                 ', l)

# extend list with another list
l.extend([5, 6])
print('This is the list after extending it with [5, 6]:         ', l)

# insert item to list at a specific index
l.insert(2, 0)
print('This is the list after inserting "0" at index "2":       ', l)

# remove item from list
l.remove("NLP")
print('This is the list after removing "NLP" from it:             ', l)

# remove item from list by index
d=l.pop(3)
print(f"The list after removing the item of index 3, which is {d}: {l}")

# sort list
l.sort()
print('This is the list after sorting it:                       ', l)

# reverse list
l.reverse()
print('This is the list after reversing it:                     ', l)


# get count of items in a list
print('number of items in the list:                             ', len(l))

# access items in a list (zero-indexed)
print('first item in the list:                                  ', l[0])

# access the last item in a list
print('last item in the list:                                   ', l[-1])

# access a slice from the list
print('from item of index "1" to item of index "2":             ', l[1:3]) 
print('from the first item to item of index "1":                ', l[:2])
print('from item of index "1" to the last item:                 ', l[1:])

# set item in a list
l[1] = 2

print('the list after changing the second item from "NLP" to 2: ', l)

# remove item from list
del l[0]

print('the list after removing the first item:                  ', l)


This is the original list:                                [1, 'NLP', 2.3]
This is the list after adding "4" to it:                  [1, 'NLP', 2.3, '4']
This is the list after extending it with [5, 6]:          [1, 'NLP', 2.3, '4', 5, 6]
This is the list after inserting "0" at index "2":        [1, 'NLP', 0, 2.3, '4', 5, 6]
This is the list after removing "NLP" from it:              [1, 0, 2.3, '4', 5, 6]
The list after removing the item of index 3, which is 4: [1, 0, 2.3, 5, 6]
This is the list after sorting it:                        [0, 1, 2.3, 5, 6]
This is the list after reversing it:                      [6, 5, 2.3, 1, 0]
number of items in the list:                              5
first item in the list:                                   6
last item in the list:                                    0
from item of index "1" to item of index "2":              [5, 2.3]
from the first item to item of index "1":                 [6, 5]
from item of index "1" to the last item:            

### Dictionaries
Dictionaries store data as (`key`, `value`) pairs of arbitrary types.

In [8]:
d = {   
        "1": 1, 
        "2": 2, 
        3: "3", 
        4.5:"4.5"
    }
print('item with a string key:              ', d["1"])
print('item with an int key:                ', d[3])
print('item with a float key:               ', d[4.5])

# get value of a specific key
print('value of key 4.5:                    ', d.get(4.5))

# add item to the dictionary
d[5] = "5"
print('dictionary after adding an item:     ', d)

# update item in the dictionary
d[5] = "five"
print('dictionary after updating an item:   ', d)

# print all keys in the dictionary
print('keys in the dictionary:              ', d.keys())

# print all values in the dictionary
print('values in the dictionary:            ', d.values())

# print all items in the dictionary
print('items in the dictionary:             ', d.items())

# pop item from the dictionary
item=d.pop('1')
print(f"dictionary after popping the item with key '1', which is {item}: {d}")


# delete item with a specific key
del d["2"]
print('dictionary after deleting an item:   ', d)


item with a string key:               1
item with an int key:                 3
item with a float key:                4.5
value of key 4.5:                     4.5
dictionary after adding an item:      {'1': 1, '2': 2, 3: '3', 4.5: '4.5', 5: '5'}
dictionary after updating an item:    {'1': 1, '2': 2, 3: '3', 4.5: '4.5', 5: 'five'}
keys in the dictionary:               dict_keys(['1', '2', 3, 4.5, 5])
values in the dictionary:             dict_values([1, 2, '3', '4.5', 'five'])
items in the dictionary:              dict_items([('1', 1), ('2', 2), (3, '3'), (4.5, '4.5'), (5, 'five')])
dictionary after popping the item with key '1', which is 1: {'2': 2, 3: '3', 4.5: '4.5', 5: 'five'}
dictionary after deleting an item:    {3: '3', 4.5: '4.5', 5: 'five'}


### Tuples
Tuples are `immutable` objects, lists are `mutable`.

Tuples cannot be changed while lists can.

In [9]:
t = (1, 2.5, "3")
print('original tuple:  ', t)
print('first item       ', t[0])
print()

"""
Tuple is immutable, although you can use the + operator to concatenate several tuples.
 The old object is still present at this point, and a new object is created.
"""
t = t + (4,)
print('after adding 4:  ', t)

# tuples can be a key of a dict
d = {t: "tuple"}
print(d[t])
print(d)

original tuple:   (1, 2.5, '3')
first item        1

after adding 4:   (1, 2.5, '3', 4)
tuple
{(1, 2.5, '3', 4): 'tuple'}


In [10]:
# this will get an error (recall: tuple are immutable)
# tuples are immutable so you can't change them
t[0]=2

TypeError: 'tuple' object does not support item assignment

### Sets
- `Unchangeable`: same as tuples
- `unindexed`: we cannot access a specific index
- `unique values only`: no duplicate values

In [11]:
s = {"a", "b", "c"}
print(s)

# add item to the set
s.add(5)
# remove item from the set
s.remove("a")
print(s)

# try adding duplicate items
s = {5, "b", "c", "b"}
print(s)

{'a', 'c', 'b'}
{5, 'c', 'b'}
{5, 'c', 'b'}


In [12]:
# this will get an error (recall: sets not subscriptable)
s[0]

TypeError: 'set' object is not subscriptable

### Conversions

In [13]:
# convert a string to a list of characters
l=list("abc def")

print(l)

['a', 'b', 'c', ' ', 'd', 'e', 'f']


In [14]:
s= set(["one","two","one"])
print(s)

{'one', 'two'}


In [16]:
w="words with spaces".split() # split the string into a list of words
print(w)

# split the string on a specific character (,)
n="1,4,8,2".split(",")
print(n)

['words', 'with', 'spaces']
['1', '4', '8', '2']


In [17]:
# join the list of words into a string
print(' '.join(w) )

# join the list of words into a string with a comma between them
print(','.join(n))

words with spaces
1,4,8,2


### Arithmetic Operators

In [18]:
x = 2
y = 3
print(x + y)
print(x - y)
print(y / x)
print(y // x)
print(x * y)
print(y % x)
print(x**y)

5
-1
1.5
1
6
1
8


### Assignment operators

In [19]:
x = 2
print(x)
x += 1
print(x)
x -= 1
print(x)
x /= 2
print(x)
x *= 3
print(x)
x //= 2
print(x)

2
3
2
1.0
3.0
1.0


### Comparison operators

In [20]:
x = 3
y = 2
print(x == y)
print(x != y)
print(x >= y)
print(x <= y)
print(x > y)
print(x < y)

False
True
True
False
True
False


### Logical Operators

In [21]:
x = True
y = False
print(x and y)
print(x or y)
print(not y)

False
True
True


# Flow Control
`Blocks` in `Python` are structured using `indentation`

### `if`-`elif`-`else`

In [22]:
x = 2
y = 3
if x > y:
    # here indentation is important
    print ("x > y")
elif x < y:
    print("x < y")
else:
    print("x = y")

x < y


### `while` loops

In [23]:
# while loops
i = 0
while i < 10:
    print(i)
    i += 1

0
1
2
3
4
5
6
7
8
9


### `for` loops
for is used to iterate over sequences like lists, dictionaries, sets, strings, tuples, ...


In [24]:
l = ["1", "2", 3, 4, 5.3]
for item in l:
    print(item)
    
print()

# range function
print(list(range(2, 10)))
print(list(range(10)))
print(list(range(2, 10, 2)))
print()

for i in range(2, 10):
    print(i)
    
print()

s = "NLP"
for c in s:
    print(c)

1
2
3
4
5.3

[2, 3, 4, 5, 6, 7, 8, 9]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[2, 4, 6, 8]

2
3
4
5
6
7
8
9

N
L
P


# Functions

In [25]:
# function with no arguments
def func():
    print("in func")

In [26]:
func()

in func


In [27]:
# function with arguments
def func(a):
    print(a)

In [28]:
func(3)
func([1, 2])

3
[1, 2]


In [29]:
# function that makes some logic and returns a value
def func(a):
    return a + 1

In [30]:
f = func(3)
print(f)

4


In [31]:
# function that takes a variable number of arguments
def func(*args):
    for arg in args:
        print(arg)

In [32]:
func(1, 2, "NLP")

1
2
NLP


In [33]:
# function with 3 arguments
def func(a, b, c):
    print(a, b, c)

In [35]:
func(1, 2, 3)
func(b=2, a=1, c=3)

1 2 3
1 2 3


In [36]:
# a function with an argument that has a default value
def func(a,b=1):
    print(a,b)

In [37]:
func(2,5)
func(2)

2 5
2 1


In [38]:
# get an error (non-default argument follows default argument)
def func(b=9,a):
    print(a,b)

SyntaxError: non-default argument follows default argument (25820532.py, line 2)

In [39]:
# a function with a keyword arguments
def func(**kwargs):
    print(kwargs["subject"])

In [40]:
func(subject="NLP", section=1)

NLP


# Classes

In [41]:
class Human:
    def __init__(self, name):
        self.name = name
        print("Human created")
    
    def __str__(self):
        return f"My name is {self.name}"
    
    def changeName(self, name):
        print(f"replacing {self.name} with {name}")
        self.name = name

In [42]:
h = Human("omar")
print(h)
h.changeName("mohamed")
print(h)

Human created
My name is omar
replacing omar with mohamed
My name is mohamed


In [43]:
class Student(Human):
    def __init__(self, name, year):
        Human.__init__(self, name)
        self.year = year
        print("Student created")
        
    def __str__(self):
        return f"Student name: {self.name}\nIn year: {self.year}"
    
    def passed(self):
        self.year += 1

In [44]:
s = Student("omar", 4)
print(s)
s.passed()
print(s)

Human created
Student created
Student name: omar
In year: 4
Student name: omar
In year: 5


# Towards Text Processing

In [85]:
# Natural Language Toolkit
!pip install nltk

In [45]:
import nltk
nltk.download('book')

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\omars\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     C:\Users\omars\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     C:\Users\omars\AppData\Roaming\nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\omars\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     C:\Users\omars\AppData\Roaming\nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     C:\Users\omars\AppData\R

True

In [46]:
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [47]:
type(text1)

nltk.text.Text

In [48]:
text1

<Text: Moby Dick by Herman Melville 1851>

In [49]:
# search for a specific word in a text
text1.concordance("monstrous")

Displaying 11 of 11 matches:
ong the former , one was of a most monstrous size . ... This came towards us , 
ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r
ll over with a heathenish array of monstrous clubs and spears . Some were thick
d as you gazed , and wondered what monstrous cannibal and savage could ever hav
that has survived the flood ; most monstrous and most mountainous ! That Himmal
they might scout at Moby Dick as a monstrous fable , or still worse and more de
th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l
ing Scenes . In connexion with the monstrous pictures of whales , I am strongly
ere to enter upon those still more monstrous stories of them which are to be fo
ght have been rummaged out of this monstrous cabinet there is no telling . But 
of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u


In [50]:
# the number of tokens in a given text (words and punctuation symbols)
len(text3)

44764

In [51]:
# to obtain the vocabulary of a given corpora (the unique words and punctuations)
sorted(set(text3))

['!',
 "'",
 '(',
 ')',
 ',',
 ',)',
 '.',
 '.)',
 ':',
 ';',
 ';)',
 '?',
 '?)',
 'A',
 'Abel',
 'Abelmizraim',
 'Abidah',
 'Abide',
 'Abimael',
 'Abimelech',
 'Abr',
 'Abrah',
 'Abraham',
 'Abram',
 'Accad',
 'Achbor',
 'Adah',
 'Adam',
 'Adbeel',
 'Admah',
 'Adullamite',
 'After',
 'Aholibamah',
 'Ahuzzath',
 'Ajah',
 'Akan',
 'All',
 'Allonbachuth',
 'Almighty',
 'Almodad',
 'Also',
 'Alvah',
 'Alvan',
 'Am',
 'Amal',
 'Amalek',
 'Amalekites',
 'Ammon',
 'Amorite',
 'Amorites',
 'Amraphel',
 'An',
 'Anah',
 'Anamim',
 'And',
 'Aner',
 'Angel',
 'Appoint',
 'Aram',
 'Aran',
 'Ararat',
 'Arbah',
 'Ard',
 'Are',
 'Areli',
 'Arioch',
 'Arise',
 'Arkite',
 'Arodi',
 'Arphaxad',
 'Art',
 'Arvadite',
 'As',
 'Asenath',
 'Ashbel',
 'Asher',
 'Ashkenaz',
 'Ashteroth',
 'Ask',
 'Asshur',
 'Asshurim',
 'Assyr',
 'Assyria',
 'At',
 'Atad',
 'Avith',
 'Baalhanan',
 'Babel',
 'Bashemath',
 'Be',
 'Because',
 'Becher',
 'Bedad',
 'Beeri',
 'Beerlahairoi',
 'Beersheba',
 'Behold',
 'Bela',
 'Belah

In [52]:
print(len(set(text3)))
print(len(text3))
print(len(set(text3)) / len(text3) * 100)
# what do you reveal comparing then number of tokens with the vocabulary number?

2789
44764
6.230453042623537


In [53]:
# to compute the number of occurances of a specific word
text3.count("I")

484

# Let's think about Text

A text is a sequence of words and character (tokens) separated by white spaces, new lines, ...
We can simply represent any corpora as a sequence of tokens so in python it is simply a list. This is how nltk represents text corporas

In [54]:
sent1 = ['Call', 'me', 'Ishmael', '.']

# to see the total number of tokens
print(len(sent1))

4


In [56]:
sent2

['The',
 'family',
 'of',
 'Dashwood',
 'had',
 'long',
 'been',
 'settled',
 'in',
 'Sussex',
 '.']

In [57]:
sent3

['In',
 'the',
 'beginning',
 'God',
 'created',
 'the',
 'heaven',
 'and',
 'the',
 'earth',
 '.']

In [58]:
# to concatenate two sentences (lists)
sent2 + sent3

['The',
 'family',
 'of',
 'Dashwood',
 'had',
 'long',
 'been',
 'settled',
 'in',
 'Sussex',
 '.',
 'In',
 'the',
 'beginning',
 'God',
 'created',
 'the',
 'heaven',
 'and',
 'the',
 'earth',
 '.']

In [59]:
# to add new token to a sentence
sent1.append('Some')
sent1

['Call', 'me', 'Ishmael', '.', 'Some']

In [60]:
# to index a sentence (list) by index
text4[173]

'awaken'

In [61]:
# to get the first index of a specific word
text4.index('awaken')

173

In [62]:
# You can use list slicing to get a part of the text
text5[16715:16735]

['U86',
 'thats',
 'why',
 'something',
 'like',
 'gamefly',
 'is',
 'so',
 'good',
 'because',
 'you',
 'can',
 'actually',
 'play',
 'a',
 'full',
 'game',
 'without',
 'buying',
 'it']

In [63]:
# To calculate the word count of a corpora
fdist1 = FreqDist(text1)
print(type(fdist1))
fdist1.most_common(10)

<class 'nltk.probability.FreqDist'>


[(',', 18713),
 ('the', 13721),
 ('.', 6862),
 ('of', 6536),
 ('and', 6024),
 ('a', 4569),
 ('to', 4542),
 (';', 4072),
 ('in', 3916),
 ('that', 2982)]

In [64]:
fdist1['whale']

906

In [65]:
# To filter words based on the word length
# let's get the words having more than 15 character
long_words = [w for w in set(text1) if len(w) > 15]
long_words

['superstitiousness',
 'characteristically',
 'uncomfortableness',
 'circumnavigation',
 'uninterpenetratingly',
 'indispensableness',
 'comprehensiveness',
 'cannibalistically',
 'preternaturalness',
 'circumnavigating',
 'Physiognomically',
 'responsibilities',
 'circumnavigations',
 'undiscriminating',
 'apprehensiveness',
 'uncompromisedness',
 'subterraneousness',
 'physiognomically',
 'simultaneousness',
 'hermaphroditical',
 'CIRCUMNAVIGATION',
 'irresistibleness',
 'indiscriminately',
 'supernaturalness']

In [66]:
# what about filtering based on the word frequency in the corpora?
fdist5 = FreqDist(text5)
common_words = [w for w in set(text5) if fdist5[w] > 500]
common_words

['i', 'hi', '?', 'you', 'to', 'JOIN', ',', 'I', '.', 'lol', 'a', 'the', 'PART']

In [67]:
# python built-in string comaprison operators

# checks if string starts with sub-string
print("omar".startswith('o'))
# checks if string ends with sub-string
print("omar".endswith('r'))
# checks if string is sub-string of another
print("ma" in "omar")
# checks if all characters in the string are lowercase
print("omar".islower())
# checks if all characters in the string are uppercase
print("OMAR".isupper())
# checks if all the characters in a string are alphabetic characters (a-z) only
print("omar".isalpha())
# checks if all the characters are alphanumeric (a-z, 0-9) only
print("omar1".isalnum())
# checks if all the characters are numeric (0-9) only
print("123".isdigit())
# checks if the string is title-cased. (all words in a string begin with uppercase letters and the remaining characters are lowercase letters)
print("Introduction To Python".istitle())

True
True
True
True
True
True
True
True
True
