In [1]:
from IPython.display import clear_output
import time
def countdown_timer(seconds):
    for i in range(seconds, -1, -1):
        clear_output(wait=True)
        print(f"⏳ Time remaining: {i} seconds")
        time.sleep(1)
    print("✅ Time's up!")

# Session 1: Text and its features

In [2]:
import os  # Provides functions for interacting with the operating system (e.g., file paths, directories).
import string  # Offers utilities for common string operations, such as punctuation handling.
import pandas as pd  # Provides data structures and tools for data manipulation and analysis.
import nltk  # A fundamental library for natural language processing tasks (e.g., tokenization, POS tagging).
import readability  # Used to compute readability metrics and other linguistic features of text.


In [3]:
# Download necessary NLTK resources for tokenization, lemmatization, and stopword removal
nltk.download('punkt')        # Tokenizer models (used for splitting text into sentences or words)
nltk.download('wordnet')      # WordNet lexical database (used for lemmatization)
nltk.download('omw-1.4')      # Open Multilingual WordNet (supports lemmatization with multiple languages)
nltk.download('stopwords')    # List of common stopwords in various languages

[nltk_data] Downloading package punkt to F:\Mahdi\Dropbox\Dropbox\code
[nltk_data]     \courseTeach\.venv\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to F:\Mahdi\Dropbox\Dropbox\co
[nltk_data]     de\courseTeach\.venv\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to F:\Mahdi\Dropbox\Dropbox\co
[nltk_data]     de\courseTeach\.venv\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to F:\Mahdi\Dropbox\Dropbox\
[nltk_data]     code\courseTeach\.venv\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Initialize the stemmer and lemmatizer
stemmer = nltk.stem.PorterStemmer()           # Reduces words to their stem (e.g., "running" → "run")
lemmatizer = nltk.stem.WordNetLemmatizer()    # Reduces words to their base or dictionary form (lemma)

---
---

## 1.1. Meta data

### Basic Features of Texts

Each text contains three types of useful data:

- **Metadata**
  - Information about the text, such as the author, text type (e.g., speech, tweet), publication date, language, etc.
  - Sometimes, metadata is embedded within the text itself.

- **Textual Data**
  - The linguistic features and actual content of the text.


---
---

### Metadata

- When designing a text analysis project, think carefully about the types of information you want to collect.
  - If certain metadata is not captured early on, recovering it later can be very time-consuming.

- Metadata refers to information beyond the text itself and its linguistic features.
  - Examples include the author, publication date, text type, language, and more.

- There are three common ways to store and retrieve metadata:
  - **Storing in a separate file** – Save metadata in a structured file (e.g., CSV or JSON) and load it when needed.
  - **Encoding in the file name** – Embed metadata (e.g., date, author ID) directly in the file name and parse it during processing.
  - **Using an API** – Retrieve metadata directly from online sources or databases.

---
---

### Exercise 1.1

In [None]:
countdown_timer(300)

---
---

### Retrieve Metadata from File Names - Steps

1. **Read the text files and store them in a dictionary.**

2. **Convert the dictionary to a DataFrame.**
   - One column should contain the file names.
   - Another column should contain the corresponding text content.

3. **Split the file names into multiple variables** (e.g., author, date, category), based on how you encoded the metadata in the file name.


### Example 1 - Retrieve metadata from file names


In [5]:
# Step 1: Read the files and store text and file names in a dictionary

dictUNSpeech = {}  # Create an empty dictionary to store file name and text content

# Define the directory containing the text files
fileAddress1 = '../../corpusExample/unSpeeches2000_2010'

# Loop through all files in the directory
# For each file:
#   - Open it
#   - Read its content (converted to lowercase)
#   - Store it in the dictionary with the filename (without '.txt') as the key
for file in os.listdir(fileAddress1):
    with open(os.path.join(fileAddress1, file), 'r', encoding='utf-8', errors='ignore') as textFile:
        text = textFile.read().lower()
        dictUNSpeech[file.replace('.txt', '')] = text

# Step 2: Convert the dictionary to a DataFrame
# The 'id' column will hold the cleaned file names, and 'text' will hold the corresponding content
dfUNSpeech = pd.DataFrame(list(dictUNSpeech.items()), columns=["id", "text"])

# Step 3: Extract metadata from the file name
# Each file name is assumed to follow the pattern: isoAlpha_session_year.txt (e.g., USA_65_2010)

# Extract country code (isoAlpha), session number, and year from the file name
dfUNSpeech["isoAlpha"] = dfUNSpeech["id"].str.split("_", n=2, expand=True)[0].astype(str)
dfUNSpeech["session"] = dfUNSpeech["id"].str.split("_", n=2, expand=True)[1].astype(int)
dfUNSpeech["year"] = dfUNSpeech["id"].str.split("_", n=2, expand=True)[2].astype(int)

In [6]:
dfUNSpeech.head(5)

Unnamed: 0,id,text,isoAlpha,session,year
0,AFG_55_2000,"on my way to the\nassembly hall, i was informe...",AFG,55,2000
1,AFG_56_2001,"﻿at the outset, on\nbehalf of the government o...",AFG,56,2001
2,AFG_57_2002,﻿not very far from here stood\ntwo towers that...,AFG,57,2002
3,AFG_58_2003,﻿there is no reality more\noppressive than the...,AFG,58,2003
4,AFG_59_2004,nelson mandela once\ndescribed his countryís t...,AFG,59,2004


---
---

### Example 2 - Retrieve Metadata from a Separate File — Steps

1. **Read the dataset containing the metadata.**

2. **Merge the metadata with the dataset containing the text files.**
   - Use a common identifier (e.g., file name or country code) to perform the merge.


In [7]:
fileAddress2 = '../../corpusExample/speakersSession.xlsx'
speakerData = pd.read_excel(fileAddress2)
dfUNSpeechComplete = dfUNSpeech.merge(speakerData, on=['year', 'isoAlpha'])

In [8]:
dfUNSpeechComplete.head(5)

Unnamed: 0,id,text,isoAlpha,session,year,Session,cname,speakerName,post
0,AFG_55_2000,"on my way to the\nassembly hall, i was informe...",AFG,55,2000,55,Afghanistan,Abdullah Abdullah,MFA
1,AFG_56_2001,"﻿at the outset, on\nbehalf of the government o...",AFG,56,2001,56,Afghanistan,Ravan Farhâdi,UN_Rep
2,AFG_57_2002,﻿not very far from here stood\ntwo towers that...,AFG,57,2002,57,Afghanistan,Hâmid Karzai,President
3,AFG_58_2003,﻿there is no reality more\noppressive than the...,AFG,58,2003,58,Afghanistan,Hâmid Karzai,President
4,AFG_59_2004,nelson mandela once\ndescribed his countryís t...,AFG,59,2004,59,Afghanistan,Mr. Hamid Karzai,President


### Some Abbreviation
- Prime Minister: PM 
- Deputy Prime Minister: DPM 
- Head of Government: HOG
- Head of State: HOS
- Minister for Foreign Affairs: MFA
- UN Representative: UN_rep


---
---

### Importing a Corpus and Its Metadata via API: Steps

1. **Create an account on the provider’s website.**

2. **Obtain your API key.**
   - The API key is a unique code that allows you to access your account programmatically through Python.

3. **Read the API documentation** to understand the syntax and parameters for retrieving data.

4. **Retrieve the data** using the appropriate request methods (e.g., `GET`, `POST`) and your API key.


### Example 3 - working with API

Example 3: working with API**---
---

### Exercise 1.2

In [None]:
countdown_timer(300)

---
---

## 1.2. Linguistic Features

### How Do Linguistic Features Help Us?

Sometimes, we don’t need complex methods to achieve our goals—simple measurements can be very informative.

Examples of useful linguistic features:

- Length of the text
- Number of sentences
- Average sentence length
- Text complexity (e.g., readability)
- Use of specific word types (e.g., adjectives, verbs)
- Vocabulary richness
- ...and more

---
---

### Example 1 - Measuring the length of documents

In [9]:
# Calculate the number of words in each text
# This splits each text into words (by whitespace) and counts the length of the resulting list
dfUNSpeechComplete['word_count'] = dfUNSpeechComplete['text'].str.split().str.len()

# Display the updated DataFrame
dfUNSpeechComplete[['id', 'word_count']].head()


Unnamed: 0,id,text,isoAlpha,session,year,Session,cname,speakerName,post,word_count
0,AFG_55_2000,"on my way to the\nassembly hall, i was informe...",AFG,55,2000,55,Afghanistan,Abdullah Abdullah,MFA,2873
1,AFG_56_2001,"﻿at the outset, on\nbehalf of the government o...",AFG,56,2001,56,Afghanistan,Ravan Farhâdi,UN_Rep,2073
2,AFG_57_2002,﻿not very far from here stood\ntwo towers that...,AFG,57,2002,57,Afghanistan,Hâmid Karzai,President,1700
3,AFG_58_2003,﻿there is no reality more\noppressive than the...,AFG,58,2003,58,Afghanistan,Hâmid Karzai,President,1617
4,AFG_59_2004,nelson mandela once\ndescribed his countryís t...,AFG,59,2004,59,Afghanistan,Mr. Hamid Karzai,President,1096
...,...,...,...,...,...,...,...,...,...,...
2072,ZWE_61_2006,let me begin my statement \nby echoing the sen...,ZWE,61,2006,61,Zimbabwe,Mr. Robert Gabriel Mugabe,President,2371
2073,ZWE_62_2007,allow me to congratulate \nmr. kerim on his el...,ZWE,62,2007,62,Zimbabwe,Robert G. Mugabe,President,2052
2074,ZWE_63_2008,i wish to begin by joining \nthose who have co...,ZWE,63,2008,63,Zimbabwe,Robert Mugabe,President,1800
2075,ZWE_64_2009,let me begin by extending \nour warmest congra...,ZWE,64,2009,64,Zimbabwe,Robert G. Mugabe,President,1722


---
---

### Example 2 - Counting the number of sentences in documents

In [10]:
# Apply sentence tokenizer to count the number of sentences in each text
# This uses NLTK's sentence tokenizer, which handles punctuation and abbreviation rules
dfUNSpeechComplete['sentence_count'] = dfUNSpeechComplete['text'].apply(
    lambda x: len(nltk.tokenize.sent_tokenize(x))
)

# Preview the result
dfUNSpeechComplete[['id', 'sentence_count']].head()


---
---

### Example 3 – Looking at the Frequency of Speeches from Different Officials

In [11]:
dfUNSpeechComplete['post'].value_counts()

post
MFA            955
President      533
PM             221
UN_Rep         187
DPM            109
V-President     34
HOS             24
HOG             14
Name: count, dtype: int64

---
---

### Features of Text That Could Be Interesting

When analyzing text, several features can provide valuable insights into structure, style, or complexity. Some commonly explored features include:

- **Length** – Number of characters, words, or sentences
- **Readability** – How easy or difficult the text is to understand (e.g., using Flesch-Kincaid score)
- **Entropy** – A measure of unpredictability or information density in the text
- **Lexical Diversity** – The variety of vocabulary used (e.g., type-token ratio)
- **Verb Tense Usage** – Distribution of past, present, and future tenses, which can indicate narrative style or focus


### Example 4 – Linguistic Features: Readability Scores

In [12]:
# Define a short example text
SmallText = """
This is ILE summer school. It is held in Hamburg at the Institute for Law and Economics,
which is one of its kind in Europe.

This is the week of methods. We go through computational and experimental methods,
which are common in Law and Economics research.
"""

# Extract various linguistic and readability features
linguisticFeatures = readability.getmeasures(SmallText, lang='en')

# Display the results
linguisticFeatures


OrderedDict([('readability grades',
              OrderedDict([('Kincaid', 6.550869565217393),
                           ('ARI', 7.63644927536232),
                           ('Coleman-Liau', 8.984565978260871),
                           ('FleschReadingEase', 75.40644927536233),
                           ('GunningFogIndex', 8.742028985507247),
                           ('LIX', 41.42028985507246),
                           ('SMOGIndex', 8.477225575051662),
                           ('RIX', 4.0),
                           ('DaleChallIndex', 3.5066202898550727)])),
             ('sentence info',
              OrderedDict([('characters_per_word', 4.543478260869565),
                           ('syll_per_word', 1.3695652173913044),
                           ('words_per_sentence', 15.333333333333334),
                           ('sentences_per_paragraph', 1.5),
                           ('type_token_ratio', 0.7391304347826086),
                           ('directspeech_ratio', 0.0),

In [18]:
# Measure readability using Flesch Reading Ease score
# The score indicates how easy a text is to read — higher values mean easier readability

dfUNSpeechComplete['flesch_reading_ease'] = dfUNSpeechComplete['text'].apply(
    lambda x: readability.getmeasures(x, lang='en')['readability grades']['FleschReadingEase']
)

# Display the updated DataFrame with readability scores
dfUNSpeechComplete[['id', 'flesch_reading_ease']].head()


Unnamed: 0,id,flesch_reading_ease
0,AFG_55_2000,47.500066
1,AFG_56_2001,59.91597
2,AFG_57_2002,53.515308
3,AFG_58_2003,56.769041
4,AFG_59_2004,52.270847


---
---

### Exercise 1.3

In [None]:
countdown_timer(300)

---
---

## Content of text

### Tokenization

To study the content of a text, we first break the document into smaller elements that can be measured and analyzed.
These basic elements are called **tokens**.

- A **token** is typically a word, but not always.
- In most cases, a token corresponds to a word, but it can also be punctuation, a number, or even a sentence—depending on the type of tokenization.

Tokenization is the first essential step in almost any text analysis task.

### NLTK

- In the earlier days of Python-based NLP (around 8 years ago), the **NLTK** package was the primary tool for performing basic natural language processing tasks.
- While more modern libraries now exist (e.g., spaCy, transformers), NLTK is still widely used for teaching and for lightweight NLP tasks.

In this example, we use NLTK to break down our documents into individual words (i.e., **tokenization**).


### Example 1 - Tokenization

In [17]:
# Tokenize the text column: split each document into a list of word tokens
dfUNSpeechComplete['tokens'] = dfUNSpeechComplete['text'].apply(nltk.tokenize.word_tokenize)

# Display the updated DataFrame
dfUNSpeechComplete[['id', 'tokens']].head()


Unnamed: 0,id,tokens
0,AFG_55_2000,"[on, my, way, to, the, assembly, hall, ,, i, w..."
1,AFG_56_2001,"[﻿at, the, outset, ,, on, behalf, of, the, gov..."
2,AFG_57_2002,"[﻿not, very, far, from, here, stood, two, towe..."
3,AFG_58_2003,"[﻿there, is, no, reality, more, oppressive, th..."
4,AFG_59_2004,"[nelson, mandela, once, described, his, countr..."


---
---

### Exersice 1.4

In [None]:
countdown_timer(300)

---
---

### Removing Uninformative Tokens

- Not all tokens are meaningful for text analysis.
- We often want to remove tokens that do not carry substantial information.

The most common examples include:

- **Punctuation**
- **Stopwords**: Common words such as pronouns, articles, and prepositions (e.g., "he", "the", "in") that occur frequently but add little analytical value.


### Example 2 - removing words that are not meaningful

In [19]:
# Load English stopwords from NLTK
stop_words = set(nltk.corpus.stopwords.words('english'))  # Common words like "the", "is", "in", etc.

# Load standard punctuation symbols from the string module
punctuations = set(string.punctuation)  # Symbols like '.', ',', '!', '?', etc.


In [20]:
# Function to clean and tokenize text
def text_cleaner(text):
    # Tokenize the text into words
    tokens = nltk.tokenize.word_tokenize(text)

    # Convert to lowercase and remove stopwords and punctuation
    clean_tokens = [
        word.lower()
        for word in tokens
        if word.lower() not in stop_words and word.lower() not in punctuations
    ]

    return clean_tokens

# Apply the cleaning function to the text column
dfUNSpeechComplete['tokens_clean'] = dfUNSpeechComplete['text'].apply(text_cleaner)


In [21]:
# Access the first row and two specific variables (columns)
dfUNSpeechComplete.iloc[0][['tokens', 'tokens_clean']]

tokens          [on, my, way, to, the, assembly, hall, ,, i, w...
tokens_clean    [way, assembly, hall, informed, supreme, state...
Name: 0, dtype: object

In [24]:
# Compare the number of raw vs. clean tokens in the first document
print("Length of original tokens:", len(dfUNSpeechComplete.loc[0, 'tokens']))
print("Length of cleaned tokens:", len(dfUNSpeechComplete.loc[0, 'tokens_clean']))


Length of original tokens: 3171
Length of cleaned tokens: 1622


In [25]:
# Compare the number of unique tokens before and after cleaning
print("Number of unique original tokens:", len(set(dfUNSpeechComplete.loc[0, 'tokens'])))
print("Number of unique cleaned tokens:", len(set(dfUNSpeechComplete.loc[0, 'tokens_clean'])))


Number of unique original tokens: 1023
Number of unique cleaned tokens: 933


---
---

### Exercise 1.5

In [None]:
countdown_timer(300)

---
---

### Words That Are Similar but Not Identical

- Consider the words **run**, **running**, and **runs**:
  - They have the same basic meaning.
  - But from the perspective of a tokenizer, they are treated as different words.

- This creates problems when we want to group similar forms of a word.

- **Solutions:**
  - **Stemming** – Reduces words to their root form by chopping off suffixes (e.g., "running" → "run").
    - Often fast but may produce non-standard words.
  - **Lemmatization** – Reduces words to their dictionary (lemma) form, using grammatical context.
    - More accurate but requires more resources.


### Stemming

- **Stemming** is a technique that reduces words to their root form by cutting off common suffixes.

- For example:
  - The words **run**, **running**, and **runs** are all reduced to **run**.
  - This helps group similar words together and avoid counting them as separate tokens.

- The result is that we get the same root word multiple times instead of treating each variation as different.


### Example 3 - Stemming

In [26]:
# Function to clean and stem tokens
def text_cleaner(text):
    # Step 1: Tokenize the text
    tokens = nltk.tokenize.word_tokenize(text)

    # Step 2: Lowercase and remove stopwords and punctuation
    cleaned_tokens = [
        word.lower()
        for word in tokens
        if word.lower() not in stop_words and word.lower() not in punctuations
    ]

    # Step 3: Apply stemming to the cleaned tokens
    stemmed_tokens = [stemmer.stem(token) for token in cleaned_tokens]

    return cleaned_tokens, stemmed_tokens

# Apply the function to each document and expand the result into two new columns
dfUNSpeechComplete[['cleaned_tokens', 'tokens_clean_stemmed']] = (
    dfUNSpeechComplete['text']
    .apply(text_cleaner)
    .apply(pd.Series)
)

# Preview the result
dfUNSpeechComplete[['id', 'cleaned_tokens', 'tokens_clean_stemmed']].head()


Unnamed: 0,id,cleaned_tokens,tokens_clean_stemmed
0,AFG_55_2000,"[way, assembly, hall, informed, supreme, state...","[way, assembl, hall, inform, suprem, state, co..."
1,AFG_56_2001,"[﻿at, outset, behalf, government, islamic, sta...","[﻿at, outset, behalf, govern, islam, state, af..."
2,AFG_57_2002,"[﻿not, far, stood, two, towers, symbolized, fr...","[﻿not, far, stood, two, tower, symbol, freedom..."
3,AFG_58_2003,"[﻿there, reality, oppressive, silence, nation,...","[﻿there, realiti, oppress, silenc, nation, lon..."
4,AFG_59_2004,"[nelson, mandela, described, countryís, transi...","[nelson, mandela, describ, countryí, transit, ..."


In [27]:
# Access the first row and select specific columns: original, cleaned, and stemmed tokens
dfUNSpeechComplete.loc[0, ['tokens', 'tokens_clean', 'tokens_clean_stemmed']]

tokens                  [on, my, way, to, the, assembly, hall, ,, i, w...
tokens_clean            [way, assembly, hall, informed, supreme, state...
tokens_clean_stemmed    [way, assembl, hall, inform, suprem, state, co...
Name: 0, dtype: object

In [28]:
# Compare the number of unique tokens before and after cleaning and stemming
print("Number of unique original tokens:       ", len(set(dfUNSpeechComplete.loc[0, 'tokens'])))
print("Number of unique cleaned tokens:        ", len(set(dfUNSpeechComplete.loc[0, 'tokens_clean'])))
print("Number of unique cleaned & stemmed tokens:", len(set(dfUNSpeechComplete.loc[0, 'tokens_clean_stemmed'])))


Number of unique original tokens:        1023
Number of unique cleaned tokens:         933
Number of unique cleaned & stemmed tokens: 819


### Lemmatization

- Lemmatization goes one step further than stemming.
- It transforms a word into its **dictionary base form** (known as the *lemma*).
- Unlike stemming, lemmatization considers the context and part of speech to produce real words.

For example:
- "running", "ran" → "run"
- "better" → "good"


### Example 4 - Lemmatization

In [29]:
# Function to clean, stem, and lemmatize tokens
def text_cleaner(text):
    # Step 1: Tokenize the text into words
    tokens = nltk.tokenize.word_tokenize(text)

    # Step 2: Lowercase and remove stopwords and punctuation
    cleaned_tokens = [
        word.lower()
        for word in tokens
        if word.lower() not in stop_words and word.lower() not in punctuations
    ]

    # Step 3: Apply stemming
    stemmed_tokens = [stemmer.stem(token) for token in cleaned_tokens]

    # Step 4: Apply lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in cleaned_tokens]

    return cleaned_tokens, stemmed_tokens, lemmatized_tokens

# Apply the function and expand the results into separate columns
dfUNSpeechComplete[['cleaned_tokens', 'tokens_clean_stemmed', 'tokens_clean_lemmatized']] = (
    dfUNSpeechComplete['text'].apply(text_cleaner).apply(pd.Series)
)

# Preview the result
dfUNSpeechComplete[['id', 'cleaned_tokens', 'tokens_clean_stemmed', 'tokens_clean_lemmatized']].head()


Unnamed: 0,id,cleaned_tokens,tokens_clean_stemmed,tokens_clean_lemmatized
0,AFG_55_2000,"[way, assembly, hall, informed, supreme, state...","[way, assembl, hall, inform, suprem, state, co...","[way, assembly, hall, informed, supreme, state..."
1,AFG_56_2001,"[﻿at, outset, behalf, government, islamic, sta...","[﻿at, outset, behalf, govern, islam, state, af...","[﻿at, outset, behalf, government, islamic, sta..."
2,AFG_57_2002,"[﻿not, far, stood, two, towers, symbolized, fr...","[﻿not, far, stood, two, tower, symbol, freedom...","[﻿not, far, stood, two, tower, symbolized, fre..."
3,AFG_58_2003,"[﻿there, reality, oppressive, silence, nation,...","[﻿there, realiti, oppress, silenc, nation, lon...","[﻿there, reality, oppressive, silence, nation,..."
4,AFG_59_2004,"[nelson, mandela, described, countryís, transi...","[nelson, mandela, describ, countryí, transit, ...","[nelson, mandela, described, countryís, transi..."


In [30]:
# Access the first row and view the original, cleaned, stemmed, and lemmatized tokens
dfUNSpeechComplete.loc[0, ['tokens', 'tokens_clean', 'tokens_clean_stemmed', 'tokens_clean_lemmatized']]


tokens                     [on, my, way, to, the, assembly, hall, ,, i, w...
tokens_clean               [way, assembly, hall, informed, supreme, state...
tokens_clean_stemmed       [way, assembl, hall, inform, suprem, state, co...
tokens_clean_lemmatized    [way, assembly, hall, informed, supreme, state...
Name: 0, dtype: object

In [31]:
# Compare the number of unique tokens at different stages of preprocessing
print("Number of unique original tokens:         ", len(set(dfUNSpeechComplete.loc[0, 'tokens'])))
print("Number of unique cleaned tokens:          ", len(set(dfUNSpeechComplete.loc[0, 'tokens_clean'])))
print("Number of unique cleaned + stemmed tokens:", len(set(dfUNSpeechComplete.loc[0, 'tokens_clean_stemmed'])))
print("Number of unique cleaned + lemmatized tokens:", len(set(dfUNSpeechComplete.loc[0, 'tokens_clean_lemmatized'])))

Number of unique original tokens:          1023
Number of unique cleaned tokens:           933
Number of unique cleaned + stemmed tokens: 819
Number of unique cleaned + lemmatized tokens: 898


### Exercise 1.6