# Session 2: Bag of Words 

In [27]:
import os 
import pandas as pd 
import nltk
import string
import re
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bax1408\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
stop_words = set(nltk.corpus.stopwords.words('english'))
punctuations = set(string.punctuation)

## Part 2: Bag of words representation of a text 

We need to represent words in numbers. 
- Bag of words 
- Embedding 

The simplest approach is to count the number of time a word appears in each document. 

**Steps** 
1. Read the document
2. convert all words to lowercase. 
3. Remove punctuation. 
4. Remove stop words 
5. Create equivalence class
6. Filter by Frequency 


---
---

**Step 1 and 2**

In [8]:
# read the documents 
# Step 1: read the files and store text and file names in a dictionary 
dictUNSpeech = {} # create an empty dictionary 
# The directory
fileAddress1 = '../../corpusExample/unSpeeches2000_2010'
# Open the file one by one - remember you need to tell python each single step - nothing here is automatic. 
# 
for file in os.listdir(fileAddress1):
    with open(os.path.join(fileAddress1, file), 'r', encoding='utf-8', errors='replace') as textFile: 
        dictUNSpeech[file.replace('.txt', '')] =  textFile.read()

# convert the dictionary to a dataframe 
dfUNSpeech = pd.DataFrame(list(dictUNSpeech.items()), columns=["id", "text"])

dfUNSpeech["isoAlpha"] = dfUNSpeech["id"].str.split("_", n=2,  expand=True)[0].astype('str')
dfUNSpeech["session"] = dfUNSpeech["id"].str.split("_", n=2, expand=True)[1].astype('int')
dfUNSpeech["year"] = dfUNSpeech["id"].str.split("_", n=2, expand=True)[2].astype('int')

In [18]:
dfUNSpeech

Unnamed: 0,id,text,isoAlpha,session,year
0,AFG_55_2000,"On my way to the\nAssembly Hall, I was informe...",AFG,55,2000
1,AFG_56_2001,"﻿At the outset, on\nbehalf of the Government o...",AFG,56,2001
2,AFG_57_2002,﻿Not very far from here stood\ntwo towers that...,AFG,57,2002
3,AFG_58_2003,﻿There is no reality more\noppressive than the...,AFG,58,2003
4,AFG_59_2004,Nelson Mandela once\ndescribed his countryís t...,AFG,59,2004
...,...,...,...,...,...
2074,ZWE_61_2006,Let me begin my statement \nby echoing the sen...,ZWE,61,2006
2075,ZWE_62_2007,Allow me to congratulate \nMr. Kerim on his el...,ZWE,62,2007
2076,ZWE_63_2008,I wish to begin by joining \nthose who have co...,ZWE,63,2008
2077,ZWE_64_2009,Let me begin by extending \nour warmest congra...,ZWE,64,2009


In [9]:
vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(dfUNSpeech['text'])   # returns a sparse matrix
# Step 4: Convert to DataFrame
dfDTM = pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names_out())
dfDTM["id"] = dfUNSpeech["id"]
# Optional: move 'id' column to the front
cols = dfDTM.columns.tolist()
cols = [cols[-1]] + cols[:-1]
dfDTM = dfDTM[cols]
dfDTM

Unnamed: 0,id,000,001,005,01,020,04,041,043,05,...,ìstandards,ìwe,île,œi,œone,œour,œresponsibility,œrightâ,œwith,štampar
0,AFG_55_2000,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AFG_56_2001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AFG_57_2002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AFG_58_2003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AFG_59_2004,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2074,ZWE_61_2006,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2075,ZWE_62_2007,2,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2076,ZWE_63_2008,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2077,ZWE_64_2009,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


- This is quite messy and not useful.
    - To solve these problems, we develop a preprocessor function step by step.  

In [30]:
# Define custom analyzer for CountVectorizer
def custom_preprocessor(text):
    text = re.sub(r'\d+', '', # removing digits 
                  text.lower(), # lowering the case 
                  tokens = [token for token in tokens if token.isalpha() and token not in stop_words and token not in punctuations] # Remove punctuation, stopwords, and non-alphabetic tokens
                 )
    tokens = nltk.tokenize.word_tokenize(text)
    return tokens
# === Step 2: Vectorize using lemmatization ===
vectorizer = CountVectorizer(analyzer=custom_preprocessor)
dtm = vectorizer.fit_transform(dfUNSpeech["text"])

# === Step 3: Create document-term matrix ===
dfDTM = pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names_out())
dfDTM["id"] = dfUNSpeech["id"]
dfDTM = dfDTM[["id"] + dfDTM.columns[:-1].tolist()]

UnboundLocalError: local variable 'tokens' referenced before assignment

In [21]:
dfDTM

Unnamed: 0,id,!,$,&,','','agenda,'bulgaria,'despite,'enemy,...,﻿to,﻿today,﻿trinidad,﻿twenty-four,﻿two,﻿uruguay,﻿we,﻿yesterday,﻿you,﻿your
0,AFG_55_2000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AFG_56_2001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AFG_57_2002,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AFG_58_2003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AFG_59_2004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2074,ZWE_61_2006,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2075,ZWE_62_2007,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2076,ZWE_63_2008,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2077,ZWE_64_2009,0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Part 2: TF-IDF matrix 

## Part 3: Dictionar Approach 