In [None]:
from os.path import basename
# Address of the corpus text. The raw.githubusercontent.com form is used so
# that the plain text itself is fetched; a github.com/.../blob/... address
# returns the HTML viewer page instead of the file contents.
my_text = 'https://raw.githubusercontent.com/peterverhaar/textmining_with_python/refs/heads/main/Corpus/BraveNewWorld.txt'
# Local file name of the corpus text: the last path segment of the URL.
file_name = basename(my_text)

import ssl

# Replace the ssl module's default HTTPS context factory with the unverified
# one, when the running Python provides it.
# NOTE(review): this switches off certificate verification for everything that
# relies on ssl._create_default_https_context -- presumably a workaround for
# certificate errors during the downloads below; confirm it is still needed.
if hasattr(ssl, '_create_unverified_context'):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context


import sys
!pip install nltk
import nltk
# Download the NLTK data packages listed below, one after the other.
for nltk_resource in (
    'stopwords',
    'punkt',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'punkt_tab',
):
    nltk.download(nltk_resource)
import requests


def download(url):
    """Download *url* and save the response text in the working directory.

    The local file is named after the last path segment of *url* and is
    written as UTF-8. When the server answers with an error status, nothing
    is written and a message naming the URL is printed instead.
    """
    response = requests.get(url)
    # A requests Response is falsy when the status code signals an error.
    if not response:
        print(f"{url} could not be downloaded (HTTP status {response.status_code}).")
        return
    new_file_name = basename(url)
    # The context manager closes the file even if writing fails.
    with open(new_file_name, 'w', encoding='utf-8') as out:
        out.write(response.text)
    # Report success only once the file has actually been written.
    print(f"{new_file_name} is downloaded!")
        
def sorted_by_value( dict , ascending = True ):
    """Return a new dictionary with the items of *dict* ordered by value.

    With ``ascending`` true the smallest value comes first. Otherwise the
    ascending sequence is reversed, so the largest value comes first.
    """
    ordered_items = sorted(dict.items(), key=lambda pair: pair[1])
    if not ascending:
        # Reverse the ascending result in place.
        ordered_items.reverse()
    return {key: value for key, value in ordered_items}

# Fetch the text_mining.py module and the corpus text, in that order.
for source_url in (
    'https://raw.githubusercontent.com/peterverhaar/textmining_with_python/refs/heads/main/text_mining.py',
    my_text,
):
    download(source_url)

# 14. Lexicons

Lists of frequent words can help us to develop a rough understanding of the main concerns of the text. They don't necessarily offer insights into the broader topics or the themes that are discussed within the text. If we want to investigate the semantics of the text at a somewhat deeper level, it can be useful to make use of word lists which map the words that occur in a text to broader pre-defined semantic categories. Such lists of words are often referred to as 'lexicons'. We can make a lexicon listing words related to 'religion', for instance. Next, by counting the number of times a text uses one of the words in the lexicon, we can form an impression of the importance of this particular theme within this text.  

Examples of applications which have implemented this type of 'semantic tagging' include
the [Harvard General Inquirer (HGI)](http://www.wjh.harvard.edu/~inquirer/homecat.htm), [the Linguistic Inquiry and Word Count (LIWC)
tool](http://liwc.wpengine.com/)  and the [UCREL Semantic Analysis System (USAS)](http://ucrel.lancs.ac.uk/usas/). The programmers responsible for the *Harvard General Inquirer*, for example, have defined 182 semantic categories, and they have compiled long lists of words pertaining to these categories.  

To let you work with the possibilities of semantic tagging, a number of the lexicons that have been made available have been downloaded and merged. Next to the lexicons developed for the HGI and USAS, the word lists created for this course also include terms taken from lists compiled by [Bing Liu](https://www.cs.uic.edu/~liub/) and by the project team that worked on the [Multi-Perspective Question Answering (MPQA) tool](http://mpqa.cs.pitt.edu/). 

The merged semantic lexicons can be found here: 
https://github.com/peterverhaar/semantic-tagging/tree/main/Lexicons


In the code below, the lexicon files that are available are all listed in `lexicon_files`. The code downloads all of these lexicon files and saves them in a directory named `Lexicons`. 

In [None]:
import re
import requests
import os

# Base address of the merged lexicons and the names of the files to fetch.
baseUrl = 'https://raw.githubusercontent.com/peterverhaar/semantic-tagging/main/Lexicons/'
lexicon_files = [  'Academic.txt' , 'Economics.txt' ,  'Legal.txt' , 'Military.txt' , 'Movement.txt' , 'Pain.txt' , 'Passive.txt' , 'Pleasure.txt' , 'Politics.txt' , 'Power.txt' , 'Religion.txt' , 'Space.txt' , 'Time.txt' , 'Transportation.txt' , 'Vice.txt' , 'Weather.txt' , 'workAndEmployment.txt' ]

# Local directory in which the lexicon files are stored.
dir = 'Lexicons'
# exist_ok makes this a no-op when the directory is already present.
os.makedirs(dir, exist_ok=True)

# Names of the lexicon files whose request did not succeed.
failed_downloads = []

for lexicon_file in lexicon_files:
    response = requests.get( baseUrl + lexicon_file )
    # A requests Response is falsy when the status code signals an error.
    if not response:
        failed_downloads.append(lexicon_file)
        continue
    # Decode the body as UTF-8 regardless of the encoding requests guessed.
    response.encoding = 'utf-8'
    # The context manager closes the file even if writing fails.
    with open( os.path.join( dir , lexicon_file ) , 'w' , encoding = 'utf-8' ) as out:
        out.write( response.text )

if failed_downloads:
    print('Could not download: ' + ', '.join(failed_downloads))
else:
    print('Lexicons have been downloaded!')

Alternatively, you can also create your own lexicon files. A lexicon file is simply a text file listing all the terms that are relevant, one term per line. You can create new files, and add these to the directory `Lexicons`. You can also edit existing lexicons, or remove those lexicons that are not relevant to you. 

The following code reads in all the lexicon files from `Lexicons` and saves all the words on these lists in a dictionary named `lexicons`.

In [None]:
import os 
from os.path import join
import re

# Maps each topic (the lexicon file name without its extension) to the list
# of terms read from that file.
lexicons = dict()

dir = 'Lexicons'

for entry in os.listdir(dir):

    lexicon_path = join(dir, entry)
    # Skip sub-directories and hidden entries (for instance
    # '.ipynb_checkpoints'): they cannot be read as lexicon files.
    if entry.startswith('.') or not os.path.isfile(lexicon_path):
        continue

    # splitext also handles file names that have no extension at all.
    topic = os.path.splitext(entry)[0]

    with open( lexicon_path , encoding = 'utf-8' ) as file_handler:
        # Keep every line that contains at least one word character,
        # stripped of surrounding whitespace.
        words = [ line.strip() for line in file_handler if re.search( r'\w' , line ) ]

    lexicons[topic] = words


You can use the code below to count the number of occurrences of the words in these various lexicons within the texts of your corpus. The code searches in lemmatised versions of all the corpus texts. The result (consisting of counts for all the texts in your corpus) is stored in a file named 'lexicon.csv'.

If your texts are long, or if the corpus contains many texts, running the code may take quite a while. 

In [None]:
from nltk.corpus import stopwords
from nltk import word_tokenize,sent_tokenize,pos_tag

import os
from os.path import join
from nltk.stem import WordNetLemmatizer
from text_mining import *

# Count, for every lexicon, how often its terms occur in the lemmatised text
# and write the counts, relative to the number of tokens, to 'lexicon.csv'.

print( f'\nLemmatising {file_name} ...')
path = file_name
with open( path , encoding = 'utf-8' ) as fh:
    full_text = fh.read()
lemmatised = lemmatise(full_text)

print( 'Performing semantic tagging for {} ...'.format( file_name ) )

words = word_tokenize(lemmatised)
words = remove_punctuation(words)

# Frequency of every token in the text.
freq = dict()
for w in words:
    freq[w] = freq.get(w,0)+1

# The counts are made relative to the number of tokens; `lemmatised` is a
# string, so its length would be a number of characters.
tokens = len(words)

# The context manager closes the CSV file even when an error occurs.
with open( 'lexicon.csv' , 'w' , encoding = 'utf-8' ) as csv:

    ## print header
    csv.write( 'lexicon,relative_count\n' )

    for l in lexicons:
        print(f'{l} ...')

        # `l` is the topic name; the terms themselves are in lexicons[l].
        countOccurrences = sum( freq.get(word,0) for word in lexicons[l] )

        # Guard against an empty text, which would divide by zero.
        relative_count = countOccurrences / tokens if tokens else 0.0
        csv.write( f'{l},{relative_count}\n')

print("Done!")


In the cell below, the counts that have been made for the terms from the various lexicons are visualised as a bar chart. The variables `x` and `y` hold the names of the columns of 'lexicon.csv' that are plotted: `x` refers to the column with the lexicon names (without the .txt extension), and `y` to the column with the relative counts. 

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd

# Draw the relative count of every lexicon in 'lexicon.csv' as a bar chart.
df = pd.read_csv('lexicon.csv')

fig = plt.figure( figsize=( 7 ,6 ) )
ax = plt.axes()

# Columns of 'lexicon.csv' shown on the horizontal and the vertical axis.
x = 'lexicon'
y = 'relative_count'


bar_width = 0.45
opacity = 0.8

ax.bar( df[x] , df[y] , width = bar_width, alpha = opacity , color = '#23a145')

# Rotate the lexicon names so that they do not overlap.
plt.xticks(rotation= 75)

# The horizontal axis lists the lexicons; the vertical axis shows their
# relative counts.
ax.set_xlabel('Lexicons' , fontsize= 12)
ax.set_ylabel('Relative count' , fontsize = 12 )
# Turn the column name into a readable title ('Relative Count').
ax.set_title( y.replace('_', ' ').title() , fontsize=20 )


plt.show()

### Exercise 14.1

Using the lexicon 'Religion.txt', can you find all the terms in *Ulysses* that are related to religion? 

In [None]:
# Address of the text used in this exercise.
my_file = 'https://raw.githubusercontent.com/peterverhaar/textmining_with_python/refs/heads/main/Corpus/Ullyses.txt'
# download() saves the response under the last path segment of the URL.
download(my_file)

In [None]:
import os
from os.path import join
from nltk.stem import WordNetLemmatizer
from collections import Counter

# List the 30 most frequent terms from the 'Religion' lexicon in the text.
file_name = basename(my_file)

print( f'\nLemmatising {file_name} ...') 
path = file_name
with open( path , encoding = 'utf-8' ) as fh:
    full_text = fh.read()
lemmatised = lemmatise(full_text)

print( 'Performing semantic tagging for {} ...'.format( file_name ) )

# Tokenise the lower-cased lemmatised text and drop the punctuation.
words = word_tokenize(lemmatised.lower())
words = remove_punctuation(words)

# Build the set once so that every membership test is a constant-time lookup.
religion_terms = set(lexicons['Religion'])
found_words = [ word for word in words if word in religion_terms ]

freq = Counter(found_words)

for word,count in freq.most_common(30):
    print(f"{word} => {count}")

In [None]:
# List the 30 most frequent terms from the 'Weather' lexicon in `words`.
# Build the set once so that every membership test is a constant-time lookup.
weather_terms = set(lexicons['Weather'])
found_words = [ word for word in words if word in weather_terms ]

freq = Counter(found_words)

for word,count in freq.most_common(30):
    print(f"{word} => {count}")

In [None]:
# List the 30 most frequent terms from the 'Military' lexicon in `words`.
# Build the set once so that every membership test is a constant-time lookup.
military_terms = set(lexicons['Military'])
found_words = [ word for word in words if word in military_terms ]

freq = Counter(found_words)

for word,count in freq.most_common(30):
    print(f"{word} => {count}")