Maziar Izadi

Data wrangling in Python

Email: alpha.maziar

In [1]:
# Import required libraries

# Regular Expressions (REGeX)
import re
from nltk.probability import *

# Natural Language Toolkit
import nltk

# Functions creating iterators for efficient looping
import itertools
from itertools import chain
from itertools import groupby

# Convert a collection of text documents to a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the whole corpus into memory as a single string.
data = []

# The encoding is stated explicitly; relying on the platform default can fail with:
# UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 260893: character maps to <undefined>
with open('data.txt', encoding="utf8") as corpus_file:
    # lower-case the text so that tokens compare consistently later on
    data = str.lower(corpus_file.read())

In [3]:
# Token pattern: a run of letters, optionally followed by ONE hyphen or apostrophe
# and a second run of letters (e.g. "part-time", "driver's").
pattern = re.compile(r"[a-zA-Z]+(?:[-'][a-zA-Z]+)?")

# Tokenise: collect every non-overlapping match of the pattern, in order of appearance.
tokenised = re.findall(pattern, data)

In [4]:
# total number of tokens found in the corpus
tokenised_len = len(tokenised)

# Positions of every 'id' token that is immediately followed by a 'title' token.
# Each token is paired with its successor; zip() stops at the shorter input,
# so the very last token is never looked at without a neighbour.
indexes = [
    position
    for position, (token, following) in enumerate(
        zip(tokenised, itertools.islice(tokenised, 1, None))
    )
    if token == 'id' and following == 'title'
]

In [5]:
# adapted from the itertools recipes
def pairwise(iterable, fillvalue=None):
    """
       Pair every element of `iterable` with its successor.

       For [x0, x1, ..., xn] the result is (x0, x1), (x1, x2), ..., (xn, fillvalue):
       the last element is paired with `fillvalue`. An empty iterable yields nothing.

       itertools.tee is used to obtain two independent iterators. Calling iter()
       twice on a one-shot iterator (generator, file object, ...) returns the same
       object both times, which would silently skip elements.

       Returns an iterator of 2-tuples.
    """
    a, b = itertools.tee(iterable)
    next(b, None)  # move the second iterator one element ahead of the first
    return itertools.zip_longest(a, b, fillvalue=fillvalue)

# Cut the flat token list at consecutive positions from 'indexes'.
# The final pair has None as its end, so the last slice runs to the end of the list.
tokenised = [tokenised[slice(*bounds)] for bounds in pairwise(indexes)]

- Exclude words less than 4 char: 

- I have kept all the tokens that are longer than 3 characters and replaced the rest with the placeholder 'to_remove'

- "to_remove" is added to "stopwords" list further down

In [6]:
# Swap every token of 3 characters or fewer for the "to_remove" placeholder;
# longer tokens are kept as they are.
tokenised = [
    ["to_remove" if len(word) <= 3 else word for word in job]
    for job in tokenised
]

### Removing stopwords

In [7]:
# Add the 'to_remove' placeholder to the stopwords file, but only when it is not
# already listed, so that re-running this cell does not append duplicate lines.
stopwords = []
with open('stopwords_en.txt', "a+") as f:
    f.seek(0)  # "a+" opens positioned at the end; rewind to read the existing entries
    if "to_remove" not in f.read().splitlines():
        f.write("\nto_remove")  # leading \n puts the placeholder on its own line

In [8]:
# Load the stopword list: one entry per line of the file.
with open('stopwords_en.txt') as stopword_file:
    stopwords = stopword_file.read().splitlines()

In [9]:
# Keep the stopwords in a set rather than a list: a membership test on a set is a
# hash lookup, whereas a list has to be scanned element by element.

stopwordsset = {*stopwords}

In [10]:
def purifier(tokenList,remove_token):
    """
        Drop unwanted tokens from every job ad.

        tokenList    -- iterable of jobs, each job being an iterable of tokens
        remove_token -- iterable of tokens to discard (list, set, generator, ...)

        remove_token is converted to a frozenset once, up front, so every
        membership test is a hash lookup even when a list is passed in, and a
        one-shot iterator is not exhausted after the first lookups.

        Returns a list with one set per job holding that job's remaining tokens;
        because they are sets, duplicates inside a job collapse to one entry.
    """
    unwanted = frozenset(remove_token)
    return [{word for word in job if word not in unwanted} for job in tokenList]

In [11]:
# strip the stopwords out of every job's tokens
tokenised = purifier(tokenList=tokenised, remove_token=stopwordsset)

You need to remove the words that appear only once in one job advertisement description, save them ( No duplication) as a txt file (refer to the required output). You will need to exclude those words in the generated vocabulary.

In [12]:
# Flatten the per-job token collections into one long list of words.
stop_wrds_removed_words = list(chain.from_iterable(tokenised))
# The distinct words form the vocabulary at this stage.
stop_wrds_removed_vocab = set(stop_wrds_removed_words)
# Count how often every word occurs in the flattened list.
# NOTE: purifier returns one set per job, so (assuming the purifier cell above was run)
# a word is counted at most once per ad and the count is the number of ads containing it.
fd = FreqDist(stop_wrds_removed_words)

In [13]:
# Find the low-frequency tokens.

# Keep the words whose count in 'fd' is exactly 1, in alphabetical order.
# The result is a sorted list.
once_only = sorted(word for word, count in fd.items() if count == 1)

# show the words as this cell's output
set(once_only)

{'dobson',
 'nurs',
 'applianceengineerwhitegoods',
 'rman',
 'thereon',
 'techniquesdelivery',
 "steady's",
 'assistantaylesbury',
 'articulated',
 'cary',
 'managementprocess',
 'multicontact',
 'snack',
 'perfmon',
 'emulate',
 'openedge',
 'tredegar',
 'caddesignerwithestimating',
 'projectdriven',
 'lookin',
 'yacht',
 'autocoded',
 'opportuities',
 'outcomesexperience',
 'scenic',
 'tewksbury',
 'retrofit',
 'xiexperience',
 'cobol',
 'declinature',
 'announced',
 'sticky',
 'pcduproviders',
 'airborne',
 'achievedconduct',
 'joel',
 'projectmanagermefacilitiesmanagementwestlondon',
 'poststudy',
 'dunfermline',
 'munirm',
 'magnox',
 'tolondonbridge',
 'coldrooms',
 'qualityrequired',
 'unvented',
 'wellcome',
 'conwy',
 'relationshipsacross',
 'perfecting',
 'workplease',
 'remeasurement',
 'bfms',
 'minipiling',
 'dualcom',
 'rabbitmq',
 'hospice',
 'albeit',
 'mariani',
 'highintegrity',
 'pyrometry',
 'hfea',
 'hang',
 'recalibration',
 'skinny',
 'broadberryrisetechnical',


### lowFreq.txt
- Saving the sorted list of the words that appear only once in one job advertisement description to a file

In [14]:
# Write the low-frequency words to lowFreq.txt, one word per line.
# The with-block closes the file even if a write fails part-way through.
with open("lowFreq.txt", 'w') as out_file:
    out_file.writelines(f"{word}\n" for word in once_only)

In [15]:
# At this stage the same steps are repeated,
# this time with the intention of finding the highFreq words.
# Start by removing the lowFreq tokens from every job's tokens.
# once_only is a sorted list; handing it over as a set makes each membership
# test a hash lookup instead of a scan through the whole list.

tokenised = purifier(tokenised, set(once_only))

- Now, I'm going to create a new list of words after once_only words are removed

In [16]:
# flatten the per-job tokens again now that the once-only words are gone
LowFreqRemoved_Words = [word for job in tokenised for word in job]
# distinct words remaining
LowFreqRemoved_vocab = set(LowFreqRemoved_Words)
# occurrence count of every remaining word
LowFreqRemoved_fd = FreqDist(LowFreqRemoved_Words)

In [17]:
# words whose count in LowFreqRemoved_fd is greater than 100
highFreq = {word for word, count in LowFreqRemoved_fd.items() if count > 100}

### highFreq.txt
- Saving the sorted list of high-frequency words (those that appear in more than 100 job advertisement descriptions) to a file

In [18]:
# Write the high-frequency words to highFreq.txt, one word per line.
# highFreq is a set, so it is sorted here to give the file a stable, alphabetical
# order; the with-block closes the file even if a write fails.
with open("highFreq.txt", 'w') as out_file:
    out_file.writelines(f"{word}\n" for word in sorted(highFreq))

In [19]:
# display the high-frequency words as this cell's output
set(highFreq)

{'ability',
 'accept',
 'access',
 'accommodation',
 'accordance',
 'account',
 'accounting',
 'accounts',
 'accurate',
 'achieve',
 'achieving',
 'acting',
 'active',
 'actively',
 'activities',
 'activity',
 'acts',
 'addition',
 'additional',
 'address',
 'administration',
 'adults',
 'advanced',
 'advantage',
 'advantageous',
 'advertised',
 'advice',
 'agencies',
 'agency',
 'agile',
 'agreed',
 'allowance',
 'alongside',
 'alternatively',
 'ambitious',
 'analysis',
 'analyst',
 'analytical',
 'annual',
 'annum',
 'applicant',
 'applicants',
 'application',
 'applications',
 'apply',
 'applying',
 'appointments',
 'approach',
 'architecture',
 'area',
 'areas',
 'arisen',
 'asap',
 'aspects',
 'assess',
 'assessment',
 'assessments',
 'assignments',
 'assist',
 'assistance',
 'assistant',
 'assistants',
 'assisting',
 'associates',
 'assurance',
 'asylum',
 'attend',
 'attention',
 'attitude',
 'attributes',
 'audits',
 'authority',
 'automotive',
 'availability',
 'award',
 'awar

In [20]:
# strip the highFreq words out of every job's tokens
tokenised = purifier(tokenList=tokenised, remove_token=highFreq)

- Now, I'm going to create a new list of words after highFreq words are removed

In [21]:
# flatten the per-job tokens once more, now without the highFreq words
HighFreqRemoved_words = [word for job in tokenised for word in job]
# distinct words remaining
HighFreqRemoved_vocab = set(HighFreqRemoved_words)

**This is an extra checkpoint to monitor the progress of purification.**

In [22]:
# Report the size of the word list and vocabulary after each purification stage.
print(
    f"Length of words: {len(stop_wrds_removed_words)}",
    f"Length of vocab: {len(stop_wrds_removed_vocab)}",
    f"Length of LowFreqRemoved_Words: {len(LowFreqRemoved_Words)}",
    f"Length of LowFreqRemoved_vocab: {len(LowFreqRemoved_vocab)}",
    f"Length of HighFreqRemoved_words: {len(HighFreqRemoved_words)}",
    f"Length of HighFreqRemoved_vocab: {len(HighFreqRemoved_vocab)}",
    sep="\n",
)

Length of words: 474345
Length of vocab: 18619
Length of LowFreqRemoved_Words: 465779
Length of LowFreqRemoved_vocab: 10053
Length of HighFreqRemoved_words: 126491
Length of HighFreqRemoved_vocab: 9103


### vocab.txt

In [23]:
# Turn the vocabulary into an alphabetically sorted list. The iteration order of a
# set of strings can differ from one interpreter run to the next, so sorting keeps
# each word's position (used as its index below) reproducible.
HighFreqRemoved_vocab = sorted(HighFreqRemoved_vocab)

In [24]:
# final vocabulary: each word mapped to its position in HighFreqRemoved_vocab
vocab = {word: index for index, word in enumerate(HighFreqRemoved_vocab)}

In [25]:
# building a function to create the vocab.txt file meeting the task's requirements
def vaocab_output(file, vocabulary=None):
    """
        Write a vocabulary to `file`, one "word:index" entry per line, sorted by word.

        file       -- path of the output file; it is opened in write mode and therefore
                      overwritten, so calling the function again does not append a
                      second copy of the entries
        vocabulary -- mapping of word -> index; defaults to the notebook-level `vocab`
    """
    if vocabulary is None:
        vocabulary = vocab
    with open(file, "w") as f:
        for key in sorted(vocabulary):
            f.write("%s:%s\n" % (key, vocabulary[key]))

In [26]:
# write the vocabulary file
vaocab_output(file="vocab.txt")

### Parser.txt

- Since I took on this task in the end, I realised all previous steps were included in this one as well.
- As a result, I structured the whole code in a new format.
- At this stage, I had a better understanding of the assessment requirements, so I structured the code more neatly.

In [47]:
# Parse data.txt line by line into {job id: filtered description tokens}.
data = {}
current_id = None  # id of the ad currently being read; None between ads
# Every token to discard, gathered in one set. once_only is a sorted list, and testing
# membership against it for each token would scan the whole list every time.
unwanted_tokens = set(stopwordsset) | set(once_only) | set(highFreq)
with open('data.txt', 'r',encoding="utf8") as f:
    for i, line in enumerate(f, start=1): # i is the 1-based line number reported in error messages
        line = line.lower().strip()
        if not line:
            continue # skip blank lines
        # 'section' is the text before the first ':', 'content' is everything after it
        section, separator, content = line.partition(':')
        content = content.strip()
        if section == 'id': # id section:
            if current_id: # bad formatting: a second id before the previous ad's description
                raise ValueError('unable to parse file at line %d, multiple ids' % i)
            current_id = content[1:] # drop the first character of the value; the rest is the job id
            if current_id in data: # bad formatting: this id has already been stored
                raise ValueError('unable to parse file at line %d, duplicate id' % i)
        elif section == 'description': # capture the job description of the current ad
            if not current_id: # bad formatting: description without a preceding id
                raise ValueError('unable to parse file at line %d, missing id' % i)
            # Tokenise the whole line and keep, in order, only the tokens longer than
            # 3 characters that are not stopwords, lowFreq or highFreq words.
            data[current_id] = [
                value
                for value in pattern.findall(line)
                if len(value) > 3 and value not in unwanted_tokens
            ]
            current_id = None # the ad is complete; wait for the next id
        elif section == 'title': # title lines are not used
            continue
        else:
            raise ValueError('unable to parse file at line %d, unexpected section name' % i)

In [49]:
# Build the output file: one line per job ad in the form
#   jobID,word_index:word_freq,word_index:word_freq,...

with open('sparse.txt',"w") as f:
    for jobID, content in data.items(): # every ad stored in the data dictionary
        # count the tokens of this ad and render each as "vocab index:count"
        entries = ",".join(
            f"{vocab[token]}:{count}" for token, count in FreqDist(content).items()
        )
        f.write(f"{jobID},{entries}\n") # one line per ad

---end---