Maziar Izadi

Data wrangling in Python



In [1]:
# Import required libraries

# Regular Expressions (REGeX)
import re
from nltk.probability import *

# Natural Language Toolkit
import nltk

# Functions creating iterators for efficient looping
import itertools
from itertools import chain
from itertools import groupby

# Convert a collection of text documents to a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# open the file and read the contents
data = []

# make sure you define the encoding format otherwise you might get this error: 
# UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 260893: character maps to <undefined>
with open('data.txt', encoding="utf8") as f:
    data = f.read().lower() # read the content and lower the letters (for consistency)

In [10]:
# make a pattern for REGEX to find and keep matching words only
pattern = re.compile(r"[a-zA-Z]+(?:[-'][a-zA-Z]+)?") 

# tokenise the words: match the pattern to file's content and tokenize the content
tokenised = pattern.findall(data) 

In [11]:
tokenised_len = len(tokenised)
indexes = [i for i, v in enumerate(tokenised) if v=='id' and i+1 < tokenised_len and tokenised[i+1]=='title']

In [4]:
# from itertools recipes
def pairwise(iterable, fillvalue=None):
    """
       This function iterates through the list of tokens and 
       creates sub list to include tokens related to one job ad only
    """
    a, b = iter(iterable), iter(iterable)
    next(b, None)
    return itertools.zip_longest(a, b, fillvalue=fillvalue)

# pairwise based on indexes in the last block and store in the 'tokenised' as a list
tokenised = [tokenised[i:j] for i,j in pairwise(indexes)] 

- Exclude words less than 3 char: 

** I have kept all the tokens which are more than 2 characters and renamed the rest to 'to_removed

** "to_remove" is added to "stopwords" list further down

In [5]:
tokenised = [[word if len(word) > 2 else "to_remove" for word in job] for job in tokenised]

### Removing stopwords

In [6]:
# adding'to_removed' string to the list of stopwords
stopwords = []
with open('stopwords_en.txt',"a") as f:
    f.write("\nto_remove") #\n to shift to next line

In [7]:
with open('stopwords_en.txt') as f:
    stopwords = f.read().splitlines() #reading stopwords line and create stopwords as a list

In [8]:
# convert stopwords into set
#Python set is better choice than list. 
# Set runs much faster than lists in terms of searching a large number of hashable items

stopwordsset = set(stopwords) 

In [9]:
def purifier(tokenList,remove_token):
    """
        This function takes two input (list of current tokens and list of tokens to be removed)
        The function converts the list into set to improve the performance
        and return a list of sets each of which include purified tokens
        and remove_token lists are removed
    """
    return [set(word for word in job if word not in remove_token) for job in tokenList]

In [10]:
# running 'purifier' function
tokenised = purifier(tokenised,stopwordsset)

You need to remove the words that appear only once in one job advertisement description, save them ( No duplication) as a txt file (refer to the required output). You will need to exclude those words in the generated vocabulary.

In [11]:
# Using chain() function to join all the words in all the job ads together by making a list
words = list(chain.from_iterable([word for word in job] for job in tokenised))
# convert the list of words into set to remove duplicates and create the set of vocabulary
vocab = set(words)
# passing the words in FreqDisrt() function to cound the number of token
# this function counts the number of times a word occurs in the whole corpus regardless which ad it is in.
fd = FreqDist(words)

In [12]:
# finding the Less Frequent Tokens.

# create a list of token which have occured more than once
# convert the list into set for performance improvement
once_only = set([k for k, v in fd.items() if v == 1])
# sort the set into alphabetical order
once_only = sorted(once_only)

set(once_only)

{'sift',
 'experiencework',
 'cyps',
 'securityengineer',
 'beech',
 'domaindriven',
 'clientbase',
 'harryannapurnait',
 'musculoskeletal',
 'stuides',
 'prelegal',
 'requiredskills',
 'traumatic',
 'gunton',
 'counseling',
 'sherburn',
 'desperate',
 'localisation',
 'cemli',
 'elaborating',
 'bullcustomer',
 'compressor',
 'cont',
 'technicalforderecruitment',
 'implementationsexperience',
 'micros',
 'snack',
 'roughly',
 'apartments',
 'ccd',
 "criteria's",
 'texile',
 'yateley',
 'membersimplement',
 'serviceusers',
 'gatewaypersonnel',
 'unauthorized',
 'accpac',
 'jakarta',
 'marsden',
 'buzzy',
 'booster',
 'semiliquid',
 'applicationsplease',
 'corrections',
 'electronicspower',
 'illess',
 'issuesyour',
 'nopcommerce',
 'enriched',
 'generalmanagerbrilliantcompanyeastlondon',
 'psychotherapy',
 'prestage',
 'sdm',
 'selfregulated',
 'scameronqmu',
 'bpas',
 'afx',
 'barcelona',
 'derhamadriasolutions',
 'partially',
 'barney',
 'aquillon',
 'opensource',
 'predicted',
 'stra

### lowFreq.txt
- Saving the sorted list of the words that appear only once in one job advertisement description to a file

In [29]:
out_file = open("lowFreq.txt", 'w')
for d in once_only:
    out_file.write(''.join(d) + '\n')
out_file.close()

In [14]:
# At this stage, I repead the same steps above
# However, this time the intention is to find the highFreq words
# start from removing lowFreq tokens from the list of tokens

tokenised = purifier(tokenised,once_only)

- Now, I'm going to create a new list of words after once_only words are removed

In [15]:
words2 = list(chain.from_iterable([word for word in job] for job in tokenised))
vocab2 = set(words2)
fd2 = FreqDist(words2)

In [16]:
highFreq = set([k for k, v in fd2.items() if v > 100])

### highFreq.txt
- Saving the sorted list of high frequent words that appear in more than 100 job advertisement description to a file

In [28]:
out_file = open("highFreq.txt", 'w')
for d in highFreq:
    out_file.write(''.join(d) + '\n')
out_file.close()

In [18]:
set(highFreq)

{'ability',
 'accept',
 'access',
 'accommodation',
 'accordance',
 'account',
 'accounting',
 'accounts',
 'accurate',
 'achieve',
 'achieving',
 'act',
 'acting',
 'active',
 'actively',
 'activities',
 'activity',
 'acts',
 'addition',
 'additional',
 'address',
 'administration',
 'adults',
 'advanced',
 'advantage',
 'advantageous',
 'advertised',
 'advice',
 'age',
 'agencies',
 'agency',
 'agile',
 'agreed',
 'aim',
 'allowance',
 'alongside',
 'alternatively',
 'ambitious',
 'analysis',
 'analyst',
 'analytical',
 'annual',
 'annum',
 'applicant',
 'applicants',
 'application',
 'applications',
 'apply',
 'applying',
 'appointments',
 'approach',
 'architecture',
 'area',
 'areas',
 'arisen',
 'asap',
 'asp',
 'aspects',
 'assess',
 'assessment',
 'assessments',
 'assignments',
 'assist',
 'assistance',
 'assistant',
 'assistants',
 'assisting',
 'associates',
 'assurance',
 'asylum',
 'attend',
 'attention',
 'attitude',
 'attributes',
 'audits',
 'authority',
 'automotive',
 

In [19]:
# running the purifier funtion to remove highFreq tokens
tokenised = purifier(tokenised,highFreq)

- Now, I'm going to create a new list of words after highFreq words are removed

In [20]:
words3 = list(chain.from_iterable([word for word in job] for job in tokenised))
vocab3 = set(words3)

** This is an extra checkpoint to monitor the progress of purification

In [21]:
print(f"Length of words: {len(words)}")
print(f"Length of vocab: {len(vocab)}")
print(f"Length of words 2: {len(words2)}")
print(f"Length of vocab 2: {len(vocab2)}")
print(f"Length of words 3: {len(words3)}")
print(f"Length of vocab 3: {len(vocab3)}")

Length of words: 494596
Length of vocab: 19932
Length of words 2: 485423
Length of vocab 2: 10759
Length of words 3: 133724
Length of vocab 3: 9776


### vocab.txt

In [22]:
vocab3 = list(vocab3)

In [23]:
# list of final vocabs
vocab = {vocab3[i]:i for i in range(0,len(vocab3))}

In [24]:
# building a function to create th vocab.txt file meeting the task's requirements
def vaocab_output(file):
    with open (file, "a") as f:
        for key in sorted(vocab.keys()):
            f.write("%s:%s\n" % (key, vocab[key]))

In [25]:
# calling the function and build the file
vaocab_output("vocab.txt")

### Parser.txt

- Since I took on this task in the end, I realised all previous steps were included in this one as well.
- As a result, I structured the whole code in a new format.
- At this stage, I had a better understanding of the assessment requirement. So I structured the need more neetly.

In [26]:
data = {}
id = None
with open('data.txt', 'r') as f:
    for i, line in enumerate(f): # create the iteration in the range of imported file's length
        line = line.lower() 
        line = line.strip()
        if not line:
            continue
        section = line.split(':')[0] # define 'section' as a method to manupilate each line based on how the line begins
        content = ':'.join(line.split(':')[1:]).strip() # define 'content' a method to capture tokens
        if section == 'id': # id section:
            if id: # Error handle if theres some bad formatting: multiple ids
                raise ValueError('unable to parse file at line %d, multiple ids' % i)
            id = content[1:] # capture the job id
            if id in data.keys():# Error handle if theres some bad formatting: duplicates
                raise ValueError('unable to parse file at line %d, duplicate id' % i)
        elif section == 'description': #capture job description per each job ad
            if not id:# Error handle if theres some bad formatting: missing id
                raise ValueError('unable to parse file at line %d, missing id' % i)
            content = pattern.findall(line)
            content = [value for value in content if len(value) > 2] # remove short character token
            content = [value for value in content if value not in stopwordsset] # remove stopwords
            content = [value for value in content if value not in once_only] # remove lowFreq token
            content = [value for value in content if value not in highFreq] # remove highFreq tokens
            data[id] = content # creates data dictionary
            id = None
        elif section == 'title': # if the line start with 'title' do nothing
            continue
        else:
            raise ValueError('unable to parse file at line %d, unexpected section name' % i)

In [27]:
# Build output file:

with open('sparse.txt',"w") as f:
    for jobID,content in data.items(): # go through data dictionary created in the last block
        fd_parse = FreqDist(content) # count number of times each token occured in the same job ad
        tmp = "" # create a placeholder for word_index:word_freq
        for (x,y) in fd_parse.items(): # iterate through each frequencies
            tmp += f"{vocab[x]}:{y}," # build the dictionary of word_index:word_freq in the placeholder
        f.write(f"{jobID},{tmp[:-1]}\n") # write in the file line by line