## Import python modules 

In [1]:
import re # regular expressions: working with string 
import pandas as pd # working with dataset
import enchant # dictionary
d = enchant.Dict("en_US") 

In [2]:
# test dictionary module
d.check('N?oriental')

False

### Import manually created files

In [3]:
remove_words_file = open('remove_words.csv').read()
remove_words = ['\\b' + w.lower() + '\\b' for 
                w in re.split('\n',remove_words_file) if not w=='']

### Define auxiliary functions 

In [4]:
def reset_space(input_string):
    # This function removes unnecessary white spaces.
    string_reset_space = [w for w in re.split(' ',input_string) 
                          if not w=='']
    return ' '.join(string_reset_space)

def initial_clean(input_string):
    # This function performs initial cleaning steps.
    # (1.) convert to lowercase
    clean = input_string.lower()
    
    # (2.) remove abbrevations (any word or letter followed by a period, ".")
    clean = re.sub(r'\w+\.','', clean)
    clean = reset_space(clean)
    
    # (3.) remove characters in parentheses (remove also the parentheses)
    clean = re.sub(re.escape('(') + '[a-z0-9 ]+' + re.escape(')'),' ',clean)
    clean = reset_space(clean)
    
    # (4.) replace ",", ".", ":", ";", "/", "?" and "!" with a whitespace, " "
    remove_character = [',', '.', ':', ';', '/', '?', '!']
    for i in remove_character:
        clean = clean.replace(i,' ')
    clean = reset_space(clean)
        
    # (5.) change "it" to "information technology"
    clean = re.sub(r'\bit\b', ' information technology ', clean)
    clean = reset_space(clean)
    
    # (6.) remove numbers and ordinal number suffix, ie., "1st", "2nd"
    clean = re.sub(r'\b1st|2nd|3rd|\d+th\b', ' ', clean)
    clean = re.sub(r'\d+', ' ', clean)
    clean = reset_space(clean)
    
    # (7.) remove words in "remove_words.csv"
    clean = re.sub('|'.join(remove_words),' ',clean)
    clean = reset_space(clean)
    
    return clean

def detect_non_ascii(text):
    # This function detects non-ASCII characters. 
    output = False
    if max([ord(w) for w in str(text)]) >= 127:
        output = True
    return output

def count_word(text):
    # This function count words.
    alphabets = re.sub('[^a-z]',' ',text.lower())
    tokens = [w for w in re.split(' ',alphabets) if not w=='']
    return len(tokens)

def detect_strange_text(text):
    # This function detect strange text
    alphabets = re.sub('[^a-z]',' ',text.lower())
    tokens = [w for w in re.split(' ',alphabets) if not w=='']
    
    output = False
    for word in tokens:
        if not d.check(word):
            output = True

    return output

### Import education entries

In [5]:
data_filename = 'educ_danish.csv'

data = pd.read_csv(data_filename,
                   sep = ',',
                   encoding = 'cp865',
                   names = ['code','text'],
                   header=0, 
                   dtype = object)

# Note: encoding = 'cp865' is for Danish and Norwegian. 
print('total rows = ' + str(len(data)))
data.head() #print example

total rows = 2449


Unnamed: 0,code,text
0,320,"Pedagogy, ivu (eud)"
1,339,"Trade / office, ivu (eud)"
2,353,"Construction, ivu (eud)"
3,354,"Iron / metal, ivu (eud)"
4,355,"Graphic, ivu (eud)"


###  Preprocess text (see "initial_clean" function)
1. convert to lowercase
2. remove abbrevations (any word or letter followed by a period, ".")
3. remove any words or letters in parentheses (remove also the parentheses)
4. replace ",", ".", ":", ";", "/", "?" and "!" with a whitespace, " "
5. change "it" to "information technology"
6. remove numbers and ordinal number suffix, ie., "1st", "2nd"
7. remove words in "remove_words.csv"

In [6]:
# an example
initial_clean('IT 1st instructor (test) v.o.v. PhD')

'information technology instructor'

In [7]:
# preprocess text: create a new column called 'clean_text'
data['clean_text'] = data['text'].apply(lambda x: initial_clean(x))
data.head()

Unnamed: 0,code,text,clean_text
0,320,"Pedagogy, ivu (eud)",pedagogy
1,339,"Trade / office, ivu (eud)",trade office
2,353,"Construction, ivu (eud)",construction
3,354,"Iron / metal, ivu (eud)",iron metal
4,355,"Graphic, ivu (eud)",graphic


In [8]:
# some education entries become empty after initial cleaning steps 
data[data['clean_text'] == '']

Unnamed: 0,code,text,clean_text
42,700,"Bachelor una, ivu (bachelor)",
51,720,"Ph.D. una, ivu (PhD)",
722,4722,Vejgodstransportudd. una,
1284,5680,MA. una,
1332,5745,"Education, Licensing. (DPU)",
1333,5746,"Education, Examination. (DPU)",
1548,6145,"Education and education, superstructure (RUC)",
1839,7003,"Education, mag.art.",
2409,9122,MA. una,
2410,9123,Cand.phil. una,


### Detect education entries containing non-ASCII characters

In [9]:
# delete empty cleaned entries
data = data[data['clean_text'] != '']

# detect entries with non-ASCII characters
data['non_ascii'] = data['text'].apply(lambda x: detect_non_ascii(x))

# print entries with non-ASCII characters 
data_non_ascii = data[data['non_ascii']==True]
data_non_ascii.to_csv('non_ascii.csv',index=False)
data_non_ascii

Unnamed: 0,code,text,clean_text,non_ascii
2250,8555,Media management (G├àU),media management (g├àu),True


### Import manually corrected education entries 

In [10]:
data_revise_filename = 'manual_correction.csv' 
data_revise = pd.read_csv(data_revise_filename,
                          sep = ',',
                          encoding = 'cp865',
                          names = ['code','text','clean_text'],
                          header = 0,
                          dtype = object)

# drop empty entries
data_revise = data_revise[~data_revise['clean_text'].isnull()]

data_revise.head()

Unnamed: 0,code,text,clean_text
0,8559,Technological diploma education in bio and pro...,bioengineering chemistry
1,7746,Danish candleholder. (DPU),language literature
2,4366,manlift Mechanic,repair troubleshoot mechanic lift
3,5401,"Agricultural Economics, cand.agro.┐k.",agricultural economics
4,5404,"Agricultural Economics, cand.agro.┐k.",agricultural economics


In [11]:
# create a new dataframe "replace_obs" to record entries 
# which have been manually corrected.
replace_obs = data_revise
replace_obs['replace_obs'] = True
replace_obs = replace_obs[['code','replace_obs']]
replace_obs.head()

Unnamed: 0,code,replace_obs
0,8559,True
1,7746,True
2,4366,True
3,5401,True
4,5404,True


In [12]:
# merge "replace_obs" to "data" 
data = data.merge(replace_obs, on = 'code', how = 'left')

# drop entries in "data" where replace_obs is True 
data = data[data['replace_obs'] != True]

# merge "data_revise" to "data"   
data = data.append(data_revise, ignore_index=True)
data = data[['code','text','clean_text']]
data

Unnamed: 0,code,text,clean_text
0,320,"Pedagogy, ivu (eud)",pedagogy
1,339,"Trade / office, ivu (eud)",trade office
2,353,"Construction, ivu (eud)",construction
3,354,"Iron / metal, ivu (eud)",iron metal
4,355,"Graphic, ivu (eud)",graphic
5,358,"Technology / industry by the way, ivu (eud)",technology industry way
6,360,"Service, ivu (eud)",service
7,375,"Food / household, ivu (eud)",food household
8,380,"Agriculture / fisheries, ivu (eud)",agriculture fisheries
9,385,"Transport, etc., ivu (eud)",transport


### Count words in each data entry

In [13]:
data['count_word'] = data['clean_text'].apply(lambda x: count_word(x))
data.head(10)

Unnamed: 0,code,text,clean_text,count_word
0,320,"Pedagogy, ivu (eud)",pedagogy,1
1,339,"Trade / office, ivu (eud)",trade office,2
2,353,"Construction, ivu (eud)",construction,1
3,354,"Iron / metal, ivu (eud)",iron metal,2
4,355,"Graphic, ivu (eud)",graphic,1
5,358,"Technology / industry by the way, ivu (eud)",technology industry way,3
6,360,"Service, ivu (eud)",service,1
7,375,"Food / household, ivu (eud)",food household,2
8,380,"Agriculture / fisheries, ivu (eud)",agriculture fisheries,2
9,385,"Transport, etc., ivu (eud)",transport,1


### Detect entries with strange words

In [14]:
data['strange_text'] = data['clean_text'].apply(lambda x: detect_strange_text(x))
data.to_csv('clean_text.csv',index=False)
data[data['strange_text'] == True]

Unnamed: 0,code,text,clean_text,count_word,strange_text
391,4356,Telecom Installation Technician,telecom installation technician,3,True
483,4478,Film and TV assistant,film tv assistant,3,True
484,4479,Film and TV production technicians,film tv production technicians,4,True
655,4878,Bacontilvirker,bacontilvirker,1,True
676,4918,retail TV and radio,retail tv radio,3,True
843,5168,"Healthcare professional, diploma exam",healthcare professional,2,True
852,5177,Pharmaconomist,pharmaconomist,1,True
902,5236,"Acoustics and Audiotechnology, Master of Scien...",acoustics audiotechnology science,3,True
906,5265,"Biomechanics, cand.manu.",biomechanics,1,True
1041,5451,Clinical dietician,clinical dietician,2,True
