## Import Python modules

In [1]:
import os
import re
import pandas as pd
import nltk
from collections import Counter

## Import data

In [2]:
# Input table; note that the file was manually re-saved in CSV format beforehand.
data_filename = 'educ_occ_dk.csv'

# The 'cp865' codec is the one for Danish and Norwegian; header=0 takes the
# column names from the first row of the file.
data = pd.read_csv(data_filename, header=0, encoding='cp865')

print(f'total rows = {len(data)}')
data.head(20)  # preview the first rows

total rows = 794


Unnamed: 0,Valid from,Valid until,Code,Text
0,01-01-1600,31-12-9999,0,undisclosed
1,01-01-1600,31-12-9999,11000,Military / Civil Defense
2,01-01-1600,31-12-9999,100000,"Leadership at the top level in companies, orga..."
3,01-01-1600,31-12-9999,110000,Legislative work and management in public admi...
4,01-01-1600,31-12-9999,111000,Legislative work and overall administration of...
5,01-01-1600,31-12-9999,112000,General public management
6,01-01-1600,31-12-9999,114000,Overall management of interest organizations a...
7,01-01-1600,31-12-9999,114100,Management of political party organizations
8,01-01-1600,31-12-9999,114200,Management of economic interest organizations
9,01-01-1600,31-12-9999,114210,Politically responsible / politically elected ...


## Extract all text and preprocess

In [3]:
# Lower-case every description in the 'Text' column.
# pandas.read_csv loads empty CSV fields as NaN (a float) by default, so
# missing values are removed with dropna() before str.lower() is called;
# the '' check is kept for any literally empty strings.
text_column = data['Text'].dropna()
all_text = [t.lower() for t in text_column if t != '']

all_text[:10]  # print some examples

['undisclosed',
 'military / civil defense',
 'leadership at the top level in companies, organizations and the public sector',
 'legislative work and management in public administration and interest organizations',
 'legislative work and overall administration of legislation, before 2004 incl.╩overall public management',
 'general public management',
 'overall management of interest organizations and humanitarian organizations',
 'management of political party organizations',
 'management of economic interest organizations',
 'politically responsible / politically elected leaders in economic interest organizations']

In [4]:
# Merge all descriptions into one space-separated string
# (all_text changes from a list of strings to a single string here).
word_separator = ' '
all_text = word_separator.join(all_text)

all_text[:1000]  # show the first 1000 characters

'undisclosed military / civil defense leadership at the top level in companies, organizations and the public sector legislative work and management in public administration and interest organizations legislative work and overall administration of legislation, before 2004 incl.╩overall public management general public management overall management of interest organizations and humanitarian organizations management of political party organizations management of economic interest organizations politically responsible / politically elected leaders in economic interest organizations employed leaders in economic interest organizations management work in humanitarian or other interest groups top management in the company in business with at least 10 employees management comprising the company as a whole in business with at least 10 employees top management level, ceo or equivalent in company with at least 10 employees crossing directors in business with at least 10 employees management of the

In [5]:
# Note: it is unclear whether the Python text-analysis functions used in this
# notebook handle Danish text, so only the plain letters a-z are kept.

# Replace every character that is not a lowercase ASCII letter or a space
# with a space. This one pattern also covers non-ASCII characters (e.g.
# Danish letters), digits and punctuation, so no separate passes are needed.
# Uppercase letters would be replaced as well, so the input is expected to be
# lower-cased already. A raw string is used so that regex escapes are never
# interpreted by Python first.
all_text_preprocessed = re.sub(r'[^a-z ]', ' ', all_text)

# Split the text into a list of words. str.split() with no argument collapses
# runs of spaces and returns [] (not ['']) when no letters remain.
list_all_words = all_text_preprocessed.split()

# Reset white space: words separated by exactly one space.
all_text_preprocessed = ' '.join(list_all_words)

In [6]:
# Show every distinct word found in the preprocessed text.
unique_words = set(list_all_words)
print(unique_words)



## Select relevant words
According to "Natural Language Processing with Python" by Steven Bird, Ewan Klein, and Edward Loper, stopwords are usually "[words with] little lexical content, and their presence in a text fails to distinguish it from other texts." See [here](http://www.nltk.org/book/ch02.html) for a fuller explanation. NLTK's English stopword list is printed below:

In [7]:
from nltk.corpus import stopwords

# Print NLTK's English stopword list.
english_stopword_list = list(stopwords.words('english'))
print(english_stopword_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

In [8]:
# remove stopwords
# Build the stopword set once, outside the comprehension: calling
# stopwords.words('english') in the filter condition would re-evaluate it for
# every word, and membership tests on a list are linear whereas a set lookup
# is constant time.
english_stopwords = set(stopwords.words('english'))
selected_words = [w for w in list_all_words if w not in english_stopwords]

# NOTE: the value printed here is the number of remaining word occurrences
# (duplicates included), not the number of distinct words; the label is kept
# as-is so previously recorded output stays comparable.
print('Total unique number of words = ' + str(len(selected_words)))

# count the rest: most_common() already returns (word, count) pairs sorted by
# descending count, so it can be passed to the DataFrame constructor directly
count_words = pd.DataFrame(Counter(selected_words).most_common(),
                           columns=['word', 'count'])

# export to csv file
count_words.to_csv('count_words.csv', index=False)

# print some examples
count_words.head(10)

Total unique number of words = 3350


Unnamed: 0,word,count
0,work,414
1,management,61
2,operation,53
3,employees,49
4,etc,46
5,business,39
6,machines,37
7,least,30
8,production,30
9,transport,27
