In [None]:
import os
import platform
import sys
import requests
import json
import operator
import time
import datetime
import nltk
import matplotlib
import matplotlib.pyplot as pyplot

# Make sure the output folder exists before any cell tries to write into it.
# The path literal differs per platform only in its separator style.
output_dir = '..\\output' if platform.system() == 'Windows' else '../output/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Log the start time of the run, followed by a visual separator.
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
print('_______________________________________________')

In [None]:
from nltk import ngrams, FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
# Fetch the NLTK resources requested by name below. The stop-word corpus is
# read later via stopwords.words('english'); 'punkt' and 'wordnet' are
# presumably the data behind word_tokenize and WordNetLemmatizer — confirm
# against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
def writeText(text, path, mode = 'a'):
    """Write ``text`` to the file at ``path``.

    Args:
        text: String to write.
        path: Target file path.
        mode: Mode passed to ``open``; the default ``'a'`` appends.
    """
    with open(path, mode) as handle:
        handle.write(text)
        
def writeJson(json, path, mode = 'w'):
    """Serialise ``json`` with ``json.dumps`` and write it to ``path``.

    Args:
        json: JSON-serialisable object (dict, list, ...). The parameter name
            is kept for backward compatibility with keyword callers.
        path: Target file path.
        mode: Mode passed to ``open``; the default ``'w'`` overwrites.
    """
    # The parameter shadows the ``json`` module, so the module has to be
    # imported under a different name; otherwise ``json.dumps`` would be
    # looked up on the payload object and fail.
    import json as json_module

    with open(path, mode) as file:
        file.write(json_module.dumps(json))
        
def writeCsv(listOut, outputFile):
    """Write the rows in ``listOut`` to ``outputFile`` as comma-separated values.

    Args:
        listOut: Iterable of rows; each row is an iterable of cell values.
        outputFile: Path of the CSV file, overwritten if it already exists.
    """
    import csv

    # newline='' lets the csv module control the line endings itself.
    with open(outputFile, "w", newline='') as csv_handle:
        csv.writer(csv_handle, delimiter=",").writerows(listOut)

def getTxt(path):
    """Return the full text content of the file at ``path``.

    The file is opened in text mode with the platform default encoding and
    is closed again before returning.
    """
    with open(path, 'r') as infile:
        return infile.read()

In [None]:
def getToken():
    """Read the API bearer token from ``../token/token.txt``.

    Returns:
        The token with surrounding whitespace removed. Token files usually
        end with a newline, and a newline inside the ``Authorization``
        header value would make the request invalid.
    """
    # os.path.join picks the right separator, so no platform branch is needed.
    token_path = os.path.join('..', 'token', 'token.txt')
    with open(token_path, 'r') as token_file:
        return token_file.read().strip()

In [None]:
def getJsonDocs(page = 1, itemsperpage = 200):
    """Download all World Bank data set documents from the paginated API.

    Pages are requested one after another, starting at ``page``, until the
    service answers with an empty payload.

    Args:
        page: Number of the first page to request.
        itemsperpage: Number of documents requested per page.

    Returns:
        A list with the elements of every page, in the order received. If
        the service answers with an HTTP error status, the error is printed
        and the documents collected so far are returned.
    """
    base_url = "https://app.23degrees.io/services/pub/api/v1/opendata/getWorldBankDataSetsPaginated/"
    # Build the headers once so the token file is not re-read for every page.
    headers = {
        'Content-Type': 'application/json',
        'Authorization': 'Bearer ' + getToken(),
    }
    list_return = []

    while True:
        try:
            # timeout=None is kept from the original: a request may block
            # indefinitely if the server never answers.
            r = requests.get(
                base_url + str(page) + "/" + str(itemsperpage),
                timeout=None,
                headers=headers,
            )
            # Without this call requests never raises HTTPError, so the
            # handler below would be dead code and error bodies would be
            # parsed as if they were data.
            r.raise_for_status()
        except requests.exceptions.HTTPError as err:
            print(err)
            # Stop paging: retrying the following pages after a failure
            # could loop forever.
            break

        jsondata = r.json()
        if not jsondata:
            # An empty page marks the end of the data.
            break
        list_return.extend(jsondata)
        page += 1

    return list_return

In [None]:
def getFeaturesLabels(list_in):
    """Project every row of ``list_in`` onto its items at index 1, 2 and 3.

    Args:
        list_in: Iterable of indexable rows with at least four items.

    Returns:
        A new list with one three-item list per input row.
    """
    return [[row[1], row[2], row[3]] for row in list_in]

In [None]:
# Download every document, starting at page 1 with 200 items per page.
list_js_docs = getJsonDocs(1,200)

In [None]:
# Show the sixth downloaded document as a sample of the JSON structure.
# Raises IndexError when fewer than six documents were downloaded.
list_js_docs[5]

In [None]:
# Split the downloaded documents into those with and without tags and count
# how often every tag occurs.
list_list_js_tags = [] #Format: jsondoc(str)|name(str)|description(str)|tags(list)
list_js_tags_inner = []
list_js_notags = []
list_unique_tags = []   # tags in order of first appearance
list_tags = []          # every tag occurrence, duplicates included
dict_tags = {}          # tag -> number of occurrences
for jsondoc in list_js_docs:
    jstmp = json.dumps(jsondoc, sort_keys=True,indent=4, separators=(',', ': '))

    # Look the tags up where they are actually read from. A substring test on
    # the serialised document also matches documents that merely mention
    # "tags" somewhere else, which then fail with a KeyError.
    try:
        doc_tags = jsondoc['typeSpecific']['context']['tags']
    except (KeyError, TypeError):
        doc_tags = None

    if doc_tags is None:
        list_js_notags.append(jstmp)
        continue

    tmp_list_tags = list(doc_tags)
    list_tags.extend(tmp_list_tags)
    for tag in tmp_list_tags:
        # dict_tags has exactly the keys stored in list_unique_tags, so the
        # dict lookup replaces a linear scan of the list.
        if tag not in dict_tags:
            list_unique_tags.append(tag)
            dict_tags[tag] = 1
        else:
            dict_tags[tag] += 1

    list_js_tags_inner = [jstmp, jsondoc['name'], jsondoc['description'], tmp_list_tags]
    list_list_js_tags.append(list_js_tags_inner)

In [None]:
# (tag, count) pairs ordered by descending count; the sort is stable, so
# tags with equal counts keep their insertion order.
list_tags_sorted = sorted(dict_tags.items(), key=operator.itemgetter(1), reverse=True)

# Reduce every tagged document to [name, description, tags].
ll_name_desc_tags = getFeaturesLabels(list_list_js_tags)

In [None]:
# Summary of the download: one line per collection with its size.
for caption, collection in (
    ('Json-Files: ', list_js_docs),
    ('Json-Files with tags: ', list_list_js_tags),
    ('Json-Files without tags: ', list_js_notags),
    ('Unique tags: ', list_tags_sorted),
):
    print(caption, len(collection))

In [None]:
# Frequency distribution over every tag occurrence (duplicates included).
fdist_total_tags = FreqDist(list_tags)

# Export the same number of rows on every platform. The original wrote the
# top 500 tags on Windows but only the top 100 elsewhere; 500 is used for
# both because it is a superset of the smaller export.
N_TAGS_EXPORTED = 500
writeCsv(
    fdist_total_tags.most_common(N_TAGS_EXPORTED),
    os.path.join('..', 'output', 'tags.csv'),
)

In [None]:
# Frequency plot limited to 100 samples of the tag distribution.
# The figure is created and its y-axis fixed to 0..4000 before FreqDist.plot
# is called; the statements rely on pyplot's "current figure" state, so
# their order matters.
pyplot.figure(figsize=(20, 8))
pyplot.ylim(0, 4000)
fdist_total_tags.plot(100, title = 'Most common Tags')

### Which tags should be ignored?

Only tags appearing fewer than 1000 times?

In [None]:
# Same plot restricted to 25 samples, with the y-axis fixed to 0..1000.
# The statements rely on pyplot's "current figure" state, so their order
# matters.
pyplot.figure(figsize=(20, 8))
pyplot.ylim(0, 1000)
fdist_total_tags.plot(25, title = 'Most common Tags')

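* Exclude the first 3 (most common) tags.
* Assumption: take the first 200 most common tags (note: the `chooseTags` call below currently passes 80 — confirm which value is intended).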

In [None]:
def chooseTags(list_tags,first,topx):
    """Split a ranked tag list into chosen and not-chosen tag names.

    Entries at positions ``first + 1`` up to and including ``first + topx``
    (0-based) are chosen; every other entry is not chosen.

    Args:
        list_tags: Sequence of entries whose item 0 is the tag name.
        first: Index of the last leading entry to skip.
        topx: Number of entries to choose after the skipped ones.

    Returns:
        A tuple ``(list_chosen, list_notchosen)`` of lower-cased tag names,
        each in input order.
    """
    stop = topx + first + 1
    lowered_names = [entry[0].lower() for entry in list_tags]

    list_chosen = [
        name for position, name in enumerate(lowered_names)
        if first < position < stop
    ]
    list_notchosen = [
        name for position, name in enumerate(lowered_names)
        if not (first < position < stop)
    ]
    return list_chosen, list_notchosen

In [None]:
# Skip the three most frequent tags (indices 0-2) and keep the next 80.
# The result is a tuple: index 0 holds the chosen tag names, index 1 the
# tag names that were not chosen.
list_chosen_tags = chooseTags(list_tags_sorted,2,80)

### Removing unused tags and filler words from name and description:

In [None]:
# Words to drop from names and descriptions: the comma-separated entries of
# ../input/fillerwords.txt followed by NLTK's English stop-word list.
# NOTE(review): the entries are not stripped, so any whitespace or newline
# around the commas stays part of the word — confirm the file format.
list_fillerwords = getTxt('../input/fillerwords.txt').split(',') + stopwords.words('english')

In [None]:
# string cleaning 1
#
# Turns every [name, description, tags] row into
# [name tokens, description tokens, label words].
#
# Changes compared to the previous in-place version:
#   * The stemmed/lemmatised token is stored. Before, the result of
#     lemmatize(stem(...)) was thrown away, so the calls had no effect.
#   * Filler words are matched on the lower-cased token. Before, filtering
#     ran ahead of lower-casing, so capitalised stop words ("The") survived.
#   * The rows are rebuilt from list_list_js_tags instead of being mutated in
#     place, so the cell can be re-run and no longer alters the tag lists
#     shared with list_list_js_tags.
porterstemmer = PorterStemmer()
lemmatizer = nltk.WordNetLemmatizer()

# Sets give O(1) membership tests instead of repeated list scans/removals.
set_fillerwords = {word.strip().lower() for word in list_fillerwords if word.strip()}
set_notchosen_tags = set(list_chosen_tags[1])

# '-' becomes a space so hyphenated words split into separate tokens; the
# remaining punctuation characters are dropped.
PUNCTUATION_TABLE = str.maketrans({'-': ' ', '.': None, ',': None, '%': None, '(': None, ')': None})


def _prepTokens(text):
    """Tokenise ``text`` into lower-cased, stemmed tokens without filler words."""
    without_digits = ''.join(char for char in text if not char.isdigit())
    tokens = nltk.word_tokenize(without_digits.translate(PUNCTUATION_TABLE))
    lowered = (token.lower() for token in tokens)
    return [
        lemmatizer.lemmatize(porterstemmer.stem(token))
        for token in lowered
        if token not in set_fillerwords
    ]


def _splitLabel(label):
    """Split a label on '-', else on '_', else on spaces (first match wins)."""
    if '-' in label:
        return label.split('-')
    if '_' in label:
        return label.split('_')
    return label.split(' ')


def _prepLabels(tags):
    """Lower-case ``tags``, drop the not-chosen ones and split the rest into words."""
    label_words = []
    for tag in (raw_tag.lower() for raw_tag in tags):
        if tag not in set_notchosen_tags:
            label_words.extend(_splitLabel(tag))
    return label_words


ll_name_desc_tags = [
    [_prepTokens(name), _prepTokens(description), _prepLabels(tags)]
    for name, description, tags in getFeaturesLabels(list_list_js_tags)
]

In [None]:
# Compute the label coverage: how many records kept no label at all and how
# many kept fewer than 3 / fewer than 4 labels.
label_counts = [len(row[2]) for row in ll_name_desc_tags]
total = len(label_counts)
cnt = sum(1 for count in label_counts if count == 0)
cnt2 = sum(1 for count in label_counts if count < 3)
cnt3 = sum(1 for count in label_counts if count < 4)


def _complementShare(count):
    """Return 1 - count/total, or NaN when there are no records at all."""
    return 1 - (count / total) if total else float('nan')


print('Anzahl der Labels: ', len(list_chosen_tags[0]))
print('Anzahl an ungedeckten Datensätzen: ', cnt)
print('Anteil der gedeckten Datensätze: ', _complementShare(cnt))
print('Anzahl der Datensätze mit weniger als 3 Label: ', cnt2)
# The printed value is 1 - share, i.e. the share of records with AT LEAST
# 3 (or 4) labels; the labels previously said "weniger als" (fewer than).
print('Anteil der Datensätze mit mindestens 3 Label: ', _complementShare(cnt2))
print('Anzahl der Datensätze mit weniger als 4 Label: ', cnt3)
print('Anteil der Datensätze mit mindestens 4 Label: ', _complementShare(cnt3))

In [None]:
# # string cleaning 2
# for i, elem in enumerate(ll_name_desc_tags):
#     #prep name
#     ll_name_desc_tags[i][0] = nltk.word_tokenize(''.join([x for x in elem[0] if not x.isdigit()]).replace('.','').replace(',','').replace('%','').replace('(','').replace(')',''))
#     for word in list_fillerwords:
#         while (word in ll_name_desc_tags[i][0]):
#             ll_name_desc_tags[i][0].remove(word)
#     for j, word in enumerate(ll_name_desc_tags[i][0]):
#         ll_name_desc_tags[i][0][j] = ll_name_desc_tags[i][0][j].lower()
#         lemmatizer.lemmatize(porterstemmer.stem(ll_name_desc_tags[i][0][j]))
#     # prep desc
#     ll_name_desc_tags[i][1] = nltk.word_tokenize(''.join([x for x in elem[1] if not x.isdigit()]).replace('.','').replace(',','').replace('%','').replace('(','').replace(')',''))
#     for word in list_fillerwords:
#         while (word in ll_name_desc_tags[i][1]):
#             ll_name_desc_tags[i][1].remove(word)
#     for j, word in enumerate(ll_name_desc_tags[i][1]):
#         ll_name_desc_tags[i][1][j] = ll_name_desc_tags[i][1][j].lower()
#         lemmatizer.lemmatize(porterstemmer.stem(ll_name_desc_tags[i][1][j])) 
#     # lower labels
#     for j, word in enumerate(ll_name_desc_tags[i][2]):
#         ll_name_desc_tags[i][2][j] = ll_name_desc_tags[i][2][j].lower()    
#     # removing not chosen labels
#     for tag in list_chosen_tags[1]:
#         while (tag in ll_name_desc_tags[i][2]):
#             ll_name_desc_tags[i][2].remove(tag)

In [None]:
# Write the cleaned rows to the output folder; only the separator style of
# the path depends on the platform.
prep_out_path = '..\\output\\prep_out.csv' if platform.system() == 'Windows' else '../output/prep_out.csv'
writeCsv(ll_name_desc_tags, prep_out_path)

In [None]:
# Log the end time of the run, followed by a visual separator.
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
print('\n\n_______________________________________________')