In [1]:
import os
import sys
import time
import csv
import datetime
import nltk
import requests
import _pickle

import numpy as np

from nltk import ngrams, FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
print('_______________________________________________')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/user1-2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/user1-2/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/user1-2/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
2019-01-17 16:10:11
_______________________________________________


In [2]:
# !pip freeze > requirements_wu.txt

## Def Functions

In [3]:
def getTxt(path):  
    return open(path, 'r').read()

# function to open csv files with the right encoding
def getCsv(path, delim = ',', enc = 'utf-8'):          
    list_return = []
    with open (path, encoding = enc) as file:
        csvreader = csv.reader(file, delimiter = delim)
        for line in csvreader:
            list_return.append(line)
    return list_return

In [4]:
list_fillerwords = getTxt('..//input//fillerwords.txt').split(',') + stopwords.words('english')

In [5]:
def tokenizeString(title, desc):
    print('Started at ' , datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f'))
    print('Tokenisation is running.')
    global list_fillerwords
#     string_to_clean = title + ' ' + desc
    string_to_clean = desc
    porterstemmer = PorterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()
    
    list_token = nltk.word_tokenize(''.join([x for x in string_to_clean if not x.isdigit()]).replace('-',' ').replace('.','').replace(',','').replace('%','').replace(';',' ').replace('/', ' ').replace('(','').replace(')',''))
    
    for word in list_fillerwords:
        while (word in list_token):
            list_token.remove(word)
            
    for i, word in enumerate(list_token):
        list_token[i] = list_token[i].lower()
        lemmatizer.lemmatize(porterstemmer.stem(list_token[i]))
    
    print('Done at ' , datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f'))
    return list_token

In [6]:
# function for summing up single vectors (=words) in a vectorlist
def sumVect(list_vect):
    try:
        for i, elem in enumerate(list_vect):
            if not elem:
                del list_vect[i]
        
        int_len_vect = len(list_vect[0])
        list_vect_sum = [0] * int_len_vect
        for vect in list_vect:
            for i, dim in enumerate(vect):
                list_vect_sum[i] += float(dim)
        return(list_vect_sum)
    except Exception as e:
        print(list_vect)
        print(i)
        print(list_vect_sum)
        print(e)

In [7]:
def vectorizeStringList(list_string):
    print('Vectorisation is running.')
    adress = 'http://word2vec.ai.wu.ac.at/googlenews/model?word='
    feat_words = []
    
    for str_elem in list_string:
        word = str_elem.strip() 
        try:
            feat_words.append([float(x) for x in requests.get((adress+word)).text.replace(' ','').replace('[','').replace(']','').split(',')])
        except:
            try:
                feat_words.append([float(x) for x in requests.get((adress+word.title())).text.replace(' ','').replace('[','').replace(']','').split(',')])
            except Exception as e:
                feat_words.append([])
                           
    if feat_words:
        list_return = sumVect(feat_words)
        
    print('Done at ' , datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f'))
    return list_return

In [8]:
def labelDataRFC(vect):
    print('Creating labels.')
        
    with open('dumped_randomforestclassifier.pkl', 'rb') as fid:
        rf_load = _pickle.load(fid)
    with open('dumped_binarizer.pkl', 'rb') as fid:
        bin_load = _pickle.load(fid)
    
    np_in = [vect]
    X = np.array(np_in)
    pred_rf = rf_load.predict(X)
    pred_label = bin_load.inverse_transform(pred_rf)
    
    print('Done at ' , datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f'))
    return pred_label

In [9]:
def labelDataOVRkNN(vect):
    print('Creating labels.')
        
    with open('dumped_ovrknn20.pkl', 'rb') as fid:
        rf_load = _pickle.load(fid)
    with open('dumped_binarizer.pkl', 'rb') as fid:
        bin_load = _pickle.load(fid)
    
    np_in = [vect]
    X = np.array(np_in)
    pred_rf = rf_load.predict(X)
    pred_label = bin_load.inverse_transform(pred_rf)
    
    print('Done at ' , datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f'))
    return pred_label

In [10]:
def labelDataOVRRFC(vect):
    print('Creating labels.')
        
    with open('dumped_ovrrfc.pkl', 'rb') as fid:
        rf_load = _pickle.load(fid)
    with open('dumped_binarizer.pkl', 'rb') as fid:
        bin_load = _pickle.load(fid)
    
    np_in = [vect]
    X = np.array(np_in)
    pred_rf = rf_load.predict(X)
    pred_label = bin_load.inverse_transform(pred_rf)
    
    print('Done at ' , datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f'))
    return pred_label

In [11]:
def labelDataVoting(vect):
    print('Creating labels.')
        
    with open('dumped_randomforestclassifier.pkl', 'rb') as fid:
        rf_load = _pickle.load(fid)
    with open('dumped_ovrrfc.pkl', 'rb') as fid:
        ovrrf_load = _pickle.load(fid)
    with open('dumped_ovrknn20.pkl', 'rb') as fid:
        ovrknn_load = _pickle.load(fid)
    with open('dumped_binarizer.pkl', 'rb') as fid:
        bin_load = _pickle.load(fid)
    
    np_in = [vect]
    X = np.array(np_in)
    pred_rf = rf_load.predict(X)
    pred_ovrrf = ovrrf_load.predict(X)
    pred_ovrknn = ovrknn_load.predict(X)
    
    pred_label = ((pred_rf+pred_ovrrf+pred_ovrknn)/3)
    for i, label in enumerate(pred_label[0]):
        if label > 0.5:
            pred_label[0][i] = 1
        else:
            pred_label[0][i] = 0
        
        
    pred_out = bin_load.inverse_transform(pred_label)
    
    print('Done at ' , datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f'))
    return pred_out

## Create Labels

In [12]:
title = 'Labor force participation rate, female (% of female population ages 15+) (modeled ILO estimate), countries grouped by income levels'
describtion = 'Labor force participation rate is the proportion of the population ages 15 and older that is economically active: all people who supply labor for the production of goods and services during a specified period.'
# title = 'CO2 emissions (metric tons per capita)'
# describtion = 'Carbon dioxide emissions are those stemming from the burning of fossil fuels and the manufacture of cement. They include carbon dioxide produced during consumption of solid, liquid, and gas fuels and gas flaring. Government reaction taxes issue '

In [13]:
labelDataRFC(vectorizeStringList(tokenizeString(title, describtion)))

Started at  2019-01-17 16:10:11.678558
Tokenisation is running.
Done at  2019-01-17 16:10:14.450675
Vectorisation is running.
Done at  2019-01-17 16:10:14.555203
Creating labels.




Done at  2019-01-17 16:10:17.332287


[()]

In [14]:
labelDataOVRRFC(vectorizeStringList(tokenizeString(title, describtion)))

Started at  2019-01-17 16:10:17.375618
Tokenisation is running.
Done at  2019-01-17 16:10:17.378506
Vectorisation is running.
Done at  2019-01-17 16:10:17.478090
Creating labels.




Done at  2019-01-17 16:10:56.418872


[(' ageing',
  ' cpia',
  ' demographic economics',
  ' demography',
  ' disaster_accident',
  ' economics',
  ' energy development',
  ' energy economics',
  ' environment',
  ' expense',
  ' female labor force in the muslim world',
  ' government',
  ' health',
  ' health_medical_pharma',
  ' hiv',
  ' hospitality_recreation',
  ' human interest',
  ' human migration',
  ' law_crime',
  ' medicine',
  ' money',
  ' nature',
  ' population',
  ' religion_belief',
  ' taxation in the united states',
  ' technology_internet',
  ' tertiary education',
  ' unemployment',
  ' united nations development group',
  ' workforce',
  ' world bank cpia',
  ' world bank labor',
  ' world bank number',
  ' world bank population',
  'disaster_accident',
  'environment',
  'health_medical_pharma',
  'law_crime',
  'macroeconomics',
  'social issues',
  'technology_internet')]

In [15]:
labelDataVoting(vectorizeStringList(tokenizeString(title, describtion)))

Started at  2019-01-17 16:10:56.528598
Tokenisation is running.
Done at  2019-01-17 16:10:56.531618
Vectorisation is running.
Done at  2019-01-17 16:10:56.632977
Creating labels.




Done at  2019-01-17 16:11:38.546702


[(' female labor force in the muslim world',
  ' unemployment',
  ' united nations development group',
  ' workforce',
  ' world bank labor',
  'social issues')]

In [16]:
labelDataOVRkNN(vectorizeStringList(tokenizeString(title, describtion)))

Started at  2019-01-17 16:11:38.677289
Tokenisation is running.
Done at  2019-01-17 16:11:38.679619
Vectorisation is running.
Done at  2019-01-17 16:11:38.784558
Creating labels.




Done at  2019-01-17 16:11:39.692106


[(' female labor force in the muslim world',
  ' international labour organization',
  ' unemployment',
  ' united nations',
  ' united nations development group',
  ' workforce',
  ' world bank labor',
  'social issues')]

## ...some more test cases

In [17]:
testcases = getCsv('single_cases2.csv', delim = ';', enc = 'utf-8')

In [18]:
labels = []
for i, case in enumerate(testcases):
    print('Case index ', i)
    inner_list = []
    inner_list.append(labelDataRFC(vectorizeStringList(tokenizeString(case[1], case[0]))))
    inner_list.append(labelDataOVRRFC(vectorizeStringList(tokenizeString(case[1], case[0]))))
    inner_list.append(labelDataOVRkNN(vectorizeStringList(tokenizeString(case[1], case[0]))))
    inner_list.append(labelDataVoting(vectorizeStringList(tokenizeString(case[1], case[0]))))
    print(inner_list)
    labels.append(inner_list)    

Case index  0
Started at  2019-01-17 16:11:39.733947
Tokenisation is running.
Done at  2019-01-17 16:11:39.734805
Vectorisation is running.
Done at  2019-01-17 16:11:39.761613
Creating labels.




Done at  2019-01-17 16:11:41.761595
Started at  2019-01-17 16:11:41.798375
Tokenisation is running.
Done at  2019-01-17 16:11:41.799597
Vectorisation is running.
Done at  2019-01-17 16:11:41.826550
Creating labels.




Done at  2019-01-17 16:12:20.722931
Started at  2019-01-17 16:12:20.826234
Tokenisation is running.
Done at  2019-01-17 16:12:20.827273
Vectorisation is running.
Done at  2019-01-17 16:12:20.858047
Creating labels.




Done at  2019-01-17 16:12:21.782548
Started at  2019-01-17 16:12:21.786308
Tokenisation is running.
Done at  2019-01-17 16:12:21.787194
Vectorisation is running.
Done at  2019-01-17 16:12:21.813190
Creating labels.
Done at  2019-01-17 16:13:03.977530
[[()], [(' cpia', ' demography', ' disaster_accident', ' environment', ' expense', ' government', ' health', ' health_medical_pharma', ' hiv', ' human interest', ' infant', ' law_crime', ' medicine', ' population', ' rtt', ' technology_internet', ' world bank number', 'disaster_accident', 'health_medical_pharma', 'human interest', 'law_crime')], [(' demography', ' health')], [(' demography', ' health')]]
Case index  1
Started at  2019-01-17 16:13:04.105283
Tokenisation is running.
Done at  2019-01-17 16:13:04.106259
Vectorisation is running.
Done at  2019-01-17 16:13:04.114924
Creating labels.
Done at  2019-01-17 16:13:06.225976
Started at  2019-01-17 16:13:06.248797
Tokenisation is running.
Done at  2019-01-17 16:13:06.250342
Vectorisatio

Done at  2019-01-17 16:21:32.951371
Started at  2019-01-17 16:21:32.965334
Tokenisation is running.
Done at  2019-01-17 16:21:32.966215
Vectorisation is running.
Done at  2019-01-17 16:21:32.987344
Creating labels.
Done at  2019-01-17 16:22:11.885281
Started at  2019-01-17 16:22:11.988059
Tokenisation is running.
Done at  2019-01-17 16:22:11.989541
Vectorisation is running.
Done at  2019-01-17 16:22:12.019539
Creating labels.
Done at  2019-01-17 16:22:13.018747
Started at  2019-01-17 16:22:13.023578
Tokenisation is running.
Done at  2019-01-17 16:22:13.024603
Vectorisation is running.
Done at  2019-01-17 16:22:13.046726
Creating labels.
Done at  2019-01-17 16:22:54.941267
[[()], [(' cpia', ' disaster_accident', ' environment', ' expenditure', ' general government', ' government', ' government final consumption expenditure', ' gross domestic product', ' health', ' household final consumption expenditure', ' medicine', ' military budget', ' politics', ' social statistics', ' tax', ' tech

Done at  2019-01-17 16:30:40.899153
Started at  2019-01-17 16:30:40.903254
Tokenisation is running.
Done at  2019-01-17 16:30:40.904073
Vectorisation is running.
Done at  2019-01-17 16:30:40.927549
Creating labels.
Done at  2019-01-17 16:31:24.297928
[[()], [(' disaster_accident', ' environment', ' freight transport', ' law_crime', ' technology_internet', ' transport', ' world bank number', 'disaster_accident', 'freight transport', 'health_medical_pharma', 'law_crime')], [()], [()]]
Case index  14
Started at  2019-01-17 16:31:24.427631
Tokenisation is running.
Done at  2019-01-17 16:31:24.429496
Vectorisation is running.
Done at  2019-01-17 16:31:24.474669
Creating labels.
Done at  2019-01-17 16:31:26.617496
Started at  2019-01-17 16:31:26.636135
Tokenisation is running.
Done at  2019-01-17 16:31:26.637434
Vectorisation is running.
Done at  2019-01-17 16:31:26.681936
Creating labels.
Done at  2019-01-17 16:32:05.915530
Started at  2019-01-17 16:32:06.016154
Tokenisation is running.
Don

Done at  2019-01-17 16:39:10.653731
Started at  2019-01-17 16:39:10.765487
Tokenisation is running.
Done at  2019-01-17 16:39:10.766881
Vectorisation is running.
Done at  2019-01-17 16:39:10.782511
Creating labels.
Done at  2019-01-17 16:39:11.681170
Started at  2019-01-17 16:39:11.685163
Tokenisation is running.
Done at  2019-01-17 16:39:11.686338
Vectorisation is running.
Done at  2019-01-17 16:39:11.710009
Creating labels.
Done at  2019-01-17 16:39:54.096568
[[('social issues',)], [(' ageing', ' de facto', ' demography', ' disaster_accident', ' environment', ' environmental social science', ' government', ' gross domestic product', ' law_crime', ' medicine', ' politics', ' population', ' technology_internet', ' urban planning', ' world bank cpia', ' world bank population', 'health_medical_pharma', 'macroeconomics', 'social issues')], [(' environment', ' world bank population', 'social issues')], [(' environment', ' world bank population', 'social issues')]]


In [19]:
print('RandomForestClassifier ----- OvR RFC ----- OvR kNN ----- Voting Classifier\n==================================================================')
for i, line in enumerate(labels):
    print('Case ', i, ':')
    for elem in line:
        print(elem, end=' - - ')
    print('\n==================================================================')

RandomForestClassifier ----- OvR RFC ----- OvR kNN ----- Voting Classifier
Case  0 :
[()] - - [(' cpia', ' demography', ' disaster_accident', ' environment', ' expense', ' government', ' health', ' health_medical_pharma', ' hiv', ' human interest', ' infant', ' law_crime', ' medicine', ' population', ' rtt', ' technology_internet', ' world bank number', 'disaster_accident', 'health_medical_pharma', 'human interest', 'law_crime')] - - [(' demography', ' health')] - - [(' demography', ' health')] - - 
Case  1 :
[()] - - [(' cpia', ' environment', ' government', ' health', ' labor', ' law', ' law_crime', ' tax', 'disaster_accident')] - - [(' business', ' freight transport', ' international relations')] - - [()] - - 
Case  2 :
[()] - - [(' chemistry', ' climate change policy', ' climatology', ' disaster_accident', ' energy development', ' energy economics', ' environment', ' fossil fuel', ' fuel', ' fuels', ' government', ' greenhouse gas', ' greenhouse gases', ' health_medical_pharma', ' 