In [1]:
import os
import sys
import time
import csv
import datetime
import nltk
import requests
import _pickle

import numpy as np

from nltk import ngrams, FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
print('_______________________________________________')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/user1-2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/user1-2/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/user1-2/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
2019-01-13 16:56:35
_______________________________________________


In [2]:
# !pip freeze > requirements_wu.txt

## Def Functions

In [3]:
def getTxt(path):  
    return open(path, 'r').read()

# function to open csv files with the right encoding
def getCsv(path, delim = ',', enc = 'utf-8'):          
    list_return = []
    with open (path, encoding = enc) as file:
        csvreader = csv.reader(file, delimiter = delim)
        for line in csvreader:
            list_return.append(line)
    return list_return

In [4]:
list_fillerwords = getTxt('..//input//fillerwords.txt').split(',') + stopwords.words('english')

In [5]:
def tokenizeString(title, desc):
    print('Started at ' , datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f'))
    print('Tokenisation is running.')
    global list_fillerwords
#     string_to_clean = title + ' ' + desc
    string_to_clean = desc
    porterstemmer = PorterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()
    
    list_token = nltk.word_tokenize(''.join([x for x in string_to_clean if not x.isdigit()]).replace('-',' ').replace('.','').replace(',','').replace('%','').replace(';',' ').replace('/', ' ').replace('(','').replace(')',''))
    
    for word in list_fillerwords:
        while (word in list_token):
            list_token.remove(word)
            
    for i, word in enumerate(list_token):
        list_token[i] = list_token[i].lower()
        lemmatizer.lemmatize(porterstemmer.stem(list_token[i]))
    
    print('Done at ' , datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f'))
    return list_token

In [6]:
# function for summing up single vectors (=words) in a vectorlist
def sumVect(list_vect):
    try:
        for i, elem in enumerate(list_vect):
            if not elem:
                del list_vect[i]
        
        int_len_vect = len(list_vect[0])
        list_vect_sum = [0] * int_len_vect
        for vect in list_vect:
            for i, dim in enumerate(vect):
                list_vect_sum[i] += float(dim)
        return(list_vect_sum)
    except Exception as e:
        print(list_vect)
        print(i)
        print(list_vect_sum)
        print(e)

In [7]:
def vectorizeStringList(list_string):
    print('Vectorisation is running.')
    adress = 'http://word2vec.ai.wu.ac.at/googlenews/model?word='
    feat_words = []
    
    for str_elem in list_string:
        word = str_elem.strip() 
        try:
            feat_words.append([float(x) for x in requests.get((adress+word)).text.replace(' ','').replace('[','').replace(']','').split(',')])
        except:
            try:
                feat_words.append([float(x) for x in requests.get((adress+word.title())).text.replace(' ','').replace('[','').replace(']','').split(',')])
            except Exception as e:
                feat_words.append([])
                           
    if feat_words:
        list_return = sumVect(feat_words)
        
    print('Done at ' , datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f'))
    return list_return

In [8]:
def labelDataRFC(vect):
    print('Creating labels.')
        
    with open('dumped_randomforestclassifier.pkl', 'rb') as fid:
        rf_load = _pickle.load(fid)
    with open('dumped_binarizer.pkl', 'rb') as fid:
        bin_load = _pickle.load(fid)
    
    np_in = [vect]
    X = np.array(np_in)
    pred_rf = rf_load.predict(X)
    pred_label = bin_load.inverse_transform(pred_rf)
    
    print('Done at ' , datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f'))
    return pred_label

In [9]:
def labelDataOVRkNN(vect):
    print('Creating labels.')
        
    with open('dumped_ovrknn20.pkl', 'rb') as fid:
        rf_load = _pickle.load(fid)
    with open('dumped_binarizer.pkl', 'rb') as fid:
        bin_load = _pickle.load(fid)
    
    np_in = [vect]
    X = np.array(np_in)
    pred_rf = rf_load.predict(X)
    pred_label = bin_load.inverse_transform(pred_rf)
    
    print('Done at ' , datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f'))
    return pred_label

In [10]:
def labelDataOVRRFC(vect):
    print('Creating labels.')
        
    with open('dumped_ovrrfc.pkl', 'rb') as fid:
        rf_load = _pickle.load(fid)
    with open('dumped_binarizer.pkl', 'rb') as fid:
        bin_load = _pickle.load(fid)
    
    np_in = [vect]
    X = np.array(np_in)
    pred_rf = rf_load.predict(X)
    pred_label = bin_load.inverse_transform(pred_rf)
    
    print('Done at ' , datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f'))
    return pred_label

In [11]:
def labelDataVoting(vect):
    print('Creating labels.')
        
    with open('dumped_randomforestclassifier.pkl', 'rb') as fid:
        rf_load = _pickle.load(fid)
    with open('dumped_ovrrfc.pkl', 'rb') as fid:
        ovrrf_load = _pickle.load(fid)
    with open('dumped_ovrknn20.pkl', 'rb') as fid:
        ovrknn_load = _pickle.load(fid)
    with open('dumped_binarizer.pkl', 'rb') as fid:
        bin_load = _pickle.load(fid)
    
    np_in = [vect]
    X = np.array(np_in)
    pred_rf = rf_load.predict(X)
    pred_ovrrf = ovrrf_load.predict(X)
    pred_ovrknn = ovrknn_load.predict(X)
    
    pred_label = ((pred_rf+pred_ovrrf+pred_ovrknn)/3)
    for i, label in enumerate(pred_label[0]):
        if label > 0.5:
            pred_label[0][i] = 1
        else:
            pred_label[0][i] = 0
        
        
    pred_out = bin_load.inverse_transform(pred_label)
    
    print('Done at ' , datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f'))
    return pred_out

## Create Labels

In [12]:
title = 'Labor force participation rate, female (% of female population ages 15+) (modeled ILO estimate), countries grouped by income levels'
describtion = 'Labor force participation rate is the proportion of the population ages 15 and older that is economically active: all people who supply labor for the production of goods and services during a specified period.'
# title = 'CO2 emissions (metric tons per capita)'
# describtion = 'Carbon dioxide emissions are those stemming from the burning of fossil fuels and the manufacture of cement. They include carbon dioxide produced during consumption of solid, liquid, and gas fuels and gas flaring. Government reaction taxes issue '

In [13]:
labelDataRFC(vectorizeStringList(tokenizeString(title, describtion)))

Started at  2019-01-13 16:56:35.329619
Tokenisation is running.
Done at  2019-01-13 16:56:39.053012
Vectorisation is running.
Done at  2019-01-13 16:56:39.179940
Creating labels.




Done at  2019-01-13 16:56:46.541823


[()]

In [14]:
labelDataOVRRFC(vectorizeStringList(tokenizeString(title, describtion)))

Started at  2019-01-13 16:56:46.574544
Tokenisation is running.
Done at  2019-01-13 16:56:46.576316
Vectorisation is running.
Done at  2019-01-13 16:56:46.689278
Creating labels.




Done at  2019-01-13 16:57:26.494032


[(' ageing',
  ' cpia',
  ' demographic economics',
  ' demography',
  ' disaster_accident',
  ' economics',
  ' energy development',
  ' energy economics',
  ' environment',
  ' expense',
  ' female labor force in the muslim world',
  ' government',
  ' health',
  ' health_medical_pharma',
  ' hiv',
  ' hospitality_recreation',
  ' human interest',
  ' human migration',
  ' law_crime',
  ' medicine',
  ' money',
  ' nature',
  ' population',
  ' religion_belief',
  ' taxation in the united states',
  ' technology_internet',
  ' tertiary education',
  ' unemployment',
  ' united nations development group',
  ' workforce',
  ' world bank cpia',
  ' world bank labor',
  ' world bank number',
  ' world bank population',
  'disaster_accident',
  'environment',
  'health_medical_pharma',
  'law_crime',
  'macroeconomics',
  'social issues',
  'technology_internet')]

In [15]:
labelDataVoting(vectorizeStringList(tokenizeString(title, describtion)))

Started at  2019-01-13 16:57:26.632824
Tokenisation is running.
Done at  2019-01-13 16:57:26.636034
Vectorisation is running.
Done at  2019-01-13 16:57:26.762438
Creating labels.




Done at  2019-01-13 16:58:10.281054


[(' female labor force in the muslim world',
  ' unemployment',
  ' united nations development group',
  ' workforce',
  ' world bank labor',
  'social issues')]

In [16]:
labelDataOVRkNN(vectorizeStringList(tokenizeString(title, describtion)))

Started at  2019-01-13 16:58:10.407445
Tokenisation is running.
Done at  2019-01-13 16:58:10.409517
Vectorisation is running.
Done at  2019-01-13 16:58:10.503655
Creating labels.




Done at  2019-01-13 16:58:11.475550


[(' female labor force in the muslim world',
  ' international labour organization',
  ' unemployment',
  ' united nations',
  ' united nations development group',
  ' workforce',
  ' world bank labor',
  'social issues')]

## ...some more test cases

In [17]:
testcases = getCsv('single_cases2.csv', delim = ';', enc = 'utf-8')

In [18]:
labels = []
for i, case in enumerate(testcases):
    print('Case index ', i)
    inner_list = []
    inner_list.append(labelDataRFC(vectorizeStringList(tokenizeString(case[1], case[0]))))
    inner_list.append(labelDataOVRRFC(vectorizeStringList(tokenizeString(case[1], case[0]))))
    inner_list.append(labelDataOVRkNN(vectorizeStringList(tokenizeString(case[1], case[0]))))
    inner_list.append(labelDataVoting(vectorizeStringList(tokenizeString(case[1], case[0]))))
    print(inner_list)
    labels.append(inner_list)    

Case index  0
Started at  2019-01-13 16:58:11.507201
Tokenisation is running.
Done at  2019-01-13 16:58:11.507937
Vectorisation is running.
Done at  2019-01-13 16:58:11.533153
Creating labels.




Done at  2019-01-13 16:58:13.263672
Started at  2019-01-13 16:58:13.280664
Tokenisation is running.
Done at  2019-01-13 16:58:13.282431
Vectorisation is running.
Done at  2019-01-13 16:58:13.307516
Creating labels.




KeyboardInterrupt: 

In [None]:
print('RandomForestClassifier ----- OvR RFC ----- OvR kNN ----- Voting Classifier\n==================================================================')
for i, line in enumerate(labels):
    print('Case ', i, ':')
    for elem in line:
        print(elem, end=' - - ')
    print('\n==================================================================')