In [1]:
import os
import sys
import time
import datetime
import nltk
import requests
import _pickle

import numpy as np

from nltk import ngrams, FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
# Fetch the NLTK data packages requested by this notebook: the stop-word
# lists, the Punkt tokenizer data and the WordNet corpus.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Print the current local time followed by a separator line.
print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
print('_______________________________________________')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
2018-12-05 02:10:46
_______________________________________________


## Function Definitions

In [2]:
def getTxt(path):
    """Return the entire contents of the text file at ``path``.

    The file is opened in text mode with the platform's default encoding and
    is closed before the function returns, so no file handle is left open
    (the previous version relied on garbage collection to close it).

    Parameters
    ----------
    path : str or path-like
        Location of the file to read.

    Returns
    -------
    str
        The full text of the file.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    with open(path, 'r') as handle:
        return handle.read()

In [3]:
# Tokens to discard during tokenisation: the entries of the comma-separated
# filler-word file, followed by NLTK's English stop-word list.
list_fillerwords = [
    *getTxt('../input/fillerwords.txt').split(','),
    *stopwords.words('english'),
]

In [4]:
def tokenizeString(title, desc):
    """Turn a title and a description into a list of lower-cased word tokens.

    Processing steps, in order:

    1. ``title`` and ``desc`` are joined with a single space.
    2. Every digit character is dropped; ``-``, ``;`` and ``/`` become a
       space, while ``.``, ``,``, ``%``, ``(`` and ``)`` are deleted.
    3. The cleaned text is split with ``nltk.word_tokenize``.
    4. Tokens that appear in the module-level ``list_fillerwords`` are
       removed (exact, case-sensitive match).
    5. The remaining tokens are lower-cased.

    Progress and timestamps are printed to stdout.

    Parameters
    ----------
    title : str
        Short title text.
    desc : str
        Longer description text.

    Returns
    -------
    list of str
        The filtered, lower-cased tokens in their original order.
    """
    print('Started at ' , datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f'))
    print('Tokenisation is running.')

    # One translation table replaces the chain of single-character
    # ``str.replace`` calls; a value of None deletes the character.
    punctuation_table = str.maketrans({
        '-': ' ', ';': ' ', '/': ' ',
        '.': None, ',': None, '%': None, '(': None, ')': None,
    })
    string_to_clean = title + ' ' + desc
    without_digits = ''.join(ch for ch in string_to_clean if not ch.isdigit())
    cleaned_text = without_digits.translate(punctuation_table)

    # A set gives constant-time membership tests; the earlier
    # "while word in list: list.remove(word)" loop rescanned the token list
    # for every filler word.
    # NOTE(review): filtering happens before lower-casing, so capitalised
    # filler words (e.g. at the start of a sentence) survive. Kept as-is to
    # leave the produced tokens unchanged -- confirm whether that is intended.
    fillers = set(list_fillerwords)
    list_token = [
        token.lower()
        for token in nltk.word_tokenize(cleaned_text)
        if token not in fillers
    ]

    # NOTE(review): the previous version also computed
    # lemmatize(stem(token)) for every token but threw the result away, so the
    # returned tokens were never stemmed. That dead work (and the stemmer /
    # lemmatizer it needed) is removed here; the output is unchanged.
    # Presumably the downstream vectors and saved models were built on these
    # unstemmed tokens -- confirm before actually enabling stemming.

    print('Done at ' , datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f'))
    return list_token

In [5]:
# function for summing up single vectors (=words) in a vectorlist
def sumVect(list_vect):
    """Sum a list of equally long vectors element-wise.

    Empty (falsy) entries are ignored. The input list is not modified; the
    previous version deleted entries from it while iterating, which skipped
    consecutive empty entries and could leave an empty vector in first place.

    Parameters
    ----------
    list_vect : list of sequences
        Vectors to add up. Each component must be convertible with ``float``.
        The length of the first non-empty vector defines the result length.

    Returns
    -------
    list of float
        The element-wise sum, or ``[]`` when there is no non-empty vector.
    None
        When a component cannot be converted to ``float`` or a vector is
        longer than the first one. Diagnostics are printed in that case
        (this mirrors the earlier print-and-return-None behaviour).
    """
    # Filter into a new list so the caller's list stays untouched.
    vectors = [vect for vect in list_vect if vect]
    if not vectors:
        return []

    list_vect_sum = [0.0] * len(vectors[0])
    i = None  # defined up front so the diagnostics below can always print it
    try:
        for vect in vectors:
            for i, dim in enumerate(vect):
                list_vect_sum[i] += float(dim)
    except (IndexError, TypeError, ValueError) as e:
        print(list_vect)
        print(i)
        print(list_vect_sum)
        print(e)
        return None
    return list_vect_sum

In [6]:
def _fetchWordVector(word, timeout):
    """Request the vector of ``word`` from the word2vec web service and parse it into floats."""
    # Passing the word through ``params`` lets requests URL-encode it instead
    # of concatenating raw text into the query string.
    response = requests.get(
        'http://word2vec.ai.wu.ac.at/googlenews/model',
        params={'word': word},
        timeout=timeout,
    )
    stripped = response.text.replace(' ', '').replace('[', '').replace(']', '')
    # float() raises ValueError when the reply is not a comma-separated list
    # of numbers; the caller treats that as a failed lookup.
    return [float(x) for x in stripped.split(',')]


def vectorizeStringList(list_string, timeout=10):
    """Look up a vector for every word and return their element-wise sum.

    Each word is stripped of surrounding whitespace and requested from the
    word2vec web service. If the request fails or the reply cannot be parsed
    as numbers, the title-cased form of the word is tried once; if that fails
    too, the word contributes an empty vector. The collected vectors are
    combined with ``sumVect``. Progress and a timestamp are printed to stdout.

    Parameters
    ----------
    list_string : list of str
        Words to vectorise.
    timeout : float, optional
        Seconds to wait for each HTTP request (default 10). A timed-out
        request counts as a failed lookup.

    Returns
    -------
    list of float
        Whatever ``sumVect`` returns for the collected vectors, or ``[]`` when
        ``list_string`` is empty (the previous version raised
        ``UnboundLocalError`` in that case).
    """
    print('Vectorisation is running.')
    feat_words = []

    for str_elem in list_string:
        word = str_elem.strip()
        vector = []
        # Try the word as given, then its title-cased form.
        for candidate in (word, word.title()):
            try:
                vector = _fetchWordVector(candidate, timeout)
                break
            except (requests.RequestException, ValueError):
                # Network problem or unparsable reply: try the next form.
                # Narrower than the former bare ``except:``, so Ctrl-C and
                # programming errors are no longer swallowed.
                continue
        feat_words.append(vector)

    list_return = sumVect(feat_words) if feat_words else []

    print('Done at ' , datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f'))
    return list_return

In [7]:
def labelDataRFC(vect):
    """Predict labels for one feature vector with the saved random-forest model.

    Loads the classifier from ``dumped_randomforestclassifier.pkl`` and the
    label binarizer from ``dumped_binarizer.pkl`` (both relative to the
    current working directory), predicts on ``vect`` as a single-row array
    and maps the prediction back through the binarizer's
    ``inverse_transform``. Progress and a timestamp are printed to stdout.

    Parameters
    ----------
    vect : sequence of numbers
        Feature vector for a single sample.

    Returns
    -------
    The result of ``inverse_transform`` on the prediction.
    """
    print('Creating labels.')

    # Unpickling executes code stored in the file -- only load trusted files.
    with open('dumped_randomforestclassifier.pkl', 'rb') as model_file:
        classifier = _pickle.load(model_file)
    with open('dumped_binarizer.pkl', 'rb') as binarizer_file:
        binarizer = _pickle.load(binarizer_file)

    # The model expects a 2-D input, so wrap the single sample in a list.
    features = np.array([vect])
    encoded_prediction = classifier.predict(features)
    pred_label = binarizer.inverse_transform(encoded_prediction)

    finished_at = datetime.datetime.fromtimestamp(time.time())
    print('Done at ' , finished_at.strftime('%Y-%m-%d %H:%M:%S.%f'))
    return pred_label

In [8]:
def labelDataOVRkNN(vect):
    """Predict labels for one feature vector with the model saved in ``dumped_ovrknn.pkl``.

    Loads the model from ``dumped_ovrknn.pkl`` and the label binarizer from
    ``dumped_binarizer.pkl`` (both relative to the current working
    directory), predicts on ``vect`` as a single-row array and maps the
    prediction back through the binarizer's ``inverse_transform``. Progress
    and a timestamp are printed to stdout.

    Parameters
    ----------
    vect : sequence of numbers
        Feature vector for a single sample.

    Returns
    -------
    The result of ``inverse_transform`` on the prediction.
    """
    print('Creating labels.')

    # Unpickling executes code stored in the file -- only load trusted files.
    with open('dumped_ovrknn.pkl', 'rb') as model_file:
        classifier = _pickle.load(model_file)
    with open('dumped_binarizer.pkl', 'rb') as binarizer_file:
        binarizer = _pickle.load(binarizer_file)

    # The model expects a 2-D input, so wrap the single sample in a list.
    features = np.array([vect])
    encoded_prediction = classifier.predict(features)
    pred_label = binarizer.inverse_transform(encoded_prediction)

    finished_at = datetime.datetime.fromtimestamp(time.time())
    print('Done at ' , finished_at.strftime('%Y-%m-%d %H:%M:%S.%f'))
    return pred_label

In [9]:
def labelDataOVRRFC(vect):
    """Predict labels for one feature vector with the model saved in ``dumped_ovrrfc.pkl``.

    Loads the model from ``dumped_ovrrfc.pkl`` and the label binarizer from
    ``dumped_binarizer.pkl`` (both relative to the current working
    directory), predicts on ``vect`` as a single-row array and maps the
    prediction back through the binarizer's ``inverse_transform``. Progress
    and a timestamp are printed to stdout.

    Parameters
    ----------
    vect : sequence of numbers
        Feature vector for a single sample.

    Returns
    -------
    The result of ``inverse_transform`` on the prediction.
    """
    print('Creating labels.')

    # Unpickling executes code stored in the file -- only load trusted files.
    with open('dumped_ovrrfc.pkl', 'rb') as model_file:
        classifier = _pickle.load(model_file)
    with open('dumped_binarizer.pkl', 'rb') as binarizer_file:
        binarizer = _pickle.load(binarizer_file)

    # The model expects a 2-D input, so wrap the single sample in a list.
    features = np.array([vect])
    encoded_prediction = classifier.predict(features)
    pred_label = binarizer.inverse_transform(encoded_prediction)

    finished_at = datetime.datetime.fromtimestamp(time.time())
    print('Done at ' , finished_at.strftime('%Y-%m-%d %H:%M:%S.%f'))
    return pred_label

In [10]:
def labelDataVoting(vect):
    """Predict labels for one feature vector by majority vote of three saved models.

    Loads the models from ``dumped_randomforestclassifier.pkl``,
    ``dumped_ovrrfc.pkl`` and ``dumped_ovrknn.pkl`` plus the label binarizer
    from ``dumped_binarizer.pkl`` (all relative to the current working
    directory). The three predictions for ``vect`` are averaged; in the first
    (and only) row every entry above 0.5 becomes 1 and every other entry 0.
    The thresholded row is mapped back through the binarizer's
    ``inverse_transform``. Progress and a timestamp are printed to stdout.

    Parameters
    ----------
    vect : sequence of numbers
        Feature vector for a single sample.

    Returns
    -------
    The result of ``inverse_transform`` on the voted prediction.
    """
    print('Creating labels.')

    # Unpickling executes code stored in the file -- only load trusted files.
    with open('dumped_randomforestclassifier.pkl', 'rb') as model_file:
        forest = _pickle.load(model_file)
    with open('dumped_ovrrfc.pkl', 'rb') as model_file:
        ovr_forest = _pickle.load(model_file)
    with open('dumped_ovrknn.pkl', 'rb') as model_file:
        ovr_knn = _pickle.load(model_file)
    with open('dumped_binarizer.pkl', 'rb') as binarizer_file:
        binarizer = _pickle.load(binarizer_file)

    # The models expect a 2-D input, so wrap the single sample in a list.
    features = np.array([vect])
    votes_forest = forest.predict(features)
    votes_ovr_forest = ovr_forest.predict(features)
    votes_ovr_knn = ovr_knn.predict(features)

    # Mean of the three predictions; an entry above 0.5 means the majority
    # of the models voted for it.
    pred_label = (votes_forest + votes_ovr_forest + votes_ovr_knn) / 3
    pred_label[0] = np.where(pred_label[0] > 0.5, 1, 0)

    pred_out = binarizer.inverse_transform(pred_label)

    finished_at = datetime.datetime.fromtimestamp(time.time())
    print('Done at ' , finished_at.strftime('%Y-%m-%d %H:%M:%S.%f'))
    return pred_out

## Create Labels

In [11]:
# Sample title/description pair that the cells below classify.
# The commented-out pair is an alternative example input.
# title = 'Labor force participation rate, female (% of female population ages 15+) (modeled ILO estimate), countries grouped by income levels'
# describtion = 'Labor force participation rate is the proportion of the population ages 15 and older that is economically active: all people who supply labor for the production of goods and services during a specified period.'
title = 'CO2 emissions (metric tons per capita)'
describtion = 'Carbon dioxide emissions are those stemming from the burning of fossil fuels and the manufacture of cement. They include carbon dioxide produced during consumption of solid, liquid, and gas fuels and gas flaring. Government reaction taxes issue '

In [12]:
# Full pipeline (tokenise -> vectorise -> predict) using the random-forest model.
labelDataRFC(vectorizeStringList(tokenizeString(title, describtion)))

Started at  2018-12-05 02:10:46.682182
Tokenisation is running.
Done at  2018-12-05 02:10:48.562149
Vectorisation is running.
Done at  2018-12-05 02:10:57.648889
Creating labels.




Done at  2018-12-05 02:10:58.756923


[(' chemistry', ' universe', 'environment')]

In [13]:
# Full pipeline (tokenise -> vectorise -> predict) using the model from dumped_ovrrfc.pkl.
labelDataOVRRFC(vectorizeStringList(tokenizeString(title, describtion)))

Started at  2018-12-05 02:11:02.717100
Tokenisation is running.
Done at  2018-12-05 02:11:02.720093
Vectorisation is running.
Done at  2018-12-05 02:11:14.763058
Creating labels.




Done at  2018-12-05 02:11:28.287573


[(' chemistry', ' universe', 'environment')]

In [None]:
# Full pipeline (tokenise -> vectorise -> predict) using the majority vote of all three models.
labelDataVoting(vectorizeStringList(tokenizeString(title, describtion)))

In [None]:
# Full pipeline (tokenise -> vectorise -> predict) using the model from dumped_ovrknn.pkl.
labelDataOVRkNN(vectorizeStringList(tokenizeString(title, describtion)))

Started at  2018-12-05 02:11:39.851882
Tokenisation is running.
Done at  2018-12-05 02:11:39.853861
Vectorisation is running.
