In [28]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import io
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC

from sklearn.pipeline import Pipeline, FeatureUnion

from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
import string

import re

from imblearn.combine import SMOTEENN
from imblearn.under_sampling import ClusterCentroids

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.utils.multiclass import unique_labels

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold

from gensim.models.fasttext import load_facebook_vectors

In [29]:
fbkv = load_facebook_vectors('../../../datafiles/81/parameters.bin')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [149]:
def recode_topics(topic):
    recoded = topic
    if len(topic) == 3:
        recoded = topic[:1]
    if len(topic) == 4:
        recoded = topic[:2]
    return recoded

def get_article(dataroot, id):
    article = ""
    try:
        with io.open(dataroot+id+'.txt', mode='r', encoding='utf-8') as raw:
            temp = raw.readlines()
            article = article.join(temp)
    except:
        try:
            with io.open(dataroot+id+'_1.txt', mode='r', encoding='utf-8') as raw:
                temp = raw.readlines()
                article = article.join(temp)
        except:
            article = None
            print('file not found!')
    return article

def get_articles(dataroot, ids):
    articles = []
    for index, val in ids.items():
        article = get_article(dataroot, val)
        articles.append(article)
    return articles

def get_fasttext_means(ft, corpus):

    vectorizer = CountVectorizer(analyzer='word')
    vectorized = vectorizer.fit_transform(corpus.values)
    embedding_matrix = np.zeros((vectorized.shape[1], 100))
    
    words = []
    for index, word in enumerate(vectorizer.get_feature_names()):
        words.append(word)
        embedding_matrix[index] = ft.get_vector(word)

    means = []
    for index in range(vectorized.shape[0]):
        means.append( np.mean(embedding_matrix[vectorized.getrow(index).indices], axis=0) )
        
    return means


def get_fasttext_tfidf_weighted(ft, corpus):

    vectorizer = TfidfVectorizer()
    vectorized = vectorizer.fit_transform(corpus.values)
    
    embedding_matrix = np.zeros((vectorized.shape[1], 100))
    
    words = []
    for index, word in enumerate(vectorizer.get_feature_names()):
        words.append(word)
        embedding_matrix[index] = ft.get_vector(word)

    tfidf_weighted_vecs = []
    for index in range(vectorized.shape[0]):
        
        doc_indices = vectorized.getrow(index).indices
        doc_vec = embedding_matrix[doc_indices]
        doc_weights = np.asarray(vectorized.getrow(index).todense()[0,doc_indices]).reshape(-1)
        weighted = np.dot(doc_weights.T, doc_vec) / np.sum(doc_weights)
        tfidf_weighted_vecs.append( weighted )
        
    return tfidf_weighted_vecs

def FT_to_matrix(data):
    M = np.zeros((len(data), 100))
    for index in range(len(data)):
        M[index] = data[index]
        
    return M

def print_stats(preds, validation_y):
    print('Accuracy = {}'.format(accuracy_score(validation_y, preds)))
    print('Classification report:')
    print(classification_report(validation_y, preds))
    plot_confusion_matrix(validation_y, preds, normalize=True, title='Normalized confusion matrix')

In [143]:
labeled_data = pd.read_pickle('../../data/labeled_data_pos_ont_nn.pkl')

In [145]:
labeled_data['Raw_FT_weighted'] = get_fasttext_tfidf_weighted(fbkv, labeled_data['Raw'])

In [150]:
labeled_data.to_pickle('../../data/labeled_data_pos_ont_nn_ftweighted.pkl')

In [2]:
data = pd.read_csv("../../data_hu.csv")
data[data['_source.codes.majorTopic'].notnull()]

Unnamed: 0,_id,_source.codes.majorTopic,_source.publication_date,_source.doctype,_source.codes.coderId
234,ZrnCbmABVo8DrD4X_sWL,1601.0,2000-08-06T02:00:00Z,Aftenposten,1
254,X8THbmABVo8DrD4XfiV1,12.0,2012-08-29T02:00:00Z,Aftenposten,1
276,68_NbmABVo8DrD4XO_IN,12.0,2005-04-30T02:00:00Z,VG,1
277,-L_FbmABVo8DrD4XQU8d,16.0,2006-04-28T02:00:00Z,Aftenposten,1
370,jdjQbmABVo8DrD4XdGPO,401.0,2015-01-16T03:00:00Z,VG,2
...,...,...,...,...,...
1559546,SLrDbmABVo8DrD4XTVvC,93.0,2001-11-17T03:00:00Z,Aftenposten,1
1559553,8LzEbmABVo8DrD4XEDsX,15.0,2003-10-18T02:00:00Z,Aftenposten,2
1559555,MLzEbmABVo8DrD4XMIiG,16.0,2003-06-23T02:00:00Z,Aftenposten,3
1559568,IbvDbmABVo8DrD4XtEgr,17.0,2002-11-14T03:00:00Z,Aftenposten,1


In [15]:
aftvg = data.groupby('_source.doctype')['_source.codes.majorTopic'].count().tolist()
print(aftvg[0]/sum(aftvg))
print(aftvg[1]/sum(aftvg))

0.6049085014602256
0.3950914985397744


In [17]:
labeled_data = data[data['_source.codes.majorTopic'].notnull()]
labeled_data.columns = ['Id', 'Label', 'Pub.Date', 'Source', 'CoderId']
labeled_data['Aggr.Label'] = labeled_data['Label'].astype(int).astype(str).apply(recode_topics).astype(int)
labeled_data['Label'] = labeled_data['Label'].astype(int)
labeled_data['Source'] = labeled_data['Source'].astype('category')
labeled_data['Pub.Date'] = pd.to_datetime(labeled_data['Pub.Date'])
labeled_data['CoderId'].fillna(0, inplace=True)
labeled_data['CoderId'] = labeled_data['CoderId'].astype(int)
labeled_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

Unnamed: 0,Id,Label,Pub.Date,Source,CoderId,Aggr.Label
234,ZrnCbmABVo8DrD4X_sWL,1601,2000-08-06 02:00:00+00:00,Aftenposten,1,16
254,X8THbmABVo8DrD4XfiV1,12,2012-08-29 02:00:00+00:00,Aftenposten,1,12
276,68_NbmABVo8DrD4XO_IN,12,2005-04-30 02:00:00+00:00,VG,1,12
277,-L_FbmABVo8DrD4XQU8d,16,2006-04-28 02:00:00+00:00,Aftenposten,1,16
370,jdjQbmABVo8DrD4XdGPO,401,2015-01-16 03:00:00+00:00,VG,2,4


In [20]:
labeled_data['Aggr.Label'].unique()

array([16, 12,  4, 20,  3, 23, 14,  5, 19, 15, 10,  1, 13,  8, 17,  7,  6,
        2, 18,  9, 21, 92, 25, 93, 91, 24, 26], dtype=int64)

In [6]:
articles = get_articles('../../data_hu/', labeled_data['Id'])
labeled_data['Raw'] = articles
labeled_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Id,Label,Pub.Date,Source,CoderId,Aggr.Label,Raw
234,ZrnCbmABVo8DrD4X_sWL,1601,2000-08-06 02:00:00+00:00,Aftenposten,1,16,Islamister mistenkes for massakre på Filippine...
254,X8THbmABVo8DrD4XfiV1,12,2012-08-29 02:00:00+00:00,Aftenposten,1,12,Kart i politiets daglige virke. I et innlegg i...
276,68_NbmABVo8DrD4XO_IN,12,2005-04-30 02:00:00+00:00,VG,1,12,Kastet arbeider til løvene. Den fargede gårdsa...
277,-L_FbmABVo8DrD4XQU8d,16,2006-04-28 02:00:00+00:00,Aftenposten,1,16,Karl Rove grilles videre av storjury. Presiden...
370,jdjQbmABVo8DrD4XdGPO,401,2015-01-16 03:00:00+00:00,VG,2,4,Norske gründere med hårete mål - Vi skal erobr...


In [7]:
articles_lem = get_articles('../../data_hu_lemma/', labeled_data['Id'])
labeled_data['Lemma'] = articles_lem
labeled_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Id,Label,Pub.Date,Source,CoderId,Aggr.Label,Raw,Lemma
234,ZrnCbmABVo8DrD4X_sWL,1601,2000-08-06 02:00:00+00:00,Aftenposten,1,16,Islamister mistenkes for massakre på Filippine...,islamist mistenke for massaker på Filippin $. ...
254,X8THbmABVo8DrD4XfiV1,12,2012-08-29 02:00:00+00:00,Aftenposten,1,12,Kart i politiets daglige virke. I et innlegg i...,kart i politi daglig virke $. i en innlegg i A...
276,68_NbmABVo8DrD4XO_IN,12,2005-04-30 02:00:00+00:00,VG,1,12,Kastet arbeider til løvene. Den fargede gårdsa...,kastet arbeide til løve $. den farge gårdsarbe...
277,-L_FbmABVo8DrD4XQU8d,16,2006-04-28 02:00:00+00:00,Aftenposten,1,16,Karl Rove grilles videre av storjury. Presiden...,Karl Rove grille vid av storjury $. president ...
370,jdjQbmABVo8DrD4XdGPO,401,2015-01-16 03:00:00+00:00,VG,2,4,Norske gründere med hårete mål - Vi skal erobr...,norsk gründer med håret mål $- vi skulle erobr...


In [8]:
labeled_data['Raw_len'] = labeled_data['Raw'].astype(str).apply(len)
labeled_data['Raw_word_count'] = labeled_data['Raw'].apply(lambda x: len(str(x).split()))
labeled_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Id,Label,Pub.Date,Source,CoderId,Aggr.Label,Raw,Lemma,Raw_len,Raw_word_count
234,ZrnCbmABVo8DrD4X_sWL,1601,2000-08-06 02:00:00+00:00,Aftenposten,1,16,Islamister mistenkes for massakre på Filippine...,islamist mistenke for massaker på Filippin $. ...,2687,407
254,X8THbmABVo8DrD4XfiV1,12,2012-08-29 02:00:00+00:00,Aftenposten,1,12,Kart i politiets daglige virke. I et innlegg i...,kart i politi daglig virke $. i en innlegg i A...,1927,300
276,68_NbmABVo8DrD4XO_IN,12,2005-04-30 02:00:00+00:00,VG,1,12,Kastet arbeider til løvene. Den fargede gårdsa...,kastet arbeide til løve $. den farge gårdsarbe...,1070,175
277,-L_FbmABVo8DrD4XQU8d,16,2006-04-28 02:00:00+00:00,Aftenposten,1,16,Karl Rove grilles videre av storjury. Presiden...,Karl Rove grille vid av storjury $. president ...,889,144
370,jdjQbmABVo8DrD4XdGPO,401,2015-01-16 03:00:00+00:00,VG,2,4,Norske gründere med hårete mål - Vi skal erobr...,norsk gründer med håret mål $- vi skulle erobr...,2801,476


In [9]:
labeled_data['Lemma_FT_mean'] = get_fasttext_means(fbkv, labeled_data['Lemma'])
labeled_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Id,Label,Pub.Date,Source,CoderId,Aggr.Label,Raw,Lemma,Raw_len,Raw_word_count,Lemma_FT_mean
234,ZrnCbmABVo8DrD4X_sWL,1601,2000-08-06 02:00:00+00:00,Aftenposten,1,16,Islamister mistenkes for massakre på Filippine...,islamist mistenke for massaker på Filippin $. ...,2687,407,"[-0.06584884587773798, -0.04098963224850102, 0..."
254,X8THbmABVo8DrD4XfiV1,12,2012-08-29 02:00:00+00:00,Aftenposten,1,12,Kart i politiets daglige virke. I et innlegg i...,kart i politi daglig virke $. i en innlegg i A...,1927,300,"[-0.23100158159451728, 0.027330515746055453, 0..."
276,68_NbmABVo8DrD4XO_IN,12,2005-04-30 02:00:00+00:00,VG,1,12,Kastet arbeider til løvene. Den fargede gårdsa...,kastet arbeide til løve $. den farge gårdsarbe...,1070,175,"[-0.13832956393958912, -0.09283030465913249, 0..."
277,-L_FbmABVo8DrD4XQU8d,16,2006-04-28 02:00:00+00:00,Aftenposten,1,16,Karl Rove grilles videre av storjury. Presiden...,Karl Rove grille vid av storjury $. president ...,889,144,"[-0.2623476237175055, -0.00673417117795907, 0...."
370,jdjQbmABVo8DrD4XdGPO,401,2015-01-16 03:00:00+00:00,VG,2,4,Norske gründere med hårete mål - Vi skal erobr...,norsk gründer med håret mål $- vi skulle erobr...,2801,476,"[-0.20017048150353203, -0.05119177845478597, 0..."


In [10]:
labeled_data['Raw_FT_mean'] = get_fasttext_means(fbkv, labeled_data['Raw'])
labeled_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Id,Label,Pub.Date,Source,CoderId,Aggr.Label,Raw,Lemma,Raw_len,Raw_word_count,Lemma_FT_mean,Raw_FT_mean
234,ZrnCbmABVo8DrD4X_sWL,1601,2000-08-06 02:00:00+00:00,Aftenposten,1,16,Islamister mistenkes for massakre på Filippine...,islamist mistenke for massaker på Filippin $. ...,2687,407,"[-0.06584884587773798, -0.04098963224850102, 0...","[0.0061439156458215965, -0.03935358862604442, ..."
254,X8THbmABVo8DrD4XfiV1,12,2012-08-29 02:00:00+00:00,Aftenposten,1,12,Kart i politiets daglige virke. I et innlegg i...,kart i politi daglig virke $. i en innlegg i A...,1927,300,"[-0.23100158159451728, 0.027330515746055453, 0...","[-0.2131826601575285, 0.025133214832412758, -0..."
276,68_NbmABVo8DrD4XO_IN,12,2005-04-30 02:00:00+00:00,VG,1,12,Kastet arbeider til løvene. Den fargede gårdsa...,kastet arbeide til løve $. den farge gårdsarbe...,1070,175,"[-0.13832956393958912, -0.09283030465913249, 0...","[-0.07781700275372713, -0.06978481634988301, 0..."
277,-L_FbmABVo8DrD4XQU8d,16,2006-04-28 02:00:00+00:00,Aftenposten,1,16,Karl Rove grilles videre av storjury. Presiden...,Karl Rove grille vid av storjury $. president ...,889,144,"[-0.2623476237175055, -0.00673417117795907, 0....","[-0.22134696929198172, 0.013158530245224636, 0..."
370,jdjQbmABVo8DrD4XdGPO,401,2015-01-16 03:00:00+00:00,VG,2,4,Norske gründere med hårete mål - Vi skal erobr...,norsk gründer med håret mål $- vi skulle erobr...,2801,476,"[-0.20017048150353203, -0.05119177845478597, 0...","[-0.2003348228373203, -0.06986049747279556, -0..."


In [11]:
labeled_data['Lemma_stripped'] = labeled_data['Lemma'].astype(str).apply(lambda x: re.sub(r'\$.', '', x))
labeled_data['Lemma_stripped'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


234    islamist mistenke for massaker på Filippin  Un...
254    kart i politi daglig virke  i en innlegg i Aft...
276    kastet arbeide til løve  den farge gårdsarbeid...
277    Karl Rove grille vid av storjury  president Ge...
370    norsk gründer med håret mål  vi skulle erobre ...
Name: Lemma_stripped, dtype: object

In [13]:
labeled_data.to_pickle('../../data/labeled_data.pkl')