In [38]:
import pandas as pd
import numpy as np
%matplotlib inline

from sklearn.feature_selection import SelectKBest, chi2

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from collections import Counter

from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the labelled corpus from a pickled pandas object.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
labeled_data = pd.read_pickle('../../data/labeled_data_pos_ont_nn.pkl')

# Despite its name, `drop_trash` is a keep-mask: rows survive when their aggregated
# label is below 90 and `NN_bool` is False.
drop_trash = labeled_data['Aggr.Label'].lt(90) & labeled_data['NN_bool'].eq(False)
labeled_data = labeled_data.loc[drop_trash]

In [3]:
# Column groups of `labeled_data`; they are concatenated into `features` below.
# NOTE(review): the group names suggest part-of-speech, ontology, length, text and
# fastText-embedding columns -- confirm against the data-preparation code.
pos_features = ['Adjective', 'Verb', 'Noun']
ont_features = ['Underspecified', 'Artifact', 'Object', 'Group', 'Human', 'Natural', 'LanguageRepresentation', 'Living', 'GeopoliticalPlace', 'BodyPart', 'Instrument', 'Place', '3rdOrderEntity', 'Mental', 'Purpose', 'Social', 'Institution', 'Plant', 'Imagerepresentation', 'Creature', 'Animal', 'Comestible', 'Quantity', 'Building', 'Substance', 'Part', 'Property', 'BoundedEvent', 'Agentive', 'Communication', 'Garment', 'Furniture', 'Vehicle', '1stOrderEntity', 'Covering', 'Liquid', 'Time', 'UnboundedEvent', 'Physical', 'Dynamic', 'Domain', 'Existence', 'Location', 'Manner', 'Container', 'Condition', 'Static', '2ndOrderEntity', 'Phenomenal', 'MoneyRepresentation', 'Experience', 'Relation', 'Form', 'Representation', 'Stimulating', 'Colour', 'Cause', 'Occupation', 'Possession', 'Artwork', 'Software']
numerical_features = ['Raw_len', 'Raw_word_count']
categorical_features = ['Label', 'Aggr.Label', 'Source', 'CoderId']
text_features = ['Lemma_stripped', 'Raw', 'Lemma']
fastt_features = ['Raw_FT_mean', 'Lemma_FT_mean']
# NOTE: `categorical_features` contains both label columns ('Label', 'Aggr.Label'), so the
# frames produced by the split below also carry the target. Select columns explicitly
# before fitting a model on them, otherwise the label leaks into the inputs.
features = pos_features + ont_features + numerical_features + text_features + fastt_features + categorical_features

# Names of the columns used as embedding input, text corpus and prediction target.
fastt = 'Raw_FT_mean'
corpus = 'Lemma'
target = 'Aggr.Label'
#target = 'Label'


# Stratified split on the target: 67% train / 33% validation, with a fixed seed so the
# partition is reproducible.
train_X, validation_X, train_y, validation_y = train_test_split(
    labeled_data[features],
    labeled_data[target],
    test_size=0.33,
    random_state=1,
    stratify=labeled_data[target])

In [57]:
# Vectorise the whitespace-tokenised text column selected by `corpus`.
# With use_idf=False the weights are L2-normalised term frequencies (no IDF factor);
# max_df=0.3 drops terms that occur in more than 30% of the training documents.
# `str.split` tokenises exactly like `lambda x: x.split()` but, unlike a lambda, it can
# be pickled together with the fitted vectorizer.
vectorizer = TfidfVectorizer(
    tokenizer=str.split,
    max_df=0.3,
    max_features=None,
    norm='l2',
    use_idf=False,
)
train_x = vectorizer.fit_transform(train_X[corpus])
validation_x = vectorizer.transform(validation_X[corpus])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2; prefer the
# replacement when it exists and fall back for older installations. Either way `words`
# is a plain list of str, aligned with the columns of `train_x`.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = vectorizer.get_feature_names_out().tolist()
else:
    words = vectorizer.get_feature_names()

In [6]:
def get_class_tfidf_means(X, Y, c, words, n):
    """Return the `n` terms with the highest mean weight among documents of class `c`.

    Parameters
    ----------
    X : scipy sparse matrix/array or 2-D numpy array, shape (n_docs, n_terms)
        Document-term weights.
    Y : array-like of length n_docs
        Class label of every row of `X`; rows are matched by position.
    c : label
        The class whose documents are averaged.
    words : sequence of str
        Term for every column of `X`.
    n : int
        Number of (term, mean) pairs to return.

    Returns
    -------
    list of (term, mean weight) tuples, sorted by descending mean. The list is empty
    when no row is labelled `c` (the mean over zero rows is undefined).
    """
    rows = np.flatnonzero(np.asarray(Y) == c)
    if rows.size == 0:
        return []

    # `.mean(axis=0)` yields an np.matrix for scipy sparse matrices but a plain ndarray
    # for dense input and scipy sparse arrays; np.asarray(...).ravel() handles all of
    # them, whereas `.getA1()` only exists on np.matrix.
    means = np.asarray(X[rows].mean(axis=0)).ravel()
    return sorted(zip(words, means), key=lambda pair: pair[1], reverse=True)[:n]

def get_corpus_tfidf_means(words, X, n):
    """Return the `n` terms with the highest mean weight over all rows of `X`.

    Parameters
    ----------
    words : sequence of str
        Term for every column of `X`.
    X : scipy sparse matrix/array or 2-D numpy array, shape (n_docs, n_terms)
        Document-term weights.
    n : int
        Number of (term, mean) pairs to return.

    Returns
    -------
    list of (term, mean weight) tuples, sorted by descending mean; empty when `X` has
    no rows (the mean is undefined then).
    """
    if X.shape[0] == 0:
        return []

    # np.asarray(...).ravel() flattens both the np.matrix returned by scipy sparse
    # matrices and the ndarray returned by dense input / scipy sparse arrays;
    # `.getA1()` exists only on np.matrix.
    means = np.asarray(X.mean(axis=0)).ravel()
    return sorted(zip(words, means), key=lambda pair: pair[1], reverse=True)[:n]

In [7]:
def get_accumulated_ranks(words, X, n, top_k=10):
    """Count how often each term is among the `top_k` heaviest terms of a document.

    For every row of `X` the `top_k` largest entries are taken, zero entries are
    discarded, and the matching terms receive one vote each. The `n` terms with the
    most votes are returned.

    Only the stored entries of each sparse row are ranked, so the cost per row depends
    on the number of non-zeros rather than on the vocabulary size (converting every
    row to a dense vector and sorting it is prohibitively slow for a large
    vocabulary). The selection is the same as ranking the dense row; only the choice
    among exactly tied weights at the cut-off may differ.

    Parameters
    ----------
    words : sequence of str
        Term for every column of `X`.
    X : scipy sparse matrix, shape (n_docs, n_terms)
        Document-term weights; converted to CSR if necessary.
    n : int
        Number of (term, votes) pairs to return.
    top_k : int, default 10
        How many of the heaviest terms per document receive a vote.

    Returns
    -------
    list of (term, votes) tuples, sorted by descending vote count.
    """
    csr = X.tocsr()
    if not csr.has_canonical_format:
        # Duplicate (row, col) entries must be merged before ranking; work on a copy
        # so the caller's matrix is left untouched.
        csr = csr.copy()
        csr.sum_duplicates()

    n_cols = csr.shape[1]
    counts = Counter()

    for start, end in zip(csr.indptr[:-1], csr.indptr[1:]):
        cols = csr.indices[start:end]
        vals = csr.data[start:end]

        # Explicitly stored zeros never receive a vote.
        nonzero = vals != 0
        cols, vals = cols[nonzero], vals[nonzero]

        # Heaviest entries first.
        order = np.argsort(vals, kind='stable')[::-1]
        cols, vals = cols[order], vals[order]

        # In the dense row the order is: positives, zeros, negatives. Zeros occupy
        # ranking slots but get no vote, so negative entries only qualify once the
        # positives and all zeros together leave slots free.
        n_pos = int(np.count_nonzero(vals > 0))
        n_zero = n_cols - vals.size
        n_neg = max(0, top_k - n_pos - n_zero)
        chosen = list(cols[:min(n_pos, top_k)]) + list(cols[n_pos:n_pos + n_neg])

        counts.update(words[k] for k in chosen)

    return sorted(counts.items(), key=lambda item: item[1], reverse=True)[:n]

In [58]:
# Rank terms by how many training documents place them among their heaviest terms and
# keep the 100 most frequent ones.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the values
# shown by later cells may stem from an earlier run.
count_corpus_top = get_accumulated_ranks(words=words, X=train_x, n=100)

KeyboardInterrupt: 

In [16]:
# Terms only, without their vote counts.
[word for word, votes in count_corpus_top]

['hun',
 'norge',
 'norsk',
 'du',
 'krone',
 'prosent',
 'man',
 'land',
 'oslo',
 'politi',
 'mann',
 'gar',
 'mene',
 'million',
 'sak',
 'selskap',
 'vg',
 'barn',
 'kvinne',
 'liten',
 'slik',
 'vise',
 'gammel',
 'usa',
 'der',
 'hva',
 'finne',
 'var',
 'bruke',
 'tid',
 'tro',
 'gang',
 'aring',
 'ntb',
 'ke',
 'denne',
 'tidlig',
 'skrive',
 'folk',
 'parti',
 'milliard',
 'bil',
 'under',
 'hvor',
 'sist',
 'tre',
 'min',
 'eu',
 'siden',
 'mellom',
 'iflge',
 'rett',
 'kommune',
 'viktig',
 '000',
 'hans',
 'burde',
 'leder',
 'uke',
 'regjering',
 'menneske',
 'person',
 'aftenposten',
 'verden',
 'liv',
 'penge',
 'president',
 'skole',
 'hy',
 'mte',
 'frste',
 'ingen',
 'lang',
 'sta',
 'politisk',
 'nske',
 'pris',
 'legge',
 'egen',
 'drepe',
 'fortelle',
 'her',
 'rundt',
 'skje',
 'kamp',
 'selge',
 'ansatt',
 'tall',
 'by',
 'hvis',
 '10',
 'tv',
 'bade',
 'jobb',
 'ned',
 'fire',
 'fjor',
 'ap',
 'jente',
 'amerikansk']

In [24]:
# The 25 terms with the highest mean weight among training documents labelled 18.
[word for word, mean_weight in get_class_tfidf_means(X=train_x, Y=train_y, c=18, words=words, n=25)]

['norsk',
 'norge',
 'land',
 'prosent',
 'krone',
 'ke',
 'usa',
 'sverige',
 'mene',
 'du',
 'svensk',
 'ntb',
 'verden',
 'hun',
 'million',
 'vare',
 'eu',
 'handle',
 'aftenposten',
 'mellom',
 'slik',
 'toll',
 'man',
 'nordmann',
 'sist']

In [59]:
# The 100 terms with the highest mean weight over the whole training set,
# as (term, mean weight) pairs.
corpus_top = get_corpus_tfidf_means(words=words, X=train_x, n=100)
corpus_top

[('hun', 0.03679772952281546),
 ('norsk', 0.032547337712039145),
 ('$?', 0.031215093265825822),
 ('norge', 0.03078468777543775),
 ('du', 0.02734389331220279),
 ('krone', 0.025892148537540866),
 ('mene', 0.02550733003104852),
 ('prosent', 0.02498723745219147),
 ('man', 0.0235673563033522),
 ('går', 0.023197255884258423),
 ('land', 0.022996185569323466),
 ('liten', 0.022224659969511822),
 ('før', 0.021936021176808092),
 ('slik', 0.021642812312290892),
 ('mann', 0.02075133781561205),
 ('denne', 0.020519954113338096),
 ('der', 0.020233468856998314),
 ('oslo', 0.019890461463660757),
 ('vise', 0.01936214164949951),
 ('tid', 0.019200190869811874),
 ('million', 0.018791269833757673),
 ('gang', 0.01839503592427513),
 ('politi', 0.018076763036248552),
 ('hva', 0.017744077141693137),
 ('stå', 0.01759687949630474),
 ('sist', 0.01741824791239221),
 ('tidlig', 0.01724624315406099),
 ('bruke', 0.017168844291142744),
 ('under', 0.017140408578401573),
 ('sak', 0.016708868127309186),
 ('siden', 0.016608

In [12]:
# Terms only, without their mean weights.
[word for word, mean_weight in corpus_top]

['hun',
 'norsk',
 'norge',
 'du',
 'krone',
 'mene',
 'prosent',
 'land',
 'man',
 'gar',
 'liten',
 'oslo',
 'slik',
 'mann',
 'tid',
 'denne',
 'der',
 'vise',
 'million',
 'ntb',
 'gang',
 'politi',
 'sak',
 'hva',
 'sta']

In [20]:
# Overlap between the two corpus-level rankings: terms that are in the vote-based top
# list (`count_corpus_top`) as well as in the mean-weight top list (`corpus_top`).
corpus_intersect = {word for word, votes in count_corpus_top} & {word for word, weight in corpus_top}

# Share of the mean-weight list that is also in the vote-based list. The denominator
# follows the actual list length instead of a hard-coded 100, so it stays correct when
# the `n` used to build the lists changes.
corpus_intersect_prop = len(corpus_intersect) / len(corpus_top) if corpus_top else 0.0
corpus_intersect_prop

0.77

In [25]:
# For every class, print: the label, the share of the corpus-wide top terms that also
# appear in the class's own top terms (same list length), and the share of training
# documents that belong to the class.
class_labels = np.unique(train_y)
corpus_top_words = {entry[0] for entry in corpus_top}

for c in class_labels:
    class_top = get_class_tfidf_means(train_x, train_y, c, words, len(corpus_top_words))
    shared = corpus_top_words.intersection(entry[0] for entry in class_top)
    overlap_share = len(shared) / len(corpus_top_words)
    class_share = np.count_nonzero(train_y == c) / len(train_y)
    print(c, overlap_share, class_share)
    

1 0.73 0.03465671190031781
2 0.83 0.022801795893658883
3 0.82 0.07430762245876002
4 0.8 0.03743126671038693
5 0.73 0.023911617817686526
6 0.72 0.029460727437824748
7 0.79 0.026686172627755637
8 0.76 0.03157947838369571
9 0.84 0.022297331382737224
10 0.77 0.05695404328305504
12 0.77 0.1344397921606215
13 0.82 0.03611965898199062
14 0.75 0.019371437219391614
15 0.78 0.0811178933562024
16 0.79 0.08096655400292589
17 0.82 0.05513797104373707
18 0.65 0.005599556071230389
19 0.76 0.04787368208646522
20 0.77 0.07592190889370933
21 0.74 0.00721384250617969
23 0.83 0.0320334964435252
24 0.72 0.01992634818140544
25 0.61 0.026686172627755637
26 0.81 0.017504918528981485


In [152]:
# Top-10 terms of class 18 as a small table: `word`, rank `y` (1 = heaviest) and mean
# weight `x` -- the column layout the (disabled) scatter plot below expects.
#
# `get_class_tfidf_means` takes (X, Y, c, words, n). The previous call passed row ids
# and only three arguments, which raises TypeError on a fresh kernel. `words` from the
# vectorizer cell is reused rather than re-queried from the vectorizer.
class_top = get_class_tfidf_means(train_x, train_y, 18, words, 10)

df = pd.DataFrame(class_top, columns=['word', 'x'])
df.insert(1, 'y', np.arange(1, len(df) + 1))
#ax = df.plot.scatter(x='x', y='y')
#df[['x', 'y', 'word']].apply(lambda x: ax.text(*x, rotation=0, rotation_mode='anchor'), axis=1);
df

Unnamed: 0,word,y,x
0,norsk,1,0.062554
1,norge,2,0.053563
2,land,3,0.049224
3,prosent,4,0.047468
4,krone,5,0.038828
5,ke,6,0.035839
6,usa,7,0.031895
7,sverige,8,0.031859
8,mene,9,0.030514
9,du,10,0.028623


In [169]:
# Index labels of the training rows whose target is 18.
train_y.loc[train_y == 18].index

Int64Index([ 960101, 1222831,  902857,  416490,  463979, 1328550, 1528755,
            1372426,  941033,  534793,
            ...
            1023255,  676612,  595193, 1260345,  589074,  304335, 1102365,
             304338,  199586,  108149],
           dtype='int64', length=111)

In [158]:
def get_corpus_tfidf(words, tfidfs, n):
    """Return the `n` terms with the highest total weight over all rows of `tfidfs`.

    Parameters
    ----------
    words : sequence of str
        Term for every column of `tfidfs`.
    tfidfs : scipy sparse matrix/array or 2-D numpy array, shape (n_docs, n_terms)
        Document-term weights.
    n : int
        Number of (term, total) pairs to return.

    Returns
    -------
    list of (term, summed weight) tuples, sorted by descending total.
    """
    # np.asarray(...).ravel() flattens both the np.matrix returned by scipy sparse
    # matrices and the ndarray returned by dense input / scipy sparse arrays;
    # `.getA1()` exists only on np.matrix.
    totals = np.asarray(tfidfs.sum(axis=0)).ravel()
    return sorted(zip(words, totals), key=lambda pair: pair[1], reverse=True)[:n]

In [159]:
# The 25 terms with the highest summed weight over the training set.
get_corpus_tfidf(words=words, tfidfs=train_x, n=25)

[('hun', 722.3461070988958),
 ('norsk', 646.7482907609138),
 ('norge', 645.099634864106),
 ('du', 545.4351939279998),
 ('krone', 507.0044142288883),
 ('mene', 498.6395830356461),
 ('prosent', 492.30294291879756),
 ('land', 473.67226522662304),
 ('man', 468.7542134495717),
 ('gar', 451.9632494334225),
 ('liten', 437.3297834967681),
 ('oslo', 426.6713503319191),
 ('slik', 425.65661873005354),
 ('mann', 414.7627044936016),
 ('tid', 403.6408219285322),
 ('denne', 403.1271653656752),
 ('der', 397.60581887386127),
 ('vise', 380.1452738795548),
 ('million', 369.0178088934919),
 ('ntb', 368.31644806180367),
 ('gang', 362.18831341932486),
 ('politi', 361.0660631324958),
 ('sak', 355.7012177140237),
 ('hva', 354.68436592567406),
 ('sta', 347.0451685107662)]

In [60]:
def get_class_KBest(words, c, K, x, y):
    """Union of the `K` best chi-squared terms of every one-vs-rest problem.

    For each class in `c` the target is reduced to "is this class / is not", the `K`
    terms with the highest chi2 score for that binary problem are selected, and the
    selections of all classes are merged.

    Parameters
    ----------
    words : sequence of str
        Term for every column of `x`.
    c : iterable of labels
        Classes to run the one-vs-rest selection for.
    K : int or 'all'
        Number of terms selected per class (passed to `SelectKBest` as `k`).
    x : array-like or sparse matrix, shape (n_docs, n_terms)
        Non-negative document-term weights, as chi2 requires.
    y : array-like of length n_docs
        Class label of every row of `x`.

    Returns
    -------
    numpy array with the selected terms, in column order of `x`.
    """
    words = np.asarray(words)
    labels = np.asarray(y)
    msk = np.zeros(words.shape[0], dtype=bool)

    for cl in c:
        # A 0/1 indicator instead of "overwrite every other label with 0": the latter
        # collapses to a single-class target when a real class is itself labelled 0
        # and needs a full copy of `y` per class. chi2 only looks at class membership,
        # so the scores are the same for all other labels.
        is_class = (labels == cl).astype(int)
        selector = SelectKBest(chi2, k=K).fit(x, is_class)
        msk |= selector.get_support()

    return words[msk]

    
    

In [71]:
# Vocabulary = union over all classes of each class's 100 best chi2 terms.
custom_vocab = get_class_KBest(words=words, c=class_labels, K=100, x=train_x, y=train_y)

In [67]:
# Inspect the selected vocabulary (array of terms returned by get_class_KBest).
custom_vocab

array(['2022', 'afghanistan', 'aksje', 'ambassade', 'ambassadør',
       'angrep', 'arbeidsgiver', 'arbeidsliv', 'arbeidstager',
       'arbeidstilsynet', 'asyl', 'asylmottak', 'asylsøker', 'asylsøknad',
       'avis', 'bank', 'barn', 'barnehage', 'barnehageplass', 'barnevern',
       'bedrift', 'behandling', 'bibliotek', 'bil', 'bilist', 'biskop',
       'blatter', 'bolig', 'boligmarked', 'boligpris', 'bombe', 'bonde',
       'brann', 'brannvesen', 'burka', 'børs', 'digital',
       'diskriminering', 'dnb', 'doggerland', 'drap', 'drepe', 'dyr',
       'dømme', 'e18', 'eiendom', 'eiendomsmegler', 'eiendomsskatt',
       'elbil', 'elev', 'enso', 'eu', 'eu-medlemskap', 'evakuere', 'fag',
       'fengsel', 'ferdsel', 'festekontrakt', 'fifa', 'fisk', 'fly',
       'flyktning', 'fn', 'forelder', 'forlag', 'forsvaret', 'fosterhjem',
       'fotballforbund', 'frp', 'gjedrem', 'global', 'grensehandel',
       'grimsdalshytta', 'gud', 'helse', 'helseminister', 'helsetilsynet',
       'hijab', '

In [77]:
# TF-IDF restricted to the chi2-selected vocabulary: only terms in `custom_vocab`
# become columns. IDF weighting is enabled here and rows are L2-normalised.
vocab_vectorizer = TfidfVectorizer(
    tokenizer=lambda doc: doc.split(),
    vocabulary=custom_vocab,
    min_df=1,
    max_features=None,
    norm='l2',
    use_idf=True,
)
vocab_train_x = vocab_vectorizer.fit_transform(train_X[corpus])
vocab_validation_x = vocab_vectorizer.transform(validation_X[corpus])

In [73]:
# Shape of the restricted training matrix: (rows of train_X, terms in custom_vocab).
vocab_train_x.shape

(19823, 2320)

In [79]:
# Multinomial naive Bayes on the restricted TF-IDF features, with light smoothing
# (alpha=0.001). `fit` returns the estimator itself, so construction and fitting
# can be chained.
clf = MultinomialNB(alpha=0.001).fit(vocab_train_x, train_y)

# Accuracy on the validation split: fraction of predictions equal to the true label.
preds = clf.predict(vocab_validation_x)
(preds == validation_y).mean()

0.48166734944694795