In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

from sklearn.feature_selection import SelectKBest, chi2

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from collections import Counter

from sklearn.naive_bayes import MultinomialNB

In [3]:
labeled_data = pd.read_pickle('../data/labeled_data_pos_ont_nn.pkl')
drop_trash = (labeled_data['Aggr.Label'] < 90) & (labeled_data['NN_bool'] == False)
labeled_data = labeled_data[drop_trash]

In [4]:
pos_features = ['Adjective', 'Verb', 'Noun']
ont_features = ['Underspecified', 'Artifact', 'Object', 'Group', 'Human', 'Natural', 'LanguageRepresentation', 'Living', 'GeopoliticalPlace', 'BodyPart', 'Instrument', 'Place', '3rdOrderEntity', 'Mental', 'Purpose', 'Social', 'Institution', 'Plant', 'Imagerepresentation', 'Creature', 'Animal', 'Comestible', 'Quantity', 'Building', 'Substance', 'Part', 'Property', 'BoundedEvent', 'Agentive', 'Communication', 'Garment', 'Furniture', 'Vehicle', '1stOrderEntity', 'Covering', 'Liquid', 'Time', 'UnboundedEvent', 'Physical', 'Dynamic', 'Domain', 'Existence', 'Location', 'Manner', 'Container', 'Condition', 'Static', '2ndOrderEntity', 'Phenomenal', 'MoneyRepresentation', 'Experience', 'Relation', 'Form', 'Representation', 'Stimulating', 'Colour', 'Cause', 'Occupation', 'Possession', 'Artwork', 'Software']
numerical_features = ['Raw_len', 'Raw_word_count']
categorical_features = ['Label', 'Aggr.Label', 'Source', 'CoderId']
text_features = ['Lemma_stripped', 'Raw', 'Lemma']
fastt_features = ['Raw_FT_mean', 'Lemma_FT_mean']
features = pos_features + ont_features + numerical_features + text_features + fastt_features + categorical_features

fastt = 'Raw_FT_mean'
corpus = 'Lemma'
target = 'Aggr.Label'
#target = 'Label'


train_X, validation_X, train_y, validation_y = train_test_split(
    labeled_data[features],
    labeled_data[target],
    test_size=0.33,
    random_state=1,
    stratify=labeled_data[target])

In [5]:
vectorizer = TfidfVectorizer(**{'max_df': 0.3, 'max_features': None, 'norm': 'l2', 'strip_accents': 'ascii', 'use_idf': False})
train_x = vectorizer.fit_transform(train_X[corpus])
validation_x = vectorizer.transform(validation_X[corpus])

words = vectorizer.get_feature_names()

In [6]:
def get_class_tfidf_means(X, Y, c, words, n):
    class_ids = np.where(Y == c)
    return sorted( list( zip(words, X[class_ids].mean(0).getA1())), key=lambda x: x[1], reverse=True)[:n]

def get_corpus_tfidf_means(words, X, n):
    return sorted( list( zip(words, X.mean(0).getA1())), key=lambda x: x[1], reverse=True)[:n]

def get_accumulated_ranks(words, X, n):
    l = len(words)

    counts = Counter()
    
    for i in range(X.shape[0]):
        
        row = np.squeeze(X[i].toarray())
        top_id = np.argsort(row)[::-1][:10]
        counts.update([words[k] for k in top_id if row[k] != 0])

    count_words = [(k, v) for k,v in counts.items()]

    return sorted( count_words, key=lambda x: x[1], reverse=True)[:n]

In [8]:
count_corpus_top = get_accumulated_ranks(words, train_x, 25)
count_corpus_top

[('hun', 1813),
 ('norge', 1603),
 ('norsk', 1588),
 ('du', 1318),
 ('krone', 1304),
 ('prosent', 1204),
 ('man', 1057),
 ('land', 1036),
 ('oslo', 1007),
 ('politi', 933),
 ('mann', 918),
 ('gar', 873),
 ('mene', 833),
 ('million', 822),
 ('sak', 746),
 ('selskap', 735),
 ('vg', 734),
 ('barn', 730),
 ('kvinne', 626),
 ('liten', 565),
 ('slik', 535),
 ('vise', 528),
 ('gammel', 523),
 ('usa', 519),
 ('der', 508)]

In [9]:
corpus_top = get_corpus_tfidf_means(words, train_x, 25)
corpus_top

[('hun', 0.036439797563380574),
 ('norsk', 0.03262615601881231),
 ('norge', 0.03254298717974614),
 ('du', 0.027515269834434756),
 ('krone', 0.02557657338590981),
 ('mene', 0.025154597338225743),
 ('prosent', 0.02483493633248241),
 ('land', 0.023895084761470196),
 ('man', 0.023646986503030508),
 ('gar', 0.02279994195799935),
 ('liten', 0.02206173553431721),
 ('oslo', 0.02152405540694742),
 ('slik', 0.021472865798822325),
 ('mann', 0.02092330648709083),
 ('tid', 0.020362246982219372),
 ('denne', 0.02033633483154298),
 ('der', 0.0200578024957807),
 ('vise', 0.019176979966682873),
 ('million', 0.01861563884848365),
 ('ntb', 0.01858025768358999),
 ('gang', 0.018271115039061973),
 ('politi', 0.018214501494854263),
 ('sak', 0.01794386408283429),
 ('hva', 0.01789256751882543),
 ('sta', 0.017507197120050774)]

In [10]:
class_mean = get_class_tfidf_means(train_x, train_y, 18, words, 25)
class_mean

[('norsk', 0.06255350273938819),
 ('norge', 0.05356272702004951),
 ('land', 0.049224310599757584),
 ('prosent', 0.04746801690737708),
 ('krone', 0.03882839182744247),
 ('ke', 0.03583877850506815),
 ('usa', 0.03189491497061322),
 ('sverige', 0.03185911208669827),
 ('mene', 0.03051383703049594),
 ('du', 0.02862267635405802),
 ('svensk', 0.026645872341275918),
 ('ntb', 0.026628885363830516),
 ('verden', 0.02581254542222896),
 ('hun', 0.023391529341185595),
 ('million', 0.022930952500949807),
 ('vare', 0.022262905149953896),
 ('eu', 0.022227974789290138),
 ('handle', 0.02219894164532183),
 ('aftenposten', 0.022049811492129683),
 ('mellom', 0.02173980014570053),
 ('slik', 0.020741032949343773),
 ('toll', 0.02064209208301835),
 ('man', 0.020641284881917823),
 ('nordmann', 0.020591179228851692),
 ('sist', 0.020299945958600362)]

In [11]:
corpus_intersect = set([x[0] for x in count_corpus_top]) & set([x[0] for x in corpus_top])
corpus_intersect_prop = len(corpus_intersect)/25
corpus_intersect_prop

0.76

In [12]:
class_labels = np.unique(train_y)
corpus_top_words = set([t[0] for t in corpus_top])

for c in class_labels:
    top = get_class_tfidf_means(train_x, train_y, c, words, len(corpus_top_words))
    top_words = set([t[0] for t in top])
    intersect = top_words & corpus_top_words
    intersect_prop = len(intersect)/len(corpus_top_words)
    class_prop = len(np.where(train_y == c)[0]) / len(train_y)
    print(c,intersect_prop, class_prop)
    

1 0.6 0.03465671190031781
2 0.68 0.022801795893658883
3 0.64 0.07430762245876002
4 0.68 0.03743126671038693
5 0.6 0.023911617817686526
6 0.6 0.029460727437824748
7 0.76 0.026686172627755637
8 0.6 0.03157947838369571
9 0.56 0.022297331382737224
10 0.68 0.05695404328305504
12 0.56 0.1344397921606215
13 0.68 0.03611965898199062
14 0.56 0.019371437219391614
15 0.64 0.0811178933562024
16 0.64 0.08096655400292589
17 0.64 0.05513797104373707
18 0.48 0.005599556071230389
19 0.68 0.04787368208646522
20 0.6 0.07592190889370933
21 0.52 0.00721384250617969
23 0.76 0.0320334964435252
24 0.6 0.01992634818140544
25 0.4 0.026686172627755637
26 0.6 0.017504918528981485


In [13]:
def get_class_KBest(words, c, K, x, y):
    
    words = np.array(words)
    msk = np.zeros(words.shape[0], dtype=bool)
    for cl in c:
        class_y = y.copy()
        class_y[class_y != cl] = 0
        selected = SelectKBest(chi2, k=K).fit(x, class_y)
        selected_words = selected.get_support()
        msk = msk | selected_words
        
    return words[msk]

In [14]:
custom_vocab = get_class_KBest(words, class_labels, 100, train_x, train_y)

In [15]:
vocab_vectorizer = TfidfVectorizer(**{'max_df': 0.3, 'max_features': None, 'norm': 'l2', 'strip_accents': 'ascii', 'use_idf': False})
vocab_train_x = vocab_vectorizer.fit_transform(train_X[corpus])
vocab_validation_x = vocab_vectorizer.transform(validation_X[corpus])

In [16]:
clf = MultinomialNB(**{'alpha': 0.001})
clf.fit(vocab_train_x, train_y)

preds = clf.predict(vocab_validation_x)
np.mean(preds == validation_y)

0.4870954526833265