In [56]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler

from keyness import log_likelihood

In [26]:
# Toy example: vectorise two tiny "corpus" documents and one "reference"
# document with plain term frequencies (use_idf=False), then list the
# vocabulary learned from the corpus.
corpus = ['islamist mistenke for massaker på Filippin', 'Uniformere og tung bevæpne mann overfalle']
reference = ['islamist Ugjerning finne sted fredag kveld bevæpne mann']

# Each text collection gets its own vectorizer, so the two vocabularies are
# fitted independently of each other.
corpus_vec = TfidfVectorizer(use_idf=False)
corpus_vectorized = corpus_vec.fit_transform(corpus)
reference_vec = TfidfVectorizer(use_idf=False)
reference_vectorized = reference_vec.fit_transform(reference)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method so
# the cell still runs on installations that predate the new one.
if hasattr(corpus_vec, 'get_feature_names_out'):
    # .tolist() turns the returned array into plain Python strings so the
    # printed form is an ordinary list.
    feature_names = corpus_vec.get_feature_names_out().tolist()
else:
    feature_names = corpus_vec.get_feature_names()

print(feature_names)

#log_likelihood(corpus, reference)

['bevæpne', 'filippin', 'for', 'islamist', 'mann', 'massaker', 'mistenke', 'og', 'overfalle', 'på', 'tung', 'uniformere']


In [3]:
# Load the labelled data set from the pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
labeled_data = pd.read_pickle('../../data/labeled_data_pos_ont.pkl')

# Boolean mask selecting rows whose aggregated label is below 90; rows with a
# label of 90 or more are discarded (presumably "trash" codes, going by the
# mask's name — confirm against the codebook).
drop_trash = labeled_data['Aggr.Label'].lt(90)
labeled_data = labeled_data.loc[drop_trash]

In [32]:
# Split the 'Lemma' texts and their 'Aggr.Label' targets into a training part
# and a 33% validation part. The split is stratified on the label column and
# seeded so that it is repeatable.
train_X, validation_X, train_y, validation_y = train_test_split(
    labeled_data['Lemma'], labeled_data['Aggr.Label'],
    stratify=labeled_data['Aggr.Label'],
    random_state=42,
    test_size=0.33,
)

In [55]:
# Text features: term frequencies only (no IDF weighting), L2-normalised,
# accents stripped, and terms that occur in more than 30% of the training
# documents dropped.
vec = TfidfVectorizer(
    max_df=0.3,
    max_features=None,
    norm='l2',
    strip_accents='unicode',
    use_idf=False,
)
corpus_vectorized = vec.fit_transform(train_X)

# Keep the terms with the highest chi-squared scores against the labels.
# k is capped at the vocabulary size: requesting more features than exist is
# rejected (ValueError) by older scikit-learn releases and triggers a warning
# in newer ones.
selector = SelectKBest(chi2, k=min(30000, corpus_vectorized.shape[1]))
corpus_vectorized_reduced = selector.fit_transform(corpus_vectorized, train_y)

# NOTE(review): the scaler is created but never applied in this cell; it is
# kept only in case a later cell relies on the name.
scaler = StandardScaler()

clf = MultinomialNB(alpha=0.0015)
clf.fit(corpus_vectorized_reduced, train_y)

# Validation texts go through the already-fitted vectorizer and selector
# (transform only, no re-fitting) before prediction.
preds = clf.predict(selector.transform(vec.transform(validation_X)))

acc = accuracy_score(validation_y, preds)

print(acc)

0.4842072667217176


In [29]:
#labeled_data[labeled_data['Aggr.Label'] == 16].head()['Lemma'].tolist()

In [4]:
# Show the first five rows of labeled_data as the cell's output.
labeled_data.head(n=5)

Unnamed: 0,Id,Label,Pub.Date,Source,CoderId,Aggr.Label,Raw,Lemma,Raw_len,Raw_word_count,...,Form,Representation,Stimulating,Colour,Cause,Occupation,Possession,Artwork,Software,None
234,ZrnCbmABVo8DrD4X_sWL,1601,2000-08-06 02:00:00,Aftenposten,1,16,Islamister mistenkes for massakre på Filippine...,islamist mistenke for massaker på Filippin $. ...,2687,407,...,0,0,0,0,5,4,1,0,0,0
254,X8THbmABVo8DrD4XfiV1,12,2012-08-29 02:00:00,Aftenposten,1,12,Kart i politiets daglige virke. I et innlegg i...,kart i politi daglig virke $. i en innlegg i A...,1927,300,...,0,0,0,0,4,0,0,0,0,0
276,68_NbmABVo8DrD4XO_IN,12,2005-04-30 02:00:00,VG,1,12,Kastet arbeider til løvene. Den fargede gårdsa...,kastet arbeide til løve $. den farge gårdsarbe...,1070,175,...,0,0,0,0,3,2,1,0,0,0
277,-L_FbmABVo8DrD4XQU8d,16,2006-04-28 02:00:00,Aftenposten,1,16,Karl Rove grilles videre av storjury. Presiden...,Karl Rove grille vid av storjury $. president ...,889,144,...,0,0,0,0,3,3,0,0,0,0
370,jdjQbmABVo8DrD4XdGPO,401,2015-01-16 03:00:00,VG,2,40,Norske gründere med hårete mål - Vi skal erobr...,norsk gründer med håret mål $- vi skulle erobr...,2801,476,...,0,0,0,0,6,1,7,0,0,0


In [None]:
# Names of the numeric input columns: the two length measures first, then the
# category columns (presumably ontology-category counts, given the `_ont`
# suffix of the data file — confirm).
numerical_features = [
    'Raw_len', 'Raw_word_count',
    'Underspecified', 'Artifact', 'Object', 'Group', 'Human', 'Natural',
    'LanguageRepresentation', 'Living', 'GeopoliticalPlace', 'BodyPart',
    'Instrument', 'Place', '3rdOrderEntity', 'Mental', 'Purpose', 'Social',
    'Institution', 'Plant', 'Imagerepresentation', 'Creature', 'Animal',
    'Comestible', 'Quantity', 'Building', 'Substance', 'Part', 'Property',
    'BoundedEvent', 'Agentive', 'Communication', 'Garment', 'Furniture',
    'Vehicle', '1stOrderEntity', 'Covering', 'Liquid', 'Time',
    'UnboundedEvent', 'Physical', 'Dynamic', 'Domain', 'Existence',
    'Location', 'Manner', 'Container', 'Condition', 'Static',
    '2ndOrderEntity', 'Phenomenal', 'MoneyRepresentation', 'Experience',
    'Relation', 'Form', 'Representation', 'Stimulating', 'Colour', 'Cause',
    'Occupation', 'Possession', 'Artwork', 'Software',
]

# Name of the single text input column.
text_features = ['Lemma_stripped']

# All model inputs: numeric columns followed by the text column.
features = [*numerical_features, *text_features]

# Name of the column holding the label to predict.
target = 'Aggr.Label'

# Split the selected feature columns and the target column into a training
# part and a 33% validation part, stratified on the target and seeded for
# repeatability.
# NOTE(review): this rebinds train_X / validation_X / train_y / validation_y,
# which the Lemma-only split earlier in the notebook also assigns (with a
# different seed).
train_X, validation_X, train_y, validation_y = train_test_split(
    labeled_data[features], labeled_data[target],
    stratify=labeled_data[target],
    random_state=1,
    test_size=0.33,
)

In [None]:
# vectorizer params
# Vectorizer search space. ParameterGrid enumerates every combination of the
# listed values; list() materialises them as a list of keyword-argument dicts.
TfidfVectorizer_params = list(ParameterGrid({
    'strip_accents': ['ascii', 'unicode', None],
    # Document-frequency ceilings 0.3, 0.5, 0.7, 0.9 and 1.0 (no ceiling).
    # The last value must be the float 1.0: an integer 1 is interpreted as an
    # absolute document count and would drop every term that occurs in more
    # than one document.
    'max_df': [round(0.1 * x, 2) for x in range(3, 11, 2)] + [1.0],
    # Vocabulary caps of 5000, 25000 and 45000 terms, or no cap.
    'max_features': [1000 * x for x in range(5, 50, 20)] + [None],
    'norm': ['l2', None],
    'use_idf': [True, False],
}))

# Classifier search space.
MultinomialNB_params = list(ParameterGrid({
    # Earlier candidate set, kept for reference: [0.01, 0.001, 0.0001]
    'alpha': [0.0015, 0.001, 0.0005],
}))