In [165]:
from gensim.models import KeyedVectors
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neural_network import MLPClassifier
import numpy as np

# Vectorizers with default settings: one yields TF-IDF weights, the other raw
# term counts.
tfidf_vectorizer = TfidfVectorizer()
count_vectorizer = CountVectorizer()

# Sentiment classifier - data 

In [166]:
# Nine short Danish sentences about Firefox. The first six are the training
# examples; the last three are kept aside for a manual test.
corpus = [
    # Training
    'Firefox er fantastisk',
    'Firefox: en god browser!',
    'Firefox er helt fantastisk',
    'Firefox er en dårlig browser',
    'Firefox er verdens dårligste browser',
    'Jeg vil ikke bruge Firefox fremover',
    # Manual test
    'Firefox er aldeles god',
    'Firefox er mega dårlig',
    'Jeg er en Firefox fan'
]
# One label per training sentence, in corpus order: 1 for the three favourable
# sentences, 0 for the three unfavourable ones.
y_train = [1, 1, 1, 0, 0, 0]

## Eksempel 1: Count vectorizer

In [167]:
# Fit the count vectorizer on every sentence in `corpus` (training and
# manual-test sentences alike) and build the document-term count matrix in one
# step, so all rows share a single vocabulary.
X_count = count_vectorizer.fit_transform(corpus)

In [168]:
# Print the learned vocabulary, then display the count vector of the first
# sentence.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides so the
# cell runs on both old and new versions. `.tolist()` keeps the printed form a
# plain list of strings, as before.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = count_vectorizer.get_feature_names()
print(feature_names)
X_count.toarray()[0]

['aldeles', 'browser', 'bruge', 'dårlig', 'dårligste', 'en', 'er', 'fan', 'fantastisk', 'firefox', 'fremover', 'god', 'helt', 'ikke', 'jeg', 'mega', 'verdens', 'vil']


array([0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [169]:
# Train a default MLP on the count vectors of the labelled sentences only; the
# rows after the first len(y_train) are the manual-test sentences.
# `X_count` is the matrix built by the count vectorizer (the name `X` is not
# defined anywhere in this notebook).
# NOTE: no random_state is passed, so training is unseeded and predictions may
# vary between runs.
clf = MLPClassifier()
clf.fit(X_count[:len(y_train)], y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [170]:
# Classify the three manual-test sentences (corpus rows 6-8) from their count
# vectors and print each sentence next to its predicted label.
for row, sentence in enumerate(corpus[6:9], start=6):
    print(sentence, clf.predict(X_count[row]))

Firefox er aldeles god [1]
Firefox er mega dårlig [0]
Jeg er en Firefox fan [0]


## Eksempel 2: Tf-idf vectorizer

In [171]:
# Fit the TF-IDF vectorizer on every sentence in `corpus` and build the
# weighted document-term matrix. The instance created at the top of the
# notebook is named `tfidf_vectorizer`; a bare `vectorizer` is never defined.
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

In [172]:
# Print the learned vocabulary, then display the TF-IDF vector of the first
# sentence. Uses `tfidf_vectorizer`, the instance created at the top of the
# notebook (a bare `vectorizer` is never defined).
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides. `.tolist()`
# keeps the printed form a plain list of strings, as before.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
print(feature_names)
X_tfidf.toarray()[0]

['aldeles', 'browser', 'bruge', 'dårlig', 'dårligste', 'en', 'er', 'fan', 'fantastisk', 'firefox', 'fremover', 'god', 'helt', 'ikke', 'jeg', 'mega', 'verdens', 'vil']


array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.45105358, 0.        , 0.81274991, 0.36876585,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        ])

In [173]:
# Fresh default MLP, fitted on the TF-IDF rows of the labelled sentences only
# (one row per entry in y_train); the remaining rows are the manual-test set.
clf = MLPClassifier()
clf.fit(X_tfidf[:len(y_train)], y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [174]:
# Classify the three manual-test sentences (corpus rows 6-8) from their TF-IDF
# vectors and print each sentence next to its predicted label.
for row, sentence in enumerate(corpus[6:9], start=6):
    print(sentence, clf.predict(X_tfidf[row]))

Firefox er aldeles god [1]
Firefox er mega dårlig [0]
Jeg er en Firefox fan [0]


## Eksempel 3: Word2vec vectorizer

In [175]:
# Load word vectors from the word2vec-format file 'wiki.da.vec'. The path is
# relative, so the file must be present in the notebook's working directory.
# NOTE(review): presumably the pre-trained Danish Wikipedia fastText vectors —
# confirm the source and record it here.
da_model = KeyedVectors.load_word2vec_format('wiki.da.vec')

In [176]:
# Represent each sentence by the mean of the word vectors of its tokens.
X_word2vec = []

for line in corpus:
    # Trim punctuation glued to a token ('Firefox:', 'browser!') so the bare
    # word can be looked up instead of being silently dropped.
    tokens = [word.strip('.,:;!?') for word in line.split(' ')]
    # Tokens the model does not know are skipped.
    known_vectors = [da_model[word] for word in tokens if word in da_model]
    if known_vectors:
        linevec = np.mean(known_vectors, axis=0)
    else:
        # np.mean([]) would yield a NaN scalar and break the feature matrix;
        # fall back to a zero vector of the model's dimensionality.
        linevec = np.zeros(da_model.vector_size)
    X_word2vec.append(linevec)
        

In [177]:
# Display the averaged word vector computed for the first sentence in `corpus`.
X_word2vec[0]

array([-3.08735013e-01, -7.25400001e-02, -8.39378461e-02,  4.52099927e-03,
       -2.31219977e-01, -5.64030036e-02, -2.98079997e-01,  1.69962998e-02,
        1.60550028e-02, -3.46094966e-02,  1.83789998e-01,  2.31804997e-01,
        1.61355004e-01,  1.63069993e-01,  1.11123502e-01,  1.61659978e-02,
        3.02764997e-02,  2.44269997e-01, -2.70865001e-02,  2.43214995e-01,
       -1.01605006e-01,  1.48843005e-01, -6.06140010e-02, -1.12481996e-01,
       -1.27221003e-01, -7.65419975e-02, -1.76624998e-01, -1.05234504e-01,
       -3.11760008e-01,  1.41272500e-01, -1.77239999e-01, -1.60480708e-01,
        6.12603500e-02,  2.10948497e-01, -6.74699992e-02,  8.85615498e-02,
        1.52541012e-01,  7.38050044e-02,  1.13879003e-01, -1.00872047e-01,
       -3.31084989e-02,  1.60662502e-01,  2.14050002e-02, -2.42480002e-02,
       -8.79020020e-02,  6.10705018e-02,  3.17800015e-01,  2.58929998e-01,
       -1.10495001e-01, -3.50109994e-01, -1.19749501e-01, -1.33529991e-01,
        1.36190001e-02, -

In [178]:
# Fresh default MLP, fitted on the averaged word vectors of the labelled
# sentences only (one vector per entry in y_train).
clf = MLPClassifier()
clf.fit(X_word2vec[:len(y_train)], y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [179]:
# Classify the three manual-test sentences (corpus rows 6-8). Each averaged
# vector is 1-D, so it is reshaped to a single-row 2-D array before predict().
for row, sentence in enumerate(corpus[6:9], start=6):
    features = X_word2vec[row].reshape(1, -1)
    print(sentence, clf.predict(features))

Firefox er aldeles god [1]
Firefox er mega dårlig [0]
Jeg er en Firefox fan [1]
