In [38]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
import numpy as np
import fasttext as ft
from keras.layers import Dense, Masking, Conv1D, MaxPooling1D, Embedding, Flatten
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# Baselines

In [2]:
# Load train and test
# train = pd.read_csv('/home/michael/school/research/convote/convote_1train_dev.csv')
# test = pd.read_csv('/home/michael/school/research/convote/convote_1test.csv')
train = pd.read_csv('/usr2/mamille2/convote/convote_1train_dev.csv')
test = pd.read_csv('/usr2/mamille2/convote/convote_1test.csv')
print(len(train))
print(len(test))

6362
1759


## Unigrams

In [20]:
# Define classes order
class_idx = {'d':[1,0,0], 'i':[0,1,0], 'r':[0,0,1]}

In [39]:
v_all = TfidfVectorizer(min_df=1)
v = TfidfVectorizer(min_df=1, stop_words='english')

docs_train = train['text'].values
docs_test = test['text'].values
y_train = np.array([class_idx[y] for y in train['party'].tolist()])
y_test = np.array([class_idx[y] for y in test['party'].tolist()])

bow = v.fit(docs_train)
bow = v.fit(docs_test)
v_all.fit(docs_train)
v_all.fit(docs_test)

X_train = v.transform(docs_train)
X_test = v.transform(docs_test)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(6362, 15030)
(1759, 15030)
(6362, 3)
(1759, 3)


## CNN

In [8]:
# Load fasttext word embeddings
wembed = ft.load_model('/usr2/mamille2/discourse_connectives/en_wiki_stanford_model_300.bin')

In [7]:
# Vocab
vocab = v_all.get_feature_names()

In [13]:
# Build weights
vocab_embed = np.empty((len(vocab),300))

for i, wd in enumerate(vocab):
    vocab_embed[i,:] = wembed[wd]
    
vocab_embed.shape

(15319, 300)

In [14]:
# Save vocab pretrained
np.save('../../vocab.npy', vocab_embed)

In [4]:
# Load vocab pretrained
vocab_embed = np.load('../../vocab.npy')

In [5]:
# Initialize Keras layer with weights
def pretrained(shape, dtype=None):
    return vocab_embed # shape (vocab, 300)

### Convert texts to sequences

In [42]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(docs_train)
tokenizer.fit_on_texts(docs_test)

seqs_train = tokenizer.texts_to_sequences(docs_train)
seqs_test = tokenizer.texts_to_sequences(docs_test)

In [45]:
X_train = pad_sequences(seqs_train, maxlen=1000)
X_test = pad_sequences(seqs_test, maxlen=1000)

In [47]:
word_index = tokenizer.word_index
len(word_index) # don't know why this isn't 10k

27767

### Build classifier

In [58]:
model = Sequential()

model.add(Embedding(len(word_index) + 1, 300, input_length=1000, embeddings_initializer=pretrained, trainable=False))

model.add(Conv1D(128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=5))
model.add(Conv1D(128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=5))
# model.add(Conv1D(128, kernel_size=5, activation='relu'))
# model.add(MaxPooling1D(pool_size=35)) # large (global?) max pooling
model.add(Flatten()) # Not sure why need this
# model.add(Dense(128, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(3, activation='softmax')) # final classification layer

model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

### Train classifier

In [50]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(6362, 1000)
(6362, 3)
(1759, 1000)
(1759, 3)


In [61]:
model.fit(X_train, y_train,
#          batch_size=128, epochs=2, validation_data=(X_test, y_test))
         batch_size=16, epochs=20, validation_data=(X_test, y_test))

Train on 6362 samples, validate on 1759 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f8a670bfa90>

In [62]:
model

<keras.models.Sequential at 0x7f8a67d85278>

In [63]:
model.layers

[<keras.layers.embeddings.Embedding at 0x7f8a67d85240>,
 <keras.layers.convolutional.Conv1D at 0x7f8a67d852b0>,
 <keras.layers.pooling.MaxPooling1D at 0x7f8a67d0b198>,
 <keras.layers.convolutional.Conv1D at 0x7f8a67d0b2e8>,
 <keras.layers.pooling.MaxPooling1D at 0x7f8a67cf3550>,
 <keras.layers.core.Flatten at 0x7f8a67cf3470>,
 <keras.layers.core.Dense at 0x7f8a67c57d68>,
 <keras.layers.core.Dense at 0x7f8a67c6e6d8>]

In [64]:
model.layers[-2]

<keras.layers.core.Dense at 0x7f8a67c57d68>

In [65]:
model.layers[-2].get_config()

{'activation': 'relu',
 'activity_regularizer': None,
 'bias_constraint': None,
 'bias_initializer': {'class_name': 'Zeros', 'config': {}},
 'bias_regularizer': None,
 'kernel_constraint': None,
 'kernel_initializer': {'class_name': 'VarianceScaling',
  'config': {'distribution': 'uniform',
   'mode': 'fan_avg',
   'scale': 1.0,
   'seed': None}},
 'kernel_regularizer': None,
 'name': 'dense_21',
 'trainable': True,
 'units': 32,
 'use_bias': True}

## Majority class

In [14]:
print(len([y for y in y_train if y=='d']))
print(len([y for y in y_train if y=='r']))
print(len([y for y in y_train if y=='i']))
print(len(y_train))

2848
2786
26
5660


In [28]:
preds = np.asarray(['d'] * len(y_test))
acc = np.mean(preds == y_test)
acc

0.49061967026719727

## Naive Bayes

In [40]:
def nb(X_train, X_test, y_train, y_test):
    """ Trains Naive Bayes classifier
    Returns (accuracy, classifier)
    """
    
    clf = MultinomialNB()
    clf.fit(X_train, y_train)

    preds = clf.predict(X_test)
    acc = np.mean(preds == y_test)
    return acc, clf

In [63]:
_, clf = nb(X_train, X_test, y_train, y_test)

In [66]:
print_top_features(v, clf, ['d', 'i','r'])

Class d
mr yield chairman gentleman speaker time amendment minutes gentlewoman balance committee energy vote california budget people ms new texas oil

Class i
mr speaker remains minutes jobs yield gentleman trade vote china time wto maryland amplify indiana long wages inquire ohio workers

Class r
chairman mr yield gentleman time speaker balance amendment minutes reserve committee madam gentlewoman energy new thank vote house ask support



In [52]:
clf

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [54]:
clf.coef_.shape

(3, 15319)

In [55]:
clf.class_count_

array([ 3183.,    26.,  3153.])

In [67]:
clf.coef_

array([[ -7.37282281, -10.43592784, -10.42524935, ..., -10.43592784,
        -10.43592784, -10.43592784],
       [ -9.55378543,  -9.62492998,  -9.62492998, ...,  -9.62492998,
         -9.62492998,  -9.62492998],
       [ -7.61797486, -10.37116572, -10.37116572, ..., -10.37116572,
        -10.37116572, -10.37116572]])

In [65]:
def print_top_features(vectorizer, clf, labels, n=20):
    """Prints features with the highest coefficient values"""
    feature_names = vectorizer.get_feature_names()
    
    for i in range(clf.coef_.shape[0]):
        print("Class {}".format(labels[i]))
        top = np.argsort(clf.coef_[i])[-1*n:]
        print(" ".join(reversed([feature_names[j] for j in top])))
        print()

## SVM (one-vs-the-rest classification)

In [35]:
clf = svm.LinearSVC()
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
acc = np.mean(preds == y_test)
acc

0.7151790790221717

## Bag of ngrams (up to trigrams)

In [36]:
v = TfidfVectorizer(min_df=1, ngram_range=(1,3))

bow_train = train['text'].values
bow_test = test['text'].values
y_train = train['party'].values
y_test = test['party'].values

bow = v.fit(bow_train)
bow = v.fit(bow_test)

X_train = v.transform(bow_train)
X_test = v.transform(bow_test)

print(X_train.shape)
print(X_test.shape)

(6362, 488683)
(1759, 488683)


In [38]:
nb(X_train, X_test, y_train, y_test) # too many features--need feature selection

0.65321205230244461

# Create dataset

## Training set (and +dev)

In [29]:
data_dirpath = '/home/michael/school/research/convote/convote_v1.1/data_stage_one/training_set/'

outlines = []

for fname in sorted(os.listdir(data_dirpath)):
    party = fname[-7].lower()
    
    with open(os.path.join(data_dirpath, fname)) as f:
        text = f.read()
        
    id = fname[:-4]
    
    outlines.append([id, party, text])
    
len(outlines)

5660

In [5]:
pd.DataFrame(outlines, columns=['id', 'party', 'text']).to_csv('/home/michael/school/research/convote/convote_1train.csv', index=False)

### Add dev set

In [30]:
data_dirpath = '/home/michael/school/research/convote/convote_v1.1/data_stage_one/development_set/'

for fname in sorted(os.listdir(data_dirpath)):
    party = fname[-7].lower()
    
    with open(os.path.join(data_dirpath, fname)) as f:
        text = f.read()
        
    id = fname[:-4]
    
    outlines.append([id, party, text])
    
len(outlines)

6362

In [31]:
pd.DataFrame(outlines, columns=['id', 'party', 'text']).to_csv('/home/michael/school/research/convote/convote_1train_dev.csv', index=False)

## Test set

In [7]:
data_dirpath = '/home/michael/school/research/convote/convote_v1.1/data_stage_one/test_set/'

outlines = []

for fname in sorted(os.listdir(data_dirpath)):
    party = fname[-7].lower()
    
    with open(os.path.join(data_dirpath, fname)) as f:
        text = f.read()
        
    id = fname[:-4]
    
    outlines.append([id, party, text])
    
len(outlines)

1759

In [8]:
pd.DataFrame(outlines, columns=['id', 'party', 'text']).to_csv('/home/michael/school/research/convote/convote_1test.csv', index=False)