In [34]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
import numpy as np

# Baselines

In [32]:
# Load train and test
train = pd.read_csv('/home/michael/school/research/convote/convote_1train_dev.csv')
test = pd.read_csv('/home/michael/school/research/convote/convote_1test.csv')
print(len(train))
print(len(test))

6362
1759


## Unigrams

In [62]:
v = TfidfVectorizer(min_df=1, stop_words='english')

bow_train = train['text'].values
bow_test = test['text'].values
y_train = train['party'].values
y_test = test['party'].values

bow = v.fit(bow_train)
bow = v.fit(bow_test)

X_train = v.transform(bow_train)
X_test = v.transform(bow_test)

print(X_train.shape)
print(X_test.shape)

(6362, 15030)
(1759, 15030)


In [63]:
_, clf = nb(X_train, X_test, y_train, y_test)

In [66]:
print_top_features(v, clf, ['d', 'i','r'])

Class d
mr yield chairman gentleman speaker time amendment minutes gentlewoman balance committee energy vote california budget people ms new texas oil

Class i
mr speaker remains minutes jobs yield gentleman trade vote china time wto maryland amplify indiana long wages inquire ohio workers

Class r
chairman mr yield gentleman time speaker balance amendment minutes reserve committee madam gentlewoman energy new thank vote house ask support



In [52]:
clf

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [54]:
clf.coef_.shape

(3, 15319)

In [55]:
clf.class_count_

array([ 3183.,    26.,  3153.])

In [67]:
clf.coef_

array([[ -7.37282281, -10.43592784, -10.42524935, ..., -10.43592784,
        -10.43592784, -10.43592784],
       [ -9.55378543,  -9.62492998,  -9.62492998, ...,  -9.62492998,
         -9.62492998,  -9.62492998],
       [ -7.61797486, -10.37116572, -10.37116572, ..., -10.37116572,
        -10.37116572, -10.37116572]])

In [65]:
def print_top_features(vectorizer, clf, labels, n=20):
    """Prints features with the highest coefficient values"""
    feature_names = vectorizer.get_feature_names()
    
    for i in range(clf.coef_.shape[0]):
        print("Class {}".format(labels[i]))
        top = np.argsort(clf.coef_[i])[-1*n:]
        print(" ".join(reversed([feature_names[j] for j in top])))
        print()

## Majority class

In [14]:
print(len([y for y in y_train if y=='d']))
print(len([y for y in y_train if y=='r']))
print(len([y for y in y_train if y=='i']))
print(len(y_train))

2848
2786
26
5660


In [28]:
preds = np.asarray(['d'] * len(y_test))
acc = np.mean(preds == y_test)
acc

0.49061967026719727

## Naive Bayes

In [40]:
def nb(X_train, X_test, y_train, y_test):
    """ Trains Naive Bayes classifier
    Returns (accuracy, classifier)
    """
    
    clf = MultinomialNB()
    clf.fit(X_train, y_train)

    preds = clf.predict(X_test)
    acc = np.mean(preds == y_test)
    return acc, clf

## SVM (one-vs-the-rest classification)

In [35]:
clf = svm.LinearSVC()
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
acc = np.mean(preds == y_test)
acc

0.7151790790221717

## Bag of ngrams (up to trigrams)

In [36]:
v = TfidfVectorizer(min_df=1, ngram_range=(1,3))

bow_train = train['text'].values
bow_test = test['text'].values
y_train = train['party'].values
y_test = test['party'].values

bow = v.fit(bow_train)
bow = v.fit(bow_test)

X_train = v.transform(bow_train)
X_test = v.transform(bow_test)

print(X_train.shape)
print(X_test.shape)

(6362, 488683)
(1759, 488683)


In [38]:
nb(X_train, X_test, y_train, y_test) # too many features--need feature selection

0.65321205230244461

# Create dataset

## Training set (and +dev)

In [29]:
data_dirpath = '/home/michael/school/research/convote/convote_v1.1/data_stage_one/training_set/'

outlines = []

for fname in sorted(os.listdir(data_dirpath)):
    party = fname[-7].lower()
    
    with open(os.path.join(data_dirpath, fname)) as f:
        text = f.read()
        
    id = fname[:-4]
    
    outlines.append([id, party, text])
    
len(outlines)

5660

In [5]:
pd.DataFrame(outlines, columns=['id', 'party', 'text']).to_csv('/home/michael/school/research/convote/convote_1train.csv', index=False)

### Add dev set

In [30]:
data_dirpath = '/home/michael/school/research/convote/convote_v1.1/data_stage_one/development_set/'

for fname in sorted(os.listdir(data_dirpath)):
    party = fname[-7].lower()
    
    with open(os.path.join(data_dirpath, fname)) as f:
        text = f.read()
        
    id = fname[:-4]
    
    outlines.append([id, party, text])
    
len(outlines)

6362

In [31]:
pd.DataFrame(outlines, columns=['id', 'party', 'text']).to_csv('/home/michael/school/research/convote/convote_1train_dev.csv', index=False)

## Test set

In [7]:
data_dirpath = '/home/michael/school/research/convote/convote_v1.1/data_stage_one/test_set/'

outlines = []

for fname in sorted(os.listdir(data_dirpath)):
    party = fname[-7].lower()
    
    with open(os.path.join(data_dirpath, fname)) as f:
        text = f.read()
        
    id = fname[:-4]
    
    outlines.append([id, party, text])
    
len(outlines)

1759

In [8]:
pd.DataFrame(outlines, columns=['id', 'party', 'text']).to_csv('/home/michael/school/research/convote/convote_1test.csv', index=False)