In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.feature_selection import chi2, SelectKBest
import numpy as np
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm_notebook as tqdm

# Experiment with DocuScope features

In [2]:
# Load train and test
train = pd.read_csv('/home/michael/school/research/convote/convote_1train_dev.csv', keep_default_na=False)
test = pd.read_csv('/home/michael/school/research/convote/convote_1test.csv', keep_default_na=False)

print(len(train))
print(len(test))

6362
1759


## Filter out non-stance-related DocuScope features

In [70]:
# Load hierarchy
desc_file = '/home/michael/school/research/docuscope/DocuScope 4.06.01 (2017.11.05)/dicts/en/default/_tones.txt'

hierarchy = {}

with open(desc_file) as f:
    cluster = ''
    dimension = ''
    
    # counters
    cluster_ctr = 0
    dimension_ctr = 0
    lat_ctr = 0
    
    for l in f.read().splitlines():
        typ, name = l.split(': ')
        
        if typ == 'CLUSTER':
            cluster = name
            if not name in hierarchy:
                hierarchy[name] = {}
            cluster_ctr += 1
            
        elif typ == 'DIMENSION':
            dimension = name
            if not name in hierarchy[cluster]:
                hierarchy[cluster][dimension] = []
            dimension_ctr += 1
            
        elif typ == 'LAT':
            hierarchy[cluster][dimension].append(name)
            lat_ctr += 1
            
print(cluster_ctr)
print(dimension_ctr)
print(lat_ctr)

26
173
19283


In [71]:
hierarchy.keys()

dict_keys(['Academic', 'Character', 'Citation', 'ConfidenceHedged', 'ConfidenceHigh', 'ConfidenceLow', 'Contingent', 'Description', 'Facilitate', 'Forceful', 'FirstPerson', 'Future', 'Information', 'Inquiry', 'Interactive', 'Metadiscourse', 'Narrative', 'Negative', 'Positive', 'Public', 'Reasoning', 'Strategic', 'SyntacticComplexity', 'Uncertainty', 'Updates', 'Orphaned'])

In [58]:
hierarchy.keys()

dict_keys(['Academic', 'Character', 'Citation', 'ConfidenceHedged', 'ConfidenceHigh', 'ConfidenceLow', 'Contingent', 'Description', 'Facilitate', 'Forceful', 'FirstPerson', 'Future', 'Information', 'Inquiry', 'Interactive', 'Metadiscourse', 'Narrative', 'Negative', 'Positive', 'Public', 'Reasoning', 'Strategic', 'SyntacticComplexity', 'Uncertainty', 'Updates', 'Orphaned'])

In [72]:
clusters_exclude = ['Description', 'Information', 'Narrative', 'SyntacticComplexity', 'Updates', 'Orphaned']

In [75]:
dims_exclude = [hierarchy[c] for c in clusters_exclude]
dims_exclude = [x for dims in dims_exclude for x in dims]
len(dims_exclude)

43

In [77]:
lats_exclude = []

for c in clusters_exclude:
    for d in hierarchy[c]:
        lats_exclude.extend(hierarchy[c][d])
        
len(lats_exclude)

6605

## Just DocuScope features

In [27]:
v = TfidfVectorizer(min_df=1, stop_words='english')

feats_train = train['lat_str'] + train['dim_str'] + train['cluster_str']
feats_test = test['lat_str'] + test['dim_str'] + test['cluster_str']

bow_train = feats_train.values
bow_test = feats_test.values

y_train = train['party'].values
y_test = test['party'].values

bow = v.fit(bow_train)
bow = v.fit(bow_test)

X_train = v.transform(bow_train)
X_test = v.transform(bow_test)

print(X_train.shape)
print(X_test.shape)

(6362, 5677)
(1759, 5677)


In [30]:
for c in ['nb', 'svm']:
    acc, _ = classify(X_train, X_test, y_train, y_test, classifier=c)
    print('{}: {}'.format(c,acc))

nb: 0.572484366117112
svm: 0.6361569073337123


## Unigrams + DocuScope

In [99]:
v = TfidfVectorizer(min_df=1, stop_words='english')

# feats_train = train['text'] + train['lat_str'] + train['dim_str'] + train['cluster_str']
# feats_test = test['text'] + test['lat_str'] + test['dim_str'] + test['cluster_str']

# feats_train = train['text'] + train['lat_restr'] + train['dim_restr'] + train['cluster_restr']
# feats_test = test['text'] + test['lat_restr'] + test['dim_restr'] + test['cluster_restr']

feats_train = train['text'] + train['lat_restr']
feats_test = test['text'] + test['lat_restr']

bow_train = feats_train.values
bow_test = feats_test.values
y_train = train['party'].values
y_test = test['party'].values

bow = v.fit(bow_train)
bow = v.fit(bow_test)

X_train = v.transform(bow_train)
X_test = v.transform(bow_test)

print(X_train.shape)
print(X_test.shape)

(6362, 18449)
(1759, 18449)


In [101]:
# Feature selection
selectors = {}
# for i in [1000, 2000, 5000, 10000, 'all']:
for i in [10000]:
    selector = SelectKBest(chi2, k=i).fit(X_train, y_train)
    X_train_reduced = selector.transform(X_train)
    X_test_reduced = selector.transform(X_test)

    print(i, end='\t')
    
#     for c in ['nb', 'svm']:
#     for c in ['nb']:
    for c in ['svm']:
        acc, clf = classify(X_train_reduced, X_test_reduced, y_train, y_test, classifier=c)
        print('{}: {}'.format(c,acc), end='\t')
        
    print()

10000	svm: 0.7151790790221717	


In [104]:
print_top_features(v, clf, clf.classes_)

Class d
busiest hematopoietic engineered imam charcognitivestatesdiverted breath cagnoli misfortune matches colors distrust milhorn frustrations dynes mirage charcognitivestatesdeliberateconsciouswillingly belts helpless hilton broke

Class i
acadcarsintromove3nichedescriberesearch disasters mammals dred considerable mandates charcognitivestatesawarenesslackawareness eminent kissimmee icsi appropriators misdemeanor apologized indentured forceignite merry gentle liz mere mcdermott

Class r
laden mistakes 1798 interquestionhowmany distribution drew mete intrusions fooled constraints chartypesobese cosponsoring kind crammed bedridden harness grabbed futureindefinitegeneration inqcuriositystudy 497



In [15]:
def most_informative_feature_for_class(vectorizer, classifier, classlabel, n=10):
    labelid = list(classifier.classes_).index(classlabel)
    feature_names = vectorizer.get_feature_names()
    topn = sorted(zip(classifier.coef_[labelid], feature_names))[-n:]

    for coef, feat in topn:
        print(classlabel, feat, coef)

In [16]:
for l in clf.classes_:
    most_informative_feature_for_class(v, clf, l, n=10)

d gorge 1.71833725563
d islamists 1.72706757969
d delegates 1.72765021448
d descriptmotionshandedbeing 1.74106707485
d blacks 1.75651952758
d boxes 1.78952963044
d charcognitivestatesmakesenseof 2.11793251342
d healing 2.22013273602
d drinking 2.48694784324
d booth 3.06154917367
i irvine 0.612978561987
i charcognitivestatesdeliberateconsciousvoluntarily 0.637849128751
i disagree 0.67470144268
i eighteenth 0.72428827712
i descriptspacerelationisolateremote 0.765414163687
i abrogation 0.876000092233
i decals 0.876000092233
i inspired 0.887199577573
i descriptmotionsflow 1.00442246093
i earmarked 1.31770830546
r chartypesmobster 1.52840648643
r faulty 1.53907594127
r 172 1.61020523923
r disagrees 1.66548417965
r detrimental 1.69184158738
r intricate 1.7513060369
r connecticut 1.75819087165
r induced 1.77223195851
r jacobs 1.89537628173
r inforeportomitleaveout 2.35190006864


In [3]:
def print_top_features(vectorizer, clf, labels, n=20):
    """Prints features with the highest coefficient values"""
    feature_names = vectorizer.get_feature_names()
    
    for i in range(clf.coef_.shape[0]):
        print("Class {}".format(labels[i]))
        top = np.argsort(clf.coef_[i])[-1*n:]
        print(" ".join(reversed([feature_names[j] for j in top])))
        print()

## Generic classifier (takes Naive Bayes, SVM, eg)

In [2]:
def classify(X_train, X_test, y_train, y_test, classifier='nb'):
    """ Trains classifiers
    Args:
        classifier: {'nb', 'svm'}
    
    Returns (accuracy, classifier)
    """
    
    if classifier == 'nb':
        clf = MultinomialNB()
        
    elif classifier == 'svm':
        clf = svm.LinearSVC()
        
    clf.fit(X_train, y_train)

    preds = clf.predict(X_test)
    acc = np.mean(preds == y_test)
    return acc, clf

# Adding in DocuScope features

In [88]:
# Get Docuscope features per document

for fold in ['training', 'development', 'test']:
    print(fold)
    docuscope_dirpath = '/home/michael/school/research/convote/convote_docuscope/_{}_set'.format(fold)
    
    ds_feats = [] # List of features to make a dataframe, will merge with convote text
    
    for fname in tqdm(sorted(os.listdir(docuscope_dirpath))):
        fpath = os.path.join(docuscope_dirpath, fname)

        data = pd.read_table(fpath, names=['token', 'lat', 'dimension', 'cluster'])

        lats = [l for l in data['lat'].tolist() if isinstance(l, str)]
        dims = [l for l in data['dimension'].tolist() if isinstance(l, str)]
        clusters = [l for l in data['cluster'].tolist() if isinstance(l, str)]

        lats_ex = [l for l in lats if not l in lats_exclude]
        dims_ex = [l for l in dims if not l in dims_exclude]
        clusters_ex = [l for l in clusters if not l in clusters_exclude]
        
        ds_feats.append([fname[:-4], ' '.join(lats), ' '.join(dims), ' '.join(clusters),
                        ' '.join(lats_ex), ' '.join(dims_ex), ' '.join(clusters_ex)])

    df = pd.read_csv('/home/michael/school/research/convote/convote_1{}_text.csv'.format(fold))

    # Merge in LATs as a string
    lat_df = pd.DataFrame(ds_feats, columns=['id', 'lat_str', 'dim_str', 'cluster_str', 
                                             'lat_restr', 'dim_restr', 'cluster_restr'])
    merged = pd.merge(df, lat_df)
    print(len(df))
    print(len(merged))

    merged.to_csv('/home/michael/school/research/convote/convote_1{}.csv'.format(fold), index=False)
    print()

training

5660
5660
development

702
702
test

1759
1759


In [91]:
# Make train+dev

train = pd.read_csv('/home/michael/school/research/convote/convote_1training.csv')
dev = pd.read_csv('/home/michael/school/research/convote/convote_1development.csv')

train_dev = pd.concat([train, dev])
print(len(train_dev))
print(train_dev.columns)

train_dev.to_csv('/home/michael/school/research/convote/convote_1train_dev.csv', index=False)

6362
Index(['id', 'party', 'text', 'lat_str', 'dim_str', 'cluster_str', 'lat_restr',
       'dim_restr', 'cluster_restr'],
      dtype='object')


# Baselines

In [3]:
# Load train and test
train = pd.read_csv('/home/michael/school/research/convote/convote_1train_dev.csv')
test = pd.read_csv('/home/michael/school/research/convote/convote_1test.csv')
print(len(train))
print(len(test))

6362
1759


## Unigrams

In [7]:
# Add in lemmatizer
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

v = TfidfVectorizer(min_df=1, stop_words='english', tokenizer=LemmaTokenizer())

bow_train = train['text'].values
bow_test = test['text'].values
y_train = train['party'].values
y_test = test['party'].values

bow = v.fit(bow_train)
bow = v.fit(bow_test)

X_train = v.transform(bow_train)
X_test = v.transform(bow_test)

print(X_train.shape)
print(X_test.shape)

(6362, 14702)
(1759, 14702)


In [8]:
# Feature selection
selectors = {}
for i in [1000, 2000, 5000, 10000, 'all']:
# for i in [10000]:
    selector = SelectKBest(chi2, k=i).fit(X_train, y_train)
    X_train_reduced = selector.transform(X_train)
    X_test_reduced = selector.transform(X_test)

    print(i, end='\t')
    
    for c in ['nb', 'svm']:
#     for c in ['nb']:
#     for c in ['svm']:
        acc, clf = classify(X_train_reduced, X_test_reduced, y_train, y_test, classifier=c)
        print('{}: {}'.format(c,acc), end='\t')
        
    print()

1000	nb: 0.6554860716316089	svm: 0.6810687890847072	
2000	nb: 0.671404206935759	svm: 0.6861853325753269	
5000	nb: 0.6895963615690733	svm: 0.6986924388857305	
10000	nb: 0.686753837407618	svm: 0.7043774872086412	
all	nb: 0.6839113132461626	svm: 0.7066515065378056	


In [66]:
print_top_features(v, clf, ['d', 'i','r'])

Class d
mr yield chairman gentleman speaker time amendment minutes gentlewoman balance committee energy vote california budget people ms new texas oil

Class i
mr speaker remains minutes jobs yield gentleman trade vote china time wto maryland amplify indiana long wages inquire ohio workers

Class r
chairman mr yield gentleman time speaker balance amendment minutes reserve committee madam gentlewoman energy new thank vote house ask support



In [52]:
clf

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [54]:
clf.coef_.shape

(3, 15319)

In [55]:
clf.class_count_

array([ 3183.,    26.,  3153.])

In [67]:
clf.coef_

array([[ -7.37282281, -10.43592784, -10.42524935, ..., -10.43592784,
        -10.43592784, -10.43592784],
       [ -9.55378543,  -9.62492998,  -9.62492998, ...,  -9.62492998,
         -9.62492998,  -9.62492998],
       [ -7.61797486, -10.37116572, -10.37116572, ..., -10.37116572,
        -10.37116572, -10.37116572]])

In [65]:
def print_top_features(vectorizer, clf, labels, n=20):
    """Prints features with the highest coefficient values"""
    feature_names = vectorizer.get_feature_names()
    
    for i in range(clf.coef_.shape[0]):
        print("Class {}".format(labels[i]))
        top = np.argsort(clf.coef_[i])[-1*n:]
        print(" ".join(reversed([feature_names[j] for j in top])))
        print()

## Majority class

In [14]:
print(len([y for y in y_train if y=='d']))
print(len([y for y in y_train if y=='r']))
print(len([y for y in y_train if y=='i']))
print(len(y_train))

2848
2786
26
5660


In [28]:
preds = np.asarray(['d'] * len(y_test))
acc = np.mean(preds == y_test)
acc

0.49061967026719727

## Naive Bayes

In [40]:
def nb(X_train, X_test, y_train, y_test):
    """ Trains Naive Bayes classifier
    Returns (accuracy, classifier)
    """
    
    clf = MultinomialNB()
    clf.fit(X_train, y_train)

    preds = clf.predict(X_test)
    acc = np.mean(preds == y_test)
    return acc, clf

## SVM (one-vs-the-rest classification)

In [35]:
clf = svm.LinearSVC()
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
acc = np.mean(preds == y_test)
acc

0.7151790790221717

## Bag of ngrams (up to trigrams)

In [36]:
v = TfidfVectorizer(min_df=1, ngram_range=(1,3))

bow_train = train['text'].values
bow_test = test['text'].values
y_train = train['party'].values
y_test = test['party'].values

bow = v.fit(bow_train)
bow = v.fit(bow_test)

X_train = v.transform(bow_train)
X_test = v.transform(bow_test)

print(X_train.shape)
print(X_test.shape)

(6362, 488683)
(1759, 488683)


In [38]:
nb(X_train, X_test, y_train, y_test) # too many features--need feature selection

0.65321205230244461

# Create dataset of unigrams

## Training set (and +dev)

In [15]:
data_dirpath = '/home/michael/school/research/convote/convote_v1.1/data_stage_one/training_set/'

outlines = []

for fname in sorted(os.listdir(data_dirpath)):
    party = fname[-7].lower()
    
    with open(os.path.join(data_dirpath, fname)) as f:
        text = f.read()
        
    id = fname[:-4]
    
    outlines.append([id, party, text])
    
len(outlines)

5660

In [16]:
pd.DataFrame(outlines, columns=['id', 'party', 'text']).to_csv('/home/michael/school/research/convote/convote_1training_text.csv', index=False)

### Add dev set

In [5]:
data_dirpath = '/home/michael/school/research/convote/convote_v1.1/data_stage_one/development_set/'

for fname in sorted(os.listdir(data_dirpath)):
    party = fname[-7].lower()
    
    with open(os.path.join(data_dirpath, fname)) as f:
        text = f.read()
        
    id = fname[:-4]
    
    outlines.append([id, party, text])
    
len(outlines)

6362

In [6]:
pd.DataFrame(outlines, columns=['id', 'party', 'text']).to_csv('/home/michael/school/research/convote/convote_1train_dev_text.csv', index=False)

### Add dev set separately

In [12]:
data_dirpath = '/home/michael/school/research/convote/convote_v1.1/data_stage_one/development_set/'

outlines = []

for fname in sorted(os.listdir(data_dirpath)):
    party = fname[-7].lower()
    
    with open(os.path.join(data_dirpath, fname)) as f:
        text = f.read()
        
    id = fname[:-4]
    
    outlines.append([id, party, text])
    
len(outlines)

702

In [13]:
pd.DataFrame(outlines, columns=['id', 'party', 'text']).to_csv('/home/michael/school/research/convote/convote_1development_text.csv', index=False)

## Test set

In [9]:
data_dirpath = '/home/michael/school/research/convote/convote_v1.1/data_stage_one/test_set/'

outlines = []

for fname in sorted(os.listdir(data_dirpath)):
    party = fname[-7].lower()
    
    with open(os.path.join(data_dirpath, fname)) as f:
        text = f.read()
        
    id = fname[:-4]
    
    outlines.append([id, party, text])
    
len(outlines)

1759

In [10]:
pd.DataFrame(outlines, columns=['id', 'party', 'text']).to_csv('/home/michael/school/research/convote/convote_1test_text.csv', index=False)