In [1]:
%cd /afs/csail.mit.edu/u/o/oliveren/meng/check-worthy
!export PYTHONPATH=.
%load_ext autoreload
%autoreload 2

DATA_PATH = "/afs/csail.mit.edu/u/o/oliveren/meng/check-worthy/data/claim-rank/transcripts_all_sources/"

/afs/csail.mit.edu/u/o/oliveren/meng/check-worthy


In [2]:
from os.path import join
from nltk.tokenize import word_tokenize
from sklearn.preprocessing.data import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from src.features.feature_sets import get_cb_pipeline

In [3]:
class Sentence:
    """One transcript sentence plus the annotations that came with it.

    The constructor stores every argument unchanged under an attribute of
    the same name and adds two derived attributes:

    * ``features`` -- starts out as an empty dict; nothing in this class
      populates it.
    * ``tokens`` -- the result of ``word_tokenize(text)``, computed once at
      construction time.
    """

    def __init__(self, id, text, label, speaker, debate, labels):
        # Caller-supplied values, kept exactly as given.
        self.id, self.text, self.label = id, text, label
        self.speaker, self.debate = speaker, debate
        # Empty container for later use by other code.
        self.features = dict()
        # Tokenise eagerly so the work is done only once per sentence.
        self.tokens = word_tokenize(text)
        self.labels = labels

In [4]:
def read_debates(debate_filename, data_path=None):
    """Parse one tab-separated transcript file into ``Sentence`` objects.

    The first line of the file is treated as a header and discarded. Each
    remaining non-blank line is stripped, split on tabs and mapped as:

        columns[0]     -> Sentence.id
        columns[1]     -> Sentence.speaker
        columns[2]     -> parsed as int; Sentence.label is 1 if > 0, else 0
        columns[3:-1]  -> Sentence.labels (left as strings)
        columns[-1]    -> Sentence.text

    Args:
        debate_filename: File name, joined onto the data directory.
        data_path: Directory containing the file. When omitted, the
            module-level ``DATA_PATH`` is used.

    Returns:
        A list of ``Sentence`` objects in file order.

    Raises:
        IndexError: A non-blank row has fewer than three columns.
        ValueError: The third column of a row is not an integer.
    """
    if data_path is None:
        data_path = DATA_PATH

    sentences = []
    # `with` closes the handle even if a row fails to parse.
    with open(join(data_path, debate_filename)) as debate_file:
        debate_file.readline()  # discard the header row
        for line in debate_file:
            line = line.strip()
            if not line:
                # Blank line (e.g. trailing newline at end of file): there
                # are no columns to index, so skip it instead of crashing.
                continue
            columns = line.split("\t")
            label = 1 if int(columns[2].strip()) > 0 else 0
            labels = columns[3:-1]
            # The debate argument is always the fixed string 'placeholder'.
            sentences.append(
                Sentence(columns[0], columns[-1], label, columns[1], 'placeholder', labels)
            )
    return sentences

In [5]:
# Parse the TSV (located under DATA_PATH) into a list of Sentence objects.
sentences = read_debates('clinton_acceptance_speech_ann.tsv')

In [6]:
# Model inputs are the raw sentence strings; targets are the matching
# per-sentence labels, in the same order.
x_data = [s.text for s in sentences]
y_data = [s.label for s in sentences]

In [7]:
# Text classification pipeline: token counts -> TF-IDF weighting -> linear
# classifier trained with stochastic gradient descent.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # loss='log' is kept because predict_proba() is called on this pipeline
    # later. The classifier shuffles the training data, so without a seed
    # every run produces a different model; it is seeded with the same value
    # used for the train/test split to make results repeatable.
    ('clf', SGDClassifier(loss='log', random_state=42)),
])

In [8]:
# Hold out 20% of the sentences for evaluation; the fixed seed keeps the
# split identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.20,
    random_state=42,
)

In [9]:
# Fit every pipeline step on the training split only. The fitted pipeline is
# the cell's last expression, so its repr is shown as the cell output.
pipeline.fit(x_train, y_train)



Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))])

In [10]:
# Predicted probabilities for each held-out sentence. Later code reads
# element [1] of each row as the check-worthy probability (presumably the
# column for label 1 -- confirm against pipeline.classes_).
results = pipeline.predict_proba(x_test)

In [11]:
import csv

# Write one row per held-out sentence: its text, the predicted probability
# taken from element [1] of the model output, and the true label.
# newline='' is the mode the csv module documents for files passed to
# csv.writer; the writer emits its own line terminators.
with open('records.tsv', 'w', newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t')
    writer.writerow(['Sentence Text', 'Check-worthy Probability', 'Actual Label'])
    # Walk the three parallel sequences together instead of indexing each
    # one by position.
    for text, probabilities, label in zip(x_test, results, y_test):
        writer.writerow([text, probabilities[1], label])

In [12]:
# Spot-check: show the first sentence of the held-out split.
print(x_test[0])

Now, I don't think President Obama and Vice President Biden get the credit they deserve for saving us from the worst economic crisis of our lifetimes.


In [13]:
# Same count -> TF-IDF front end as the SGD pipeline, with a multi-layer
# perceptron as the final estimator.
pipelineNN = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Seeded so that repeated runs train the same network; the value matches
    # the seed used for the train/test split.
    ('nn', MLPClassifier(random_state=42)),
])

In [14]:
# Fit the neural-network pipeline on the same training split used for the
# SGD pipeline. The fitted pipeline's repr is shown as the cell output.
pipelineNN.fit(x_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False))])

In [15]:
# NOTE: this rebinds `results`, replacing the SGD pipeline's probabilities
# computed in an earlier cell with the neural network's.
results = pipelineNN.predict_proba(x_test)

In [16]:
# Write one row per held-out sentence: its text, the predicted probability
# taken from element [1] of the model output, and the true label.
# Relies on `csv` having been imported by an earlier cell.
# newline='' is the mode the csv module documents for files passed to
# csv.writer; the writer emits its own line terminators.
with open('records_nn.tsv', 'w', newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t')
    writer.writerow(['Sentence Text', 'Check-worthy Probability', 'Actual Label'])
    # Walk the three parallel sequences together instead of indexing each
    # one by position.
    for text, probabilities, label in zip(x_test, results, y_test):
        writer.writerow([text, probabilities[1], label])