In [2]:
# Description of the TSV format:

# Column 1: the ID of the statement ([ID].json).
# Column 2: the label.
# Column 3: the statement.
# Column 4: the subject(s).
# Column 5: the speaker.
# Column 6: the speaker's job title.
# Column 7: the state info.
# Column 8: the party affiliation.
# Column 9-13: the total credit history count, including the current statement.
# 9: barely true counts.
# 10: false counts.
# 11: half true counts.
# 12: mostly true counts.
# 13: pants on fire counts.
# Column 14: the context (venue / location of the speech or statement).
import csv
import collections
import codecs
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model
from sklearn.metrics import accuracy_score
d = collections.OrderedDict()

def get_corpus_ratings(filename):
    statements =[]
    ratings=[]
    with open(filename, encoding='utf-8') as f:
        tsvin = csv.reader(f, delimiter='\t')
        for line in tsvin:
            ratings.append(line[1])
            statements.append(line[2])
    return statements, np.array(ratings)

In [3]:
corpus, ratings=get_corpus_ratings('liar_dataset/train.tsv')

In [4]:
ratings

array(['false', 'half-true', 'mostly-true', ..., 'half-true', 'false',
       'pants-fire'], 
      dtype='<U11')

In [5]:
corpus[:5]

['Says the Annies List political group supports third-trimester abortions on demand.',
 'When did the decline of coal start? It started when natural gas took off that started to begin in (President George W.) Bushs administration.',
 'Hillary Clinton agrees with John McCain "by voting to give George Bush the benefit of the doubt on Iran."',
 'Health care reform legislation is likely to mandate free sex change surgeries.',
 'The economic turnaround started at the end of my term.']

In [6]:
vectorizer = CountVectorizer()
X=vectorizer.fit_transform(corpus)
feature_list=vectorizer.get_feature_names()
dev_corpus, dev_ratings=get_corpus_ratings("liar_dataset/valid.tsv")
vectorizer_2=CountVectorizer(vocabulary = feature_list)
X2=vectorizer_2.fit_transform(dev_corpus)
for i in [1e-3,1e-2,1e-1,1e1,1e3,1e5,1e7]:
    logreg = linear_model.LogisticRegression(C=i)
    model = logreg.fit(X,ratings)
    predicted=logreg.predict(X2)
    score=accuracy_score(dev_ratings,predicted)
    print(score, " is dev score for C=",i)

0.247663551402  is dev score for C= 0.001
0.252336448598  is dev score for C= 0.01
0.249221183801  is dev score for C= 0.1
0.244548286604  is dev score for C= 10.0
0.228971962617  is dev score for C= 1000.0
0.224299065421  is dev score for C= 100000.0
0.224299065421  is dev score for C= 10000000.0


In [7]:
test_corpus,test_ratings=get_corpus_ratings("liar_dataset/test.tsv")
logreg=linear_model.LogisticRegression(C=1e-2)
model=logreg.fit(X,ratings)
X3=vectorizer_2.fit_transform(test_corpus)
predicted=logreg.predict(X3)
score=accuracy_score(test_ratings, predicted)
print(score, "is test score for C=1e-2")

0.235201262826 is test score for C=1e-2


In [8]:
len(feature_list)

12196

In [9]:
#removing stop words
from stop_words import get_stop_words

In [10]:
#same thing but bigrams
vectorizer = CountVectorizer(ngram_range=(1, 2))
X=vectorizer.fit_transform(corpus)
feature_list=vectorizer.get_feature_names()
dev_corpus, dev_ratings=get_corpus_ratings("liar_dataset/valid.tsv")
vectorizer_2=CountVectorizer(vocabulary = feature_list)
X2=vectorizer_2.fit_transform(dev_corpus)
for i in [1e-3,1e-2,1e-1,1e1,1e3,1e5,1e7]:
    logreg = linear_model.LogisticRegression(C=i)
    model = logreg.fit(X,ratings)
    predicted=logreg.predict(X2)
    score=accuracy_score(dev_ratings,predicted)
    print(score, " is dev score for C=",i)

0.247663551402  is dev score for C= 0.001
0.256230529595  is dev score for C= 0.01
0.239096573209  is dev score for C= 0.1
0.23753894081  is dev score for C= 10.0
0.235202492212  is dev score for C= 1000.0
0.234423676012  is dev score for C= 100000.0
0.232087227414  is dev score for C= 10000000.0


In [11]:
#ngrams and remove stop words
vectorizer = CountVectorizer(stop_words=get_stop_words('en'),ngram_range=(1, 2))
X=vectorizer.fit_transform(corpus)
feature_list=vectorizer.get_feature_names()
dev_corpus, dev_ratings=get_corpus_ratings("liar_dataset/valid.tsv")
vectorizer_2=CountVectorizer(vocabulary = feature_list)
X2=vectorizer_2.fit_transform(dev_corpus)
for i in [1e-3,1e-2,1e-1,1e1,1e3,1e5,1e7]:
    logreg = linear_model.LogisticRegression(C=i)
    model = logreg.fit(X,ratings)
    predicted=logreg.predict(X2)
    score=accuracy_score(dev_ratings,predicted)
    print(score, " is dev score for C=",i)

0.201713395639  is dev score for C= 0.001
0.248442367601  is dev score for C= 0.01
0.232866043614  is dev score for C= 0.1
0.235981308411  is dev score for C= 10.0
0.249221183801  is dev score for C= 1000.0
0.249221183801  is dev score for C= 100000.0
0.25  is dev score for C= 10000000.0


In [12]:
#same thing but trigrams
vectorizer = CountVectorizer(ngram_range=(1, 3))
X=vectorizer.fit_transform(corpus)
feature_list=vectorizer.get_feature_names()
dev_corpus, dev_ratings=get_corpus_ratings("liar_dataset/valid.tsv")
vectorizer_2=CountVectorizer(vocabulary = feature_list)
X2=vectorizer_2.fit_transform(dev_corpus)
for i in [1e-3,1e-2,1e-1,1e1,1e3,1e5,1e7]:
    logreg = linear_model.LogisticRegression(C=i)
    model = logreg.fit(X,ratings)
    predicted=logreg.predict(X2)
    score=accuracy_score(dev_ratings,predicted)
    print(score, " is dev score for C=",i)

0.246105919003  is dev score for C= 0.001
0.257788161994  is dev score for C= 0.01
0.242990654206  is dev score for C= 0.1
0.238317757009  is dev score for C= 10.0
0.238317757009  is dev score for C= 1000.0
0.233644859813  is dev score for C= 100000.0
0.235981308411  is dev score for C= 10000000.0


In [13]:
#same thing but 4-grams
vectorizer = CountVectorizer(ngram_range=(1, 4))
X=vectorizer.fit_transform(corpus)
feature_list=vectorizer.get_feature_names()
dev_corpus, dev_ratings=get_corpus_ratings("liar_dataset/valid.tsv")
vectorizer_2=CountVectorizer(vocabulary = feature_list)
X2=vectorizer_2.fit_transform(dev_corpus)
for i in [1e-3,1e-2,1e-1,1e1,1e3,1e5,1e7]:
    logreg = linear_model.LogisticRegression(C=i)
    model = logreg.fit(X,ratings)
    predicted=logreg.predict(X2)
    score=accuracy_score(dev_ratings,predicted)
    print(score, " is dev score for C=",i)

0.246884735202  is dev score for C= 0.001
0.258566978193  is dev score for C= 0.01
0.244548286604  is dev score for C= 0.1
0.246105919003  is dev score for C= 10.0
0.248442367601  is dev score for C= 1000.0
0.248442367601  is dev score for C= 100000.0
0.251557632399  is dev score for C= 10000000.0


In [15]:
#same thing but remove stopwords
vectorizer = CountVectorizer(ngram_range=(1, 4),stop_words='english')
X=vectorizer.fit_transform(corpus)
feature_list=vectorizer.get_feature_names()
dev_corpus, dev_ratings=get_corpus_ratings("liar_dataset/valid.tsv")
vectorizer_2=CountVectorizer(vocabulary = feature_list)
X2=vectorizer_2.fit_transform(dev_corpus)
for i in [1e-3,1e-2,1e-1,1e1]:
    logreg = linear_model.LogisticRegression(C=i)
    model = logreg.fit(X,ratings)
    predicted=logreg.predict(X2)
    score=accuracy_score(dev_ratings,predicted)
    print(score, " is dev score for C=",i)

0.202492211838  is dev score for C= 0.001
0.244548286604  is dev score for C= 0.01
0.239096573209  is dev score for C= 0.1
0.235202492212  is dev score for C= 10.0


In [16]:
#same thing but 5-grams
vectorizer = CountVectorizer(ngram_range=(1, 5))
X=vectorizer.fit_transform(corpus)
feature_list=vectorizer.get_feature_names()
dev_corpus, dev_ratings=get_corpus_ratings("liar_dataset/valid.tsv")
vectorizer_2=CountVectorizer(vocabulary = feature_list)
X2=vectorizer_2.fit_transform(dev_corpus)
for i in [1e-3,1e-2,1e-1,1e1,1e3,1e5,1e7]:
    logreg = linear_model.LogisticRegression(C=i)
    model = logreg.fit(X,ratings)
    predicted=logreg.predict(X2)
    score=accuracy_score(dev_ratings,predicted)
    print(score, " is dev score for C=",i)

0.246884735202  is dev score for C= 0.001
0.258566978193  is dev score for C= 0.01
0.250778816199  is dev score for C= 0.1
0.246884735202  is dev score for C= 10.0
0.251557632399  is dev score for C= 1000.0
0.248442367601  is dev score for C= 100000.0
0.250778816199  is dev score for C= 10000000.0


In [18]:
for i in range(1,5):
    vectorizer = CountVectorizer(ngram_range=(1, i))
    X=vectorizer.fit_transform(corpus)
    feature_list=vectorizer.get_feature_names()
    print(len(feature_list), " is feature list len for ngram: ",i)

12196  is feature list len for ngram:  1
95883  is feature list len for ngram:  2
228132  is feature list len for ngram:  3
369863  is feature list len for ngram:  4


In [20]:
import math
for percent in [0.5,0.75,1]:
    vectorizer = CountVectorizer(ngram_range=(1, 1),max_features=math.ceil(percent*12196))
    X=vectorizer.fit_transform(corpus)
    feature_list=vectorizer.get_feature_names()
    vectorizer_2=CountVectorizer(vocabulary = feature_list)
    X2=vectorizer_2.fit_transform(dev_corpus)
    for i in [1e-3,1e-2,1e-1,1e1,1e3]:
        logreg = linear_model.LogisticRegression(C=i)
        model = logreg.fit(X,ratings)
        predicted=logreg.predict(X2)
        score=accuracy_score(dev_ratings,predicted)
        print(score, " is dev score for C=",i, " and percent=",percent)          
        

0.246884735202  is dev score for C= 0.001  and percent= 0.5
0.251557632399  is dev score for C= 0.01  and percent= 0.5
0.246884735202  is dev score for C= 0.1  and percent= 0.5
0.244548286604  is dev score for C= 10.0  and percent= 0.5
0.236760124611  is dev score for C= 1000.0  and percent= 0.5
0.246884735202  is dev score for C= 0.001  and percent= 0.75
0.252336448598  is dev score for C= 0.01  and percent= 0.75
0.25  is dev score for C= 0.1  and percent= 0.75
0.246884735202  is dev score for C= 10.0  and percent= 0.75
0.222741433022  is dev score for C= 1000.0  and percent= 0.75
0.247663551402  is dev score for C= 0.001  and percent= 1
0.252336448598  is dev score for C= 0.01  and percent= 1
0.249221183801  is dev score for C= 0.1  and percent= 1
0.244548286604  is dev score for C= 10.0  and percent= 1
0.228971962617  is dev score for C= 1000.0  and percent= 1


In [21]:
for percent in [0.5,0.75,0.8,0.9,1]:
    vectorizer = CountVectorizer(ngram_range=(1, 1),max_features=math.ceil(percent*12196))
    X=vectorizer.fit_transform(corpus)
    feature_list=vectorizer.get_feature_names()
    print(len(feature_list), " is feature list length")
    vectorizer_2=CountVectorizer(vocabulary = feature_list)
    X2=vectorizer_2.fit_transform(dev_corpus)
    for i in [1e-3,1e-2,1e-1]:
        logreg = linear_model.LogisticRegression(C=i)
        model = logreg.fit(X,ratings)
        predicted=logreg.predict(X2)
        score=accuracy_score(dev_ratings,predicted)
        print(score, " is dev score for C=",i, " and percent=",percent)          

6098  is feature list length
0.246884735202  is dev score for C= 0.001  and percent= 0.5
0.251557632399  is dev score for C= 0.01  and percent= 0.5
0.246884735202  is dev score for C= 0.1  and percent= 0.5
9147  is feature list length
0.246884735202  is dev score for C= 0.001  and percent= 0.75
0.252336448598  is dev score for C= 0.01  and percent= 0.75
0.25  is dev score for C= 0.1  and percent= 0.75
9757  is feature list length
0.247663551402  is dev score for C= 0.001  and percent= 0.8
0.252336448598  is dev score for C= 0.01  and percent= 0.8
0.248442367601  is dev score for C= 0.1  and percent= 0.8
10977  is feature list length
0.247663551402  is dev score for C= 0.001  and percent= 0.9
0.252336448598  is dev score for C= 0.01  and percent= 0.9
0.248442367601  is dev score for C= 0.1  and percent= 0.9
12196  is feature list length
0.247663551402  is dev score for C= 0.001  and percent= 1
0.252336448598  is dev score for C= 0.01  and percent= 1
0.249221183801  is dev score for C= 0

In [22]:
for percent in [0.5,0.75,0.8,0.9,1]:
    vectorizer = CountVectorizer(ngram_range=(1, 4),max_features=math.ceil(percent*369863))
    X=vectorizer.fit_transform(corpus)
    feature_list=vectorizer.get_feature_names()
    print(len(feature_list), " is feature list length")
    vectorizer_2=CountVectorizer(vocabulary = feature_list)
    X2=vectorizer_2.fit_transform(dev_corpus)
    for i in [1e-3,1e-2,1e-1]:
        logreg = linear_model.LogisticRegression(C=i)
        model = logreg.fit(X,ratings)
        predicted=logreg.predict(X2)
        score=accuracy_score(dev_ratings,predicted)
        print(score, " is dev score for C=",i, " and percent=",percent)     

184932  is feature list length
0.246105919003  is dev score for C= 0.001  and percent= 0.5
0.257009345794  is dev score for C= 0.01  and percent= 0.5
0.242990654206  is dev score for C= 0.1  and percent= 0.5
277398  is feature list length
0.246105919003  is dev score for C= 0.001  and percent= 0.75
0.259345794393  is dev score for C= 0.01  and percent= 0.75
0.242990654206  is dev score for C= 0.1  and percent= 0.75
295891  is feature list length
0.246105919003  is dev score for C= 0.001  and percent= 0.8
0.260124610592  is dev score for C= 0.01  and percent= 0.8
0.243769470405  is dev score for C= 0.1  and percent= 0.8
332877  is feature list length
0.246884735202  is dev score for C= 0.001  and percent= 0.9
0.259345794393  is dev score for C= 0.01  and percent= 0.9
0.244548286604  is dev score for C= 0.1  and percent= 0.9
369863  is feature list length
0.246884735202  is dev score for C= 0.001  and percent= 1
0.258566978193  is dev score for C= 0.01  and percent= 1
0.244548286604  is 

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
for ngram in range(1,5):
    vectorizer = TfidfVectorizer(ngram_range=(1, ngram))
    X=vectorizer.fit_transform(corpus)
    feature_list=vectorizer.get_feature_names()
    dev_corpus, dev_ratings=get_corpus_ratings("liar_dataset/valid.tsv")
    vectorizer_2=TfidfVectorizer(vocabulary = feature_list)
    X2=vectorizer_2.fit_transform(dev_corpus)
    print(ngram, " is the ngram val")
    for i in [1e-3,1e-2,1e-1,1e1,1e3,1e5,1e7]:
        logreg = linear_model.LogisticRegression(C=i)
        model = logreg.fit(X,ratings)
        predicted=logreg.predict(X2)
        score=accuracy_score(dev_ratings,predicted)
        print(score, " is tfidf dev score for C=",i)

1  is the ngram val
0.193146417445  is tfidf dev score for C= 0.001
0.195482866044  is tfidf dev score for C= 0.01
0.240654205607  is tfidf dev score for C= 0.1
0.242211838006  is tfidf dev score for C= 10.0
0.235202492212  is tfidf dev score for C= 1000.0
0.232866043614  is tfidf dev score for C= 100000.0
0.231308411215  is tfidf dev score for C= 10000000.0
2  is the ngram val
0.193146417445  is tfidf dev score for C= 0.001
0.193146417445  is tfidf dev score for C= 0.01
0.232866043614  is tfidf dev score for C= 0.1
0.245327102804  is tfidf dev score for C= 10.0
0.239096573209  is tfidf dev score for C= 1000.0
0.238317757009  is tfidf dev score for C= 100000.0
0.23753894081  is tfidf dev score for C= 10000000.0
3  is the ngram val
0.193146417445  is tfidf dev score for C= 0.001
0.193146417445  is tfidf dev score for C= 0.01
0.22507788162  is tfidf dev score for C= 0.1
0.246105919003  is tfidf dev score for C= 10.0
0.239096573209  is tfidf dev score for C= 1000.0
0.240654205607  is tfid

In [101]:
#4-grams and 5-grams have same results
#best is 4-gram C=0.01 @ .258

#run on test dataset
vectorizer = CountVectorizer(ngram_range=(1, 4))
X=vectorizer.fit_transform(corpus)
feature_list=vectorizer.get_feature_names()
dev_corpus, dev_ratings=get_corpus_ratings("liar_dataset/test.tsv")
vectorizer_2=CountVectorizer(vocabulary = feature_list)
X2=vectorizer_2.fit_transform(dev_corpus)
logreg = linear_model.LogisticRegression(C=1e-2)
model = logreg.fit(X,ratings)
predicted=logreg.predict(X2)
score=accuracy_score(dev_ratings,predicted)
print(score, " is test score for C=",1e-2)

0.231254932912  is test score for C= 0.01


In [102]:
import gzip
def getIndicesVector(text_arr, word_to_indx, max_length=50):
    nil_indx = 0
    text_indx = [ word_to_indx[x.lower().encode('utf8')] if x.lower().encode('utf8') in word_to_indx else nil_indx for x in text_arr][:max_length]
    if len(text_indx) < max_length:
        text_indx.extend( [nil_indx for _ in range(max_length - len(text_indx))])
    text_indx=np.array(text_indx)
    return text_indx

def getEmbeddingVector(filename):
    lines = []
    with gzip.open(filename) as file:
        lines = file.readlines()
        file.close()
    embedding_vector = []
    word_to_indx = {}
    for indx, l in enumerate(lines):
        word, emb = l.split()[0], l.split()[1:]
        vector = [float(x) for x in emb ]
        if indx == 0:
            embedding_vector.append( np.zeros( len(vector) ) )
        embedding_vector.append(vector)
        word_to_indx[word] = indx+1
    embedding_vector = np.array(embedding_vector)
    return embedding_vector, word_to_indx

In [103]:
embeddings, word_to_indx = getEmbeddingVector('word_vectors.txt.gz')
print(len(corpus), " is len corpus")
def corpus_to_embed(corpus):
    embeddings_corpus=[]
    for text in corpus:
        x=getIndicesVector(text.split(),word_to_indx)
        embeddings_corpus.append(x)
    embeddings_corpus=np.matrix(embeddings_corpus)
    return embeddings_corpus

10240  is len corpus


In [104]:
print('pony' in word_to_indx)
print('pony'.encode('utf-8') in word_to_indx)

False
True


In [105]:
#utf8_corpus=[x.encode('utf8') for x in corpus]
X=corpus_to_embed(corpus)
#utf8_dev_corpus=[x.encode('utf8') for x in dev_corpus]
X2=corpus_to_embed(dev_corpus)
for i in [1e-3,1e-2,1e-1,1e1,1e3]:
    logreg = linear_model.LogisticRegression(C=i)
    model = logreg.fit(X,ratings)
    predicted=logreg.predict(X2)
    score=accuracy_score(dev_ratings,predicted)
    print(score, " is embed logreg dev score for C=",i)

0.195737963694  is embed logreg dev score for C= 0.001
0.197316495659  is embed logreg dev score for C= 0.01
0.191791633781  is embed logreg dev score for C= 0.1
0.18863456985  is embed logreg dev score for C= 10.0
0.18863456985  is embed logreg dev score for C= 1000.0


In [106]:
#moving on to SVM
#start with linear, also try rbf
from sklearn.svm import SVC

for n in range(1,5):
    vectorizer = CountVectorizer(ngram_range=(1, n))
    X=vectorizer.fit_transform(corpus)
    feature_list=vectorizer.get_feature_names()
    dev_corpus, dev_ratings=get_corpus_ratings("liar_dataset/valid.tsv")
    vectorizer_2=CountVectorizer(vocabulary = feature_list)
    X2=vectorizer_2.fit_transform(dev_corpus)
    print("starting with the ",n,"-gram")
    for i in [1e-3,1e-2,1e-1,1e1,1e3,1e5,1e7]:
        svc = SVC(C=i,kernel='linear')
        model = svc.fit(X,ratings)
        predicted=svc.predict(X2)
        score=accuracy_score(dev_ratings,predicted)
        print(score, " is dev score for C=",i)
    
#best is C=0.01, 1-gram, bag of words

starting with the  1 -gram
0.193146417445  is dev score for C= 0.001
0.252336448598  is dev score for C= 0.01
0.239096573209  is dev score for C= 0.1
0.250778816199  is dev score for C= 10.0
0.250778816199  is dev score for C= 1000.0
0.250778816199  is dev score for C= 100000.0
0.250778816199  is dev score for C= 10000000.0
starting with the  2 -gram
0.193146417445  is dev score for C= 0.001
0.247663551402  is dev score for C= 0.01
0.242211838006  is dev score for C= 0.1
0.243769470405  is dev score for C= 10.0
0.243769470405  is dev score for C= 1000.0
0.243769470405  is dev score for C= 100000.0
0.243769470405  is dev score for C= 10000000.0
starting with the  3 -gram
0.193146417445  is dev score for C= 0.001
0.242990654206  is dev score for C= 0.01
0.23753894081  is dev score for C= 0.1
0.245327102804  is dev score for C= 10.0
0.245327102804  is dev score for C= 1000.0
0.245327102804  is dev score for C= 100000.0
0.245327102804  is dev score for C= 10000000.0
starting with the  4 -g

KeyboardInterrupt: 

In [108]:
#rbf kernel
for n in range(1,5):
    vectorizer = CountVectorizer(ngram_range=(1, n))
    X=vectorizer.fit_transform(corpus)
    feature_list=vectorizer.get_feature_names()
    dev_corpus, dev_ratings=get_corpus_ratings("liar_dataset/valid.tsv")
    vectorizer_2=CountVectorizer(vocabulary = feature_list)
    X2=vectorizer_2.fit_transform(dev_corpus)
    print("starting with the ",n,"-gram")
    for i in [1e-3,1e-1,1e1,1e3,1e5]:
        svc = SVC(C=i,kernel='rbf')
        model = svc.fit(X,ratings)
        predicted=svc.predict(X2)
        score=accuracy_score(dev_ratings,predicted)
        print(score, " is dev score for C=",i)
    
#best is C=0.01, 1-gram, bag of words

starting with the  1 -gram
0.193146417445  is dev score for C= 0.001
0.193146417445  is dev score for C= 0.1
0.193146417445  is dev score for C= 10.0
0.249221183801  is dev score for C= 1000.0
0.240654205607  is dev score for C= 100000.0
starting with the  2 -gram
0.193146417445  is dev score for C= 0.001
0.193146417445  is dev score for C= 0.1
0.193146417445  is dev score for C= 10.0
0.248442367601  is dev score for C= 1000.0
0.242211838006  is dev score for C= 100000.0
starting with the  3 -gram
0.193146417445  is dev score for C= 0.001
0.193146417445  is dev score for C= 0.1
0.193146417445  is dev score for C= 10.0
0.243769470405  is dev score for C= 1000.0
0.242211838006  is dev score for C= 100000.0
starting with the  4 -gram
0.193146417445  is dev score for C= 0.001
0.193146417445  is dev score for C= 0.1
0.193146417445  is dev score for C= 10.0
0.248442367601  is dev score for C= 1000.0
0.236760124611  is dev score for C= 100000.0
