In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import collections
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import multiprocessing

In [11]:
def load_author_frame(author_name):
    """Load one author's corpus from "<author_name>.txt" in the working directory.

    The file is read with ``pd.read_table`` without a header row, its single
    column is named ``text``, and an ``author`` column holding ``author_name``
    is added to every row.

    Parameters
    ----------
    author_name : str
        Author label; also the stem of the file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``author``.
    """
    frame = pd.read_table(author_name + ".txt", header=None)
    frame.columns = ['text']
    frame['author'] = author_name
    return frame


# One frame per candidate author; the individual names are kept because the
# bigram cell further down iterates over them.
Coats = load_author_frame('Dan Coats')
Mattis = load_author_frame('James Mattis')
Kelly = load_author_frame('John Kelly')
Hassett = load_author_frame('Kevin Hassett')
Nielsen = load_author_frame('Kirstjen Nielsen')
Kudlow = load_author_frame('Larry Kudlow')
Pence = load_author_frame('Mike Pence')
Pompeo = load_author_frame('Mike Pompeo')

# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own row numbers.
train = pd.concat(
    [Coats, Mattis, Kelly, Hassett, Nielsen, Kudlow, Pence, Pompeo],
    ignore_index=True,
)

# Parallel lists: text[i] was written by author[i].
text = train['text'].tolist()
author = train['author'].tolist()

In [12]:
# Wrap every training text in a TaggedDocument whose only tag is its position in `text`.
# NOTE(review): each entry of `text` is handed over as one raw string (the preview
# output below shows words='Good evening, ...'), not as a list of tokens. Gensim
# presumably expects a token list here, so the model would be working on single
# characters - TODO confirm whether that is intended. Tokenising here would also
# require the same change where the OpEd text is vectorised.
documents = [
    TaggedDocument(words=passage, tags=[position])
    for position, passage in enumerate(text)
]

In [13]:
# Preview the first two tagged documents to check their `words` / `tags` layout.
documents[:2]

[TaggedDocument(words='Good evening, and sincere thanks to the Atlantic Council, Le Figaro, and the Tocqueville Foundation for organizing this important conversation about Democracy in the West.\xa0', tags=[0]),
 TaggedDocument(words='I enjoyed catching up with Atlantic council board member and a long-time friend Ambassador Boyden Gray and the Atlantic Council’s Executive Vice President Damon Wilson just a few minutes ago.', tags=[1])]

In [16]:
N_DIMS = 100    # length of each learned document vector
N_EPOCHS = 50   # number of training passes over the corpus

# NOTE(review): `seed` alone does not make training repeatable when more than
# one worker thread is used; the gensim documentation asks for workers=1 (plus a
# fixed PYTHONHASHSEED) for a fully deterministic run. The worker count is left
# unchanged here to keep training time as it was.
d2v = Doc2Vec(seed=1,
              workers=multiprocessing.cpu_count(),
              vector_size=N_DIMS,
              dm=0,         # use distributed bag of words
              min_count=0,  # do not drop rare tokens
              window=15,
              epochs=N_EPOCHS)

print("Building vocab...")
d2v.build_vocab(documents)
print("Training...")
d2v.train(documents, total_examples=d2v.corpus_count, epochs=d2v.epochs)
# The model is not persisted; the earlier "Saving model..." message was printed
# although no save call was made, so it has been removed.

Building vocab...
Training...


In [20]:
# Infer one vector per training document with the trained model, in the same
# order as `documents` (and therefore as `author`).
vectors = []
for tagged_doc in documents:
    vectors.append(d2v.infer_vector(tagged_doc.words))

In [24]:
from sklearn.metrics import accuracy_score

### Test the model
# Hold out a quarter of the inferred vectors and fit a linear SVM on the rest.
# NOTE(review): d2v was trained on every document before this split, so the
# held-out vectors come from texts the embedding model has already seen; the
# accuracy printed below may therefore be optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    vectors,
    author,
    test_size=0.25,
    random_state=1337,
)

svm = LinearSVC()
svm.fit(X_train, y_train)
predictions = svm.predict(X_test)

accuracy_pct = round(accuracy_score(y_test, predictions) * 100, 2)
print("The Linear SVC model is accurate: ", accuracy_pct, "% of the time.")



The Linear SVC model is accurate:  72.64 % of the time.


In [26]:
# Load the anonymous op-ed; every row of the file is classified separately.
opEd = pd.read_table("OpEd.txt", header=None)
opEd.columns = ['text']

# Vectorise the op-ed rows the same way the training texts were vectorised.
# NOTE(review): `X_test` and `predictions` from the evaluation cell are
# overwritten here; re-run that cell before reusing the held-out split.
test = opEd['text'].tolist()
test_documents = [
    TaggedDocument(words=passage, tags=[position])
    for position, passage in enumerate(test)
]
X_test = [d2v.infer_vector(tagged_doc.words) for tagged_doc in test_documents]
predictions = svm.predict(X_test)

# Tally how many rows were attributed to each author (most frequent first).
vote_counts = pd.Series(predictions, name='Author').value_counts()
predictedAuthordf = vote_counts.reset_index()
predictedAuthordf.columns = ['Author', 'Count']

# Share of rows per author and its natural log.
total_votes = predictedAuthordf['Count'].sum()
predictedAuthordf["Probability"] = predictedAuthordf["Count"] / total_votes
predictedAuthordf["logLikelihood"] = np.log(predictedAuthordf["Probability"])

# The author with the highest log-share is reported as the prediction.
prediction = predictedAuthordf['logLikelihood'].idxmax()
predictedAuthor = predictedAuthordf.at[prediction, 'Author']

print(predictedAuthordf, "\n\n")
print("The predicted author is: ", predictedAuthor)

          Author  Count  Probability  logLikelihood
0     John Kelly     11     0.407407      -0.897942
1   Larry Kudlow      8     0.296296      -1.216395
2    Mike Pompeo      5     0.185185      -1.686399
3      Dan Coats      2     0.074074      -2.602690
4  Kevin Hassett      1     0.037037      -3.295837 


The predicted author is:  John Kelly


# bigram model

In [39]:
from nltk.collocations import BigramCollocationFinder
import re
import codecs
import numpy as np
import string
 
def _char_bigram_model(lines, table):
    """Build a character-bigram frequency list from an iterable of text lines.

    Each line has its whitespace collapsed, is lower-cased and stripped of
    digits; lines that are empty at that point are skipped. Punctuation is then
    removed via ``table`` and the cleaned lines are joined with single spaces
    into one character stream.

    Parameters
    ----------
    lines : iterable of str
        Raw text lines for one author.
    table : dict
        ``str.translate`` table that deletes punctuation.

    Returns
    -------
    list of ((str, str), int)
        Adjacent character pairs with their counts, most frequent first.
    """
    cleaned = []
    for line in lines:
        line = " ".join(line.split())      # collapse runs of whitespace
        line = line.lower()                # to lower case
        line = re.sub(r"\d+", "", line)    # remove digits
        if len(line) != 0:
            cleaned.append(line.translate(table))  # remove punctuation

    # Every kept line is followed by a space; repeated spaces are then merged.
    all_str = re.sub(' +', ' ', ''.join(piece + " " for piece in cleaned))
    seq_all = list(all_str)  # all characters in their original order

    finder = BigramCollocationFinder.from_words(seq_all)
    return sorted(finder.ngram_fd.items(), key=lambda item: item[1], reverse=True)


# Train one character-bigram model per author.
# The character buffer is now built fresh for every author; previously a single
# list was shared across the loop, so each author's model also contained the
# text of every author processed before it. Loop variables are named so that
# they no longer overwrite the `text` / `author` lists from the loading cell.
models = dict()
translate_table = dict((ord(char), None) for char in string.punctuation)
for author_df in [Coats, Mattis, Kelly, Hassett, Nielsen, Kudlow, Pence, Pompeo]:
    author_name = author_df['author'].iloc[0]
    # list of [((gram1, gram2), count)]
    models[author_name] = _char_bigram_model(author_df['text'].tolist(), translate_table)

In [None]:
from nltk.collocations import BigramCollocationFinder
import re
import codecs
import numpy as np
import string
         
def test_language(path,language,total):
    """Score every line of a text file against six stored bigram models and tally hits.

    For each line, every distinct character bigram adds to a per-language
    score: ``f * 10000 / total[i]`` when the bigram is present in language
    ``i``'s model (``f`` being the stored count), otherwise ``1``. The language
    with the highest score is the prediction for that line.

    Parameters
    ----------
    path : str
        UTF-8 text file to classify line by line.
    language : str
        Expected language name; must be one of the names in ``lang_name``
        for any line to count as a true positive.
    total : sequence of numbers
        Total bigram count per language, indexed in ``lang_name`` order.

    Returns
    -------
    None
        Running and final true/false positive counts are printed.
    """
    tp = 0
    fp = 0
    lang_name = ["english","german","french","italian","dutch","spanish"]
    # NOTE(review): each "<language>.npy" is expected to iterate as (bigram, count)
    # pairs. If these were saved as object arrays, recent NumPy versions need
    # allow_pickle=True here - only enable that for files you trust. TODO confirm.
    model = [np.load(lang+".npy") for lang in lang_name]
    translate_table = dict((ord(char), None) for char in string.punctuation)

    with codecs.open(path,"r","utf-8") as filep:
        for line in filep:
            # Drop the first whitespace-separated token (presumably a line id -
            # verify against the corpus format), then normalise like training.
            line = " ".join(line.split()[1:])
            line = line.lower()
            line = re.sub(r"\d+", "", line)
            line = line.translate(translate_table)

            finder = BigramCollocationFinder.from_words(line)

            freq_sum = np.zeros(len(lang_name))
            for k in finder.ngram_fd:
                for i in range(len(lang_name)):
                    for key,f in model[i]:
                        if k == key:
                            freq_sum[i] = freq_sum[i]+(f*10000)/total[i]
                            break
                    else:
                        # Bigram absent from this language's model. The check is
                        # made per language; the old shared flag stayed set after
                        # the first hit, so later languages never got this
                        # increment.
                        freq_sum[i] = freq_sum[i] + 1

            max_val = freq_sum.max()
            index = freq_sum.argmax()
            if max_val != 0:  # all-zero scores mean the line produced no bigrams
                if lang_name[index] == language:
                    tp = tp + 1
                else:
                    fp = fp + 1
            print("tp = ",tp,"fp = ",fp,freq_sum)
    print("True Positive = ",tp)
    print("False Positive = ",fp)
               

import os

# Directory holding the per-language sentence files; paths are built with
# os.path.join so they no longer depend on Windows "\\" separators.
root = "test"
lang_name = ["english","german","french","italian","dutch","spanish"]

# Total bigram count per language, passed to test_language as its `total` argument.
no_of_bigms = []
for lang in lang_name:
    model = np.load(lang+".npy")
    total = sum(v for key, v in model)
    no_of_bigms.append(total)
    print(total)

# Sentence file for each language, in the same order as lang_name.
train_lang_path = [
    os.path.join("eng_news_2015_10K", "eng_news_2015_10K-sentences.txt"),
    os.path.join("deu_news_2015_10K", "deu_news_2015_10K-sentences.txt"),
    os.path.join("fra_news_2010_10K-text", "fra_news_2010_10K-sentences.txt"),
    os.path.join("ita_news_2010_10K-text", "ita_news_2010_10K-sentences.txt"),
    os.path.join("nld_wikipedia_2016_10K", "nld_wikipedia_2016_10K-sentences.txt"),
    os.path.join("spa_news_2011_10K", "spa_news_2011_10K-sentences.txt"),
]
for lang, rel_path in zip(lang_name, train_lang_path):
    print("Testing of ",lang)
    test_language(os.path.join(root, rel_path), lang, no_of_bigms)

John Kelly told Donald to let it go: https://www.huffingtonpost.com/entry/stephen-colbert-identity-new-york-times-op-ed-author_us_5b971851e4b0162f47302cfe