In [2]:
#import packages
import markovify #https://github.com/jsvine/markovify
import pandas as pd

In [3]:
#override part of the markovify Text class to remove punctuation and numbers, and to change all words to lower case
import nltk
import re

class ModifiedText(markovify.Text):
    """markovify.Text variant with a custom tokenizer.

    A sentence is cut at every non-word character and at every digit, the
    empty fragments produced by consecutive separators are discarded, and the
    surviving tokens are lower-cased. The underscore counts as a word
    character for ``\\W`` and is therefore kept.
    """

    # Earlier, narrower pattern kept for reference:
    # word_split_pattern = re.compile(r"\s+|“+|”+|‘+|’+|'+|,")
    word_split_pattern = re.compile(r"\W|\d")

    def word_split(self, sentence):
        """
        Split a sentence into a list of lower-cased words.
        """
        fragments = self.word_split_pattern.split(sentence)
        # Skip the empty strings that appear between adjacent separators.
        return [fragment.lower() for fragment in fragments if fragment]

In [4]:
#create markov chain in the form of nested dictionaries: {(n-gram):{next_word:count}}
#state size = n in n-gram
#create transition matrix dataframe from chain
def create_MV_df(filename,state_size = 1, remove_punc = False):
    """Build a Markov chain from a text file and return it as a DataFrame.

    Parameters
    ----------
    filename : str
        Path of the text file to model.
    state_size : int, default 1
        Number of words per chain state (the n of the n-gram).
    remove_punc : bool, default False
        When true the text is modelled with ``ModifiedText``; otherwise with
        the stock ``markovify.Text``.

    Returns
    -------
    pandas.DataFrame
        ``pd.DataFrame.from_dict`` applied to the chain's nested
        ``{state: {next_word: count}}`` mapping, so states become columns,
        next words become rows, and unseen transitions are NaN.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(filename) as f:
        text = f.read()

    model_cls = ModifiedText if remove_punc else markovify.Text
    # Honour the requested state size for both model classes (the plain
    # markovify.Text branch previously always used state_size=1).
    text_model = model_cls(text, state_size=state_size)

    return pd.DataFrame.from_dict(text_model.chain.model)

In [27]:
def make_sentence(filename,state_size = 2, num_tries = 10, num_sentences = 3,max_overlap_ratio = .7, num_char = None, remove_punc = False):
    """Fit a Markov model on a text file and print generated sentences.

    Parameters
    ----------
    filename : str
        Path of the text file to model.
    state_size : int, default 2
        Number of words per chain state.
    num_tries : int, default 10
        Passed to markovify as ``tries`` for every generated sentence.
    num_sentences : int, default 3
        How many sentences to print.
    max_overlap_ratio : float, default 0.7
        Passed to markovify as ``max_overlap_ratio``.
    num_char : int or None, default None
        When truthy, sentences are produced with ``make_short_sentence`` using
        this value as the character limit; otherwise ``make_sentence`` is used.
    remove_punc : bool, default False
        When true the text is modelled with ``ModifiedText``; otherwise with
        the stock ``markovify.Text``.

    Returns
    -------
    None
        Each result is printed as it is generated; whatever markovify returns
        is printed as-is.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(filename) as f:
        text = f.read()

    model_cls = ModifiedText if remove_punc else markovify.Text
    text_model = model_cls(text, state_size=state_size)

    # Use the same generation settings for both branches; the short-sentence
    # branch previously ignored num_tries and max_overlap_ratio.
    generation_options = {"tries": num_tries, "max_overlap_ratio": max_overlap_ratio}

    for _ in range(num_sentences):
        if num_char:
            sentence = text_model.make_short_sentence(num_char, **generation_options)
        else:
            sentence = text_model.make_sentence(**generation_options)
        print(sentence)

### Make Sentences using Chains

In [34]:
# Breitbart corpus: three-word states, up to 30 attempts per sentence.
make_sentence(
    'data/breitbart_pol_feb_20_26.txt',
    state_size=3,
    num_tries=30,
)

For example, former FERC Commissioner Phillip Moeller left the commission to be a field representative for him in St. Charles and St. Louis.”
“I have to admit; it was a great opportunity to work with what I consider one of the most prominent of the civil rights organizations.
At the same time, the official said the State Department would approve or disapprove the sale.


In [20]:
# Fox corpus: two-word states, character limit of 140 per sentence.
make_sentence(
    'data/fox_pol_feb_20_26.txt',
    state_size=2,
    num_tries=20,
    num_char=140,
)

Mind you that made it its base for special-forces operations.
Even if he follows tradition, Trump will decide to completely shake up the EPA could come sooner rather than later.
Slapping restraints on the principle of religious tolerance.


In [24]:
# NYT corpus: two-word states, character limit of 140 per sentence.
make_sentence(
    'data/nyt_pol_feb_20_26.txt',
    state_size=2,
    num_tries=20,
    num_char=140,
)

“We appreciate that President Barack Obama.
For all his talk of creating a framework for this,” he said.
As Mr. Ryan and Representative Kevin Brady of Texas, chairman of Breitbart News, a website that has kept Europe at peace.


In [22]:
# HuffPost corpus: two-word states, character limit of 140 per sentence.
make_sentence(
    'data/huffpost_pol_feb_20_26.txt',
    state_size=2,
    num_tries=20,
    num_char=140,
)

Ivanka Trump, who converted to Judaism, tweeted about the so-called “border adjustment tax.”
With a clear and all societal expectations.
Poole had adapted a type of detention could be charged.


## Find Distance Between MCs

In [241]:
# Breitbart transition table: one-word states, ModifiedText tokenizer.
MC_bb = create_MV_df(
    'data/breitbart_pol_feb_20_26.txt',
    state_size=1,
    remove_punc=True,
)

In [242]:
# Fox transition table: one-word states, ModifiedText tokenizer.
MC_fox = create_MV_df(
    'data/fox_pol_feb_20_26.txt',
    state_size=1,
    remove_punc=True,
)

In [243]:
# NYT transition table: one-word states, ModifiedText tokenizer.
MC_nyt = create_MV_df(
    'data/nyt_pol_feb_20_26.txt',
    state_size=1,
    remove_punc=True,
)

In [258]:
# HuffPost transition table: one-word states, ModifiedText tokenizer.
MC_hp = create_MV_df(
    'data/huffpost_pol_feb_20_26.txt',
    state_size=1,
    remove_punc=True,
)

In [259]:
#for state = 2
#chain[('in', 'the')]
#trans_matrix['in']['the'][trans_matrix['in']['the'].notnull()]

In [260]:
# One "(rows, columns)" line per table, in the order Breitbart, Fox, NYT, HuffPost.
print(MC_bb.shape, MC_fox.shape, MC_nyt.shape, MC_hp.shape, sep="\n")

(5093, 5093)
(5031, 5031)
(5401, 5401)
(5556, 5556)


In [261]:
# The four per-source transition tables, in a fixed order.
MC_list = [
    MC_bb,
    MC_fox,
    MC_nyt,
    MC_hp,
]

In [262]:
# Stack the tables row-wise; the outer row-index level ('1'..'4') identifies
# the table each row came from.
x = pd.concat(
    MC_list,
    keys=['1', '2', '3', '4'],
    axis=0,
)

In [263]:
# Place the tables side by side; the outer column level ('1'..'4') identifies
# the table each column came from.
y = pd.concat(
    MC_list,
    keys=['1', '2', '3', '4'],
    axis=1,
)

In [264]:
# Pull each source's rows back out of the row-wise stack `x`.
# `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0; selecting
# an outer-level key this way returns that group's rows.
x1 = x.loc['1']
x2 = x.loc['2']
x3 = x.loc['3']
x4 = x.loc['4']

In [265]:
# Pull each source's columns back out of the column-wise stack `y`.
y1, y2, y3, y4 = (y[source_key] for source_key in ('1', '2', '3', '4'))

In [268]:
# Expand every source table to the same rows and columns so the matrices can
# be compared element by element.
# x1..x4 already carry every column of `x`; reindexing their rows to `y.index`
# (the row labels of the column-wise stack) adds the missing rows as NaN.
# Using one explicit index for all four guarantees an identical row order,
# instead of relying on how each separate concat happens to order its union.
full_bb = x1.reindex(index=y.index)
full_fox = x2.reindex(index=y.index)
full_nyt = x3.reindex(index=y.index)
full_hp = x4.reindex(index=y.index)

In [95]:
import numpy as np

In [96]:
def df_to_mat(df):
    """Convert a DataFrame to a NumPy array with missing values set to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to convert; it is not modified.

    Returns
    -------
    numpy.ndarray
        The values of ``df`` after ``fillna(0)``.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
    # ndarray and is available in old and new pandas alike.
    return df.fillna(0).values

In [97]:
def get_prob_mat(mat):
    """Divide every entry of ``mat`` by the grand total of all its entries.

    The total is obtained by summing along axis 0 first and then summing the
    resulting column totals. The input is not modified.
    """
    column_totals = np.sum(mat, axis=0)
    grand_total = np.sum(column_totals)
    return mat / grand_total

In [269]:
# Zero-filled count matrices, one per source, in the order Breitbart, Fox, NYT, HuffPost.
bb_mat, fox_mat, nyt_mat, hp_mat = map(
    df_to_mat,
    (full_bb, full_fox, full_nyt, full_hp),
)

In [270]:
# Count matrices rescaled by their grand totals, in the same source order.
bb_mat_prob, fox_mat_prob, nyt_mat_prob, hp_mat_prob = map(
    get_prob_mat,
    (bb_mat, fox_mat, nyt_mat, hp_mat),
)

In [316]:
def distance(a,b,norm = 'fro',before=True):
    """Measure how far apart two matrices are under a matrix norm.

    ``norm`` is forwarded to ``np.linalg.norm`` as its second argument.
    When ``before`` is truthy the matrices are subtracted first and the norm
    of the difference is returned; otherwise each matrix's norm is taken
    separately and the absolute difference of the two norms is returned.
    """
    if not before:
        norm_of_a = np.linalg.norm(a, norm)
        norm_of_b = np.linalg.norm(b, norm)
        return abs(norm_of_a - norm_of_b)

    difference = a - b
    return np.linalg.norm(difference, norm)

In [317]:
def print_distances(mc_list,name_list,norm,measure_name,before = True):
    """Print the distance between every unordered pair of matrices.

    A header line naming the measure is printed first. Then, for each pair
    ``i < j``, one line shows ``distance(mc_list[i], mc_list[j], norm, before)``
    followed by the two corresponding entries of ``name_list``.
    """
    print("Distance Measure:", measure_name)
    total = len(mc_list)
    index_pairs = [(i, j) for i in range(total) for j in range(i + 1, total)]
    for first, second in index_pairs:
        value = distance(mc_list[first], mc_list[second], norm, before)
        print(value, ": ", name_list[first], "-", name_list[second])
            

In [318]:
# Labels and matrices use one shared ordering, so position i refers to the
# same source in each list.
name_list = [
    "Breitbart",
    "Fox",
    "NYT",
    "HuffPost",
]
mc_list = [
    bb_mat,
    fox_mat,
    nyt_mat,
    hp_mat,
]
mc_list_prob = [
    bb_mat_prob,
    fox_mat_prob,
    nyt_mat_prob,
    hp_mat_prob,
]

In [319]:
# Pairwise Frobenius distances between the raw count matrices.
print_distances(
    mc_list,
    name_list,
    norm='fro',
    measure_name="Frobenius Norm",
)

Distance Measure: Frobenius Norm
375.598189559 :  Breitbart - Fox
653.606915508 :  Breitbart - NYT
385.075317308 :  Breitbart - HuffPost
640.156230931 :  Fox - NYT
330.797521152 :  Fox - HuffPost
630.102372635 :  NYT - HuffPost


In [320]:
# Pairwise Frobenius distances between the matrices rescaled by their grand totals.
print_distances(
    mc_list_prob,
    name_list,
    norm='fro',
    measure_name="Frobenius Norm (Probability)",
)

Distance Measure: Frobenius Norm (Probability)
0.0114709460971 :  Breitbart - Fox
0.0152104323377 :  Breitbart - NYT
0.0109352647244 :  Breitbart - HuffPost
0.0146409541173 :  Fox - NYT
0.00971115094858 :  Fox - HuffPost
0.0141400050243 :  NYT - HuffPost


In [323]:
#print_distances(mc_list_prob,name_list,'fro',"Diff Between Each Frobenius Norm (Probability)",False)

In [324]:
#print_distances(mc_list_prob,name_list,np.inf,"Max Norm(Prob)")