In [42]:
#import packages
import markovify #https://github.com/jsvine/markovify
import pandas as pd
from nltk.stem.lancaster import LancasterStemmer
import numpy as np

In [18]:
#override part of markovify Text Class to remove punctionation, numbers, and change all to lower case
import nltk
import re

stopwords = nltk.corpus.stopwords.words('english')
lancaster = LancasterStemmer()

class ModifiedText(markovify.Text):
   # word_split_pattern = re.compile(r"\s+|“+|”+|‘+|’+|'+|,")
    word_split_pattern = re.compile(r"\W|\d")
    def word_split(self, sentence):
        """
        Splits a sentence into a list of words.
        """
        words = re.split(self.word_split_pattern, sentence) 
        words = list(filter(None,words))
        
        words=[lancaster.stem(w.lower()) for w in words]
        words=[w for w in words if w not in stopwords]
        
        words=[w.lower() for w in words]
        return words

In [19]:
#create markov chain in the form of nested dictionaries: {(n-gram):{next_word:count}}
#state size = n in n-gram
#create transition matrix dataframe from chain
def create_MV_df(filename,state_size = 1, remove_punc = False):
    with open(filename) as f:
        text = f.read()
    f.close()
    if remove_punc:
        text_model = ModifiedText(text,state_size)
    else:
        text_model = markovify.Text(text,state_size=1)
    
    chain = text_model.chain.model
    trans_matrix =pd.DataFrame.from_dict(chain)
    return  trans_matrix 

In [20]:
def make_sentence(filename,state_size = 2, num_tries = 10, num_sentences = 3,max_overlap_ratio = .7, num_char = None, remove_punc = False):
    with open(filename) as f:
        text = f.read()
    f.close()
    if remove_punc:
        text_model = ModifiedText(text,state_size)
    else:
        text_model = markovify.Text(text,state_size)
    
    chain = text_model.chain.model
    #make sentences using chain
    for i in range(num_sentences):
        if num_char:
            print(text_model.make_short_sentence(num_char))
        else:
            print(text_model.make_sentence(max_overlap_ratio = max_overlap_ratio,tries = num_tries))

### Make Sentences using Chains

In [5]:
make_sentence('data/breitbart_pol_feb_20_26.txt',state_size = 3,max_overlap_ratio = .5, num_tries = 30)

None
“Prior to joining DHS in 2015, Mary served as the fourth director of the Army Capabilities Integration Center at Fort Eustis, Virginia.
None


In [6]:
make_sentence('data/fox_pol_feb_20_26.txt',state_size = 2,num_tries = 20, num_char = 140)

Flynn was forced to go to match that record.
According to the people.”
Our first president was mostly a steward, seeking to define the subject of an offensive video in the quiet of the House was correct.


In [7]:
make_sentence('data/nyt_pol_feb_20_26.txt',state_size = 2,num_tries = 20, num_char = 140)

View all New York Times's products and services.
Sign Up for the bureau’s help disputing the story, and that the free speech was not given formal status.
Mr. Trump’s team did not mention the six cases, Justice Thomas wrote.


In [8]:
make_sentence('data/huffpost_pol_feb_20_26.txt',state_size = 2,num_tries = 20, num_char = 140)

1979 Revisited Through the first Gulf War.
Officials at CPAC, including its chairman, Matt Schlapp, said Yiannopoulos’s response on Facebook and on and on.
It’s not that the U.S. expect such success.


## Find Distance Between MCs

In [21]:
MC_bb =create_MV_df('data/breitbart_pol_feb_20_26.txt',1,True)

In [22]:
MC_fox =create_MV_df('data/fox_pol_feb_20_26.txt',1,True)

In [23]:
MC_nyt =create_MV_df('data/nyt_pol_feb_20_26.txt',1,True)

In [24]:
MC_hp =create_MV_df('data/huffpost_pol_feb_20_26.txt',1,True)

In [13]:
#for state = 2
#chain[('in', 'the')]
#trans_matrix['in']['the'][trans_matrix['in']['the'].notnull()]

In [25]:
print(MC_bb.shape)
print(MC_fox.shape)
print(MC_nyt.shape)
print(MC_hp.shape)

(3123, 3123)
(3170, 3170)
(3312, 3312)
(3163, 3163)


In [None]:
for m in MC_list

In [35]:
def print_word_freq(chain_list,w1,w2):
    w1=lancaster.stem(w1.lower())
    w2=lancaster.stem(w2.lower())
    for i in range(len(chain_list)):
        print(chain_list[i][w1][w2])

In [69]:
MC_hp["mr"]["trump"]

3.0

In [75]:
print_word_freq(MC_list,"Mr", "Trump")

1.0
3.0
240.0
3.0


In [76]:
print_word_freq(MC_list,"President", "Trump")

56.0
40.0
22.0
7.0


In [77]:
print_word_freq(MC_list,"Donald", "Trump")

39.0
19.0
5.0
39.0


In [58]:
MC_list = [MC_bb,MC_fox,MC_nyt,MC_hp]

In [38]:
x= pd.concat(MC_list, axis = 0,keys = ['1','2','3','4'])

In [39]:
y = pd.concat(MC_list,axis = 1,keys = ['1','2','3','4'])

In [40]:
x1=x.ix['1']
x2=x.ix['2']
x3=x.ix['3']
x4=x.ix['4']

In [41]:
y1 = y['1']
y2 = y['2']
y3 = y['3']
y4 = y['4']

In [42]:
full_bb = pd.concat([x1,y1],axis=1,keys = ['1','2'])['1']
full_fox = pd.concat([x2,y2],axis=1,keys = ['1','2'])['1']
full_nyt = pd.concat([x3,y3],axis=1,keys = ['1','2'])['1']
full_hp = pd.concat([x4,y4],axis=1,keys = ['1','2'])['1']

In [43]:
import numpy as np

In [44]:
def df_to_mat(df):
    df = df.fillna(0)
    mat = df.as_matrix()
    return mat

In [45]:
def get_prob_mat(mat):
    return mat/np.sum(np.sum(mat,0))

In [46]:
bb_mat = df_to_mat(full_bb)
fox_mat = df_to_mat(full_fox)
nyt_mat = df_to_mat(full_nyt)
hp_mat = df_to_mat(full_hp)

In [47]:
bb_mat_prob = get_prob_mat(bb_mat)
fox_mat_prob = get_prob_mat(fox_mat)
nyt_mat_prob = get_prob_mat(nyt_mat)
hp_mat_prob = get_prob_mat(hp_mat)

In [48]:
def distance(a,b,norm = 'fro',before=True):
    if before:
        return np.linalg.norm(a-b,norm)
    else:
        return abs(np.linalg.norm(a,norm)-np.linalg.norm(b,norm))

In [49]:
def print_distances(mc_list,name_list,norm,measure_name,before = True):
    print("Distance Measure:", measure_name)
    for i in range(len(mc_list)):
        for j in range(i+1,len(mc_list)):
            print(distance(mc_list[i],mc_list[j],norm,before),": ",name_list[i],"-",name_list[j])
            

In [50]:
name_list = ["Breitbart","Fox","NYT","HuffPost"]
mc_list = [bb_mat,fox_mat,nyt_mat,hp_mat]
mc_list_prob = [bb_mat_prob,fox_mat_prob,nyt_mat_prob,hp_mat_prob]

In [51]:
print_distances(mc_list,name_list,'fro',"Frobenius Norm")

Distance Measure: Frobenius Norm
331.888535506 :  Breitbart - Fox
635.831738749 :  Breitbart - NYT
427.31955256 :  Breitbart - HuffPost
640.156230931 :  Fox - NYT
436.838642979 :  Fox - HuffPost
645.382057389 :  NYT - HuffPost


In [52]:
print_distances(mc_list_prob,name_list,'fro',"Frobenius Norm (Probability)")

Distance Measure: Frobenius Norm (Probability)
0.0102253274847 :  Breitbart - Fox
0.0145545748391 :  Breitbart - NYT
0.00900426158816 :  Breitbart - HuffPost
0.0146409541173 :  Fox - NYT
0.00896531015959 :  Fox - HuffPost
0.0138007481185 :  NYT - HuffPost


In [323]:
#print_distances(mc_list_prob,name_list,'fro',"Diff Between Each Frobenius Norm (Probability)",False)

In [324]:
#print_distances(mc_list_prob,name_list,np.inf,"Max Norm(Prob)")