In [31]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
import numpy.linalg as LA
import re
import nltk

In [50]:
import json
from datetime import datetime as dt
import collections
import pandas as pd

In [136]:
sourcefnames = {'bloomberg' : 'data/scrapedbloomberg.json',
                'breitbart' : 'data/scrapedbreitbart.json',
                'cnn' : 'data/scrapedcnn.json',
                'fox' : 'data/scrapedfox.json',
                #'guardian' : 'data/scrapedguardian.json',
                'natl review' : 'data/scrapednatreview.json',
                'WaPo' : 'data/scrapedwapo.json'}
urls = {}
datecounts = {}
article_text = {}


In [200]:
for k, v in sourcefnames.items():
    urls[k] = set()
    datecounts[k] = {}
    with open(v) as f:
        for line in f:
            art = json.loads(line)
            max_articles = False
            if art['date'] and art['sourceurl'] not in urls[k]:
                dobj = dt.strptime(art['date'][0:10], '%Y-%m-%d')
                if dobj.year == 2017:    
                    if dobj in datecounts[k]:
                        if datecounts[k][dobj] < 1:
                            datecounts[k][dobj] += 1
                        else:
                            max_articles = True
                    else:
                        datecounts[k][dobj] = 1
                    if not max_articles:
                        urls[k].add(art['sourceurl'])
                        if (dobj.month,k) in article_text:
                            article_text[(dobj.month,k)]+=art['response_body']
                        else:
                            article_text[(dobj.month,k)]=art['response_body']
                            
                    

In [201]:
article_text = collections.OrderedDict(sorted(article_text.items()))

In [202]:
for t in article_text:
    print(len(article_text[t]),t[0],t[1])

572736 1 WaPo
149499 1 bloomberg
627156 1 breitbart
1512418 1 cnn
541400 1 fox
463521 1 natl review
561405 2 WaPo
237539 2 bloomberg
725659 2 breitbart
1183944 2 cnn
1133763 2 fox
482311 2 natl review
417259 3 WaPo
128191 3 bloomberg
250255 3 breitbart
186620 3 cnn
154417 3 fox
286454 3 natl review


In [196]:
datecounts

{'WaPo': {datetime.datetime(2017, 1, 2, 0, 0): 2,
  datetime.datetime(2017, 1, 5, 0, 0): 2,
  datetime.datetime(2017, 1, 7, 0, 0): 2,
  datetime.datetime(2017, 1, 8, 0, 0): 1,
  datetime.datetime(2017, 1, 15, 0, 0): 1,
  datetime.datetime(2017, 1, 16, 0, 0): 2,
  datetime.datetime(2017, 1, 17, 0, 0): 1,
  datetime.datetime(2017, 1, 18, 0, 0): 1,
  datetime.datetime(2017, 1, 21, 0, 0): 2,
  datetime.datetime(2017, 1, 23, 0, 0): 8,
  datetime.datetime(2017, 1, 24, 0, 0): 2,
  datetime.datetime(2017, 1, 25, 0, 0): 4,
  datetime.datetime(2017, 1, 28, 0, 0): 1,
  datetime.datetime(2017, 1, 30, 0, 0): 1,
  datetime.datetime(2017, 1, 31, 0, 0): 5,
  datetime.datetime(2017, 2, 1, 0, 0): 2,
  datetime.datetime(2017, 2, 5, 0, 0): 3,
  datetime.datetime(2017, 2, 8, 0, 0): 1,
  datetime.datetime(2017, 2, 9, 0, 0): 5,
  datetime.datetime(2017, 2, 10, 0, 0): 2,
  datetime.datetime(2017, 2, 11, 0, 0): 1,
  datetime.datetime(2017, 2, 12, 0, 0): 4,
  datetime.datetime(2017, 2, 13, 0, 0): 5,
  datetime.

In [190]:
month = 2
names = [t[1] for t in article_text if t[0] == month]
d = [article_text[t] for t in article_text if t[0] == month]

In [191]:
names

['WaPo', 'bloomberg', 'breitbart', 'cnn', 'fox', 'natl review']

In [192]:
stopWords = stopwords.words('english')
tokenize = lambda doc: list(filter(None,re.split(r"\W|\d",doc.lower())))

In [193]:
tfidf_vec = TfidfVectorizer(stop_words = stopWords,tokenizer=tokenize)
tfidf = tfidf_vec.fit_transform(d)

In [184]:
#tfidf_vec.get_feature_names()

In [185]:
from sklearn.metrics.pairwise import linear_kernel

In [186]:
def cosine_similarities_mat(m):
    n = m.shape[0]
    cs_array = np.zeros((n,n))
    for i in range(n):
        cs_array[i]=(linear_kernel(m[i:i+1], m).flatten())
    return cs_array

In [187]:
def pearson_corr_mat(m):
    n = m.shape[0]
    pc_array_all = np.corrcoef(m.toarray(), m.toarray())
    pc_array = np.zeros((n,n))
    for i in range(n):
        pc_array[i]=pc_array_all[i][0:n]
    return pc_array

In [188]:
cs_mat = cosine_similarities_mat(tfidf)
df_cs = pd.DataFrame(cs_mat, columns = names, index = names)
df_cs

Unnamed: 0,WaPo,bloomberg,breitbart,cnn,fox,natl review
WaPo,1.0,0.604432,0.533233,0.836252,0.799638,0.584214
bloomberg,0.604432,1.0,0.40139,0.626982,0.646044,0.475183
breitbart,0.533233,0.40139,1.0,0.581852,0.608903,0.512336
cnn,0.836252,0.626982,0.581852,1.0,0.810445,0.610755
fox,0.799638,0.646044,0.608903,0.810445,1.0,0.601729
natl review,0.584214,0.475183,0.512336,0.610755,0.601729,1.0


In [189]:
pc_mat = pearson_corr_mat(tfidf)
df_pc = pd.DataFrame(pc_mat, columns = names, index = names)
df_pc

Unnamed: 0,WaPo,bloomberg,breitbart,cnn,fox,natl review
WaPo,1.0,0.58834,0.507521,0.828452,0.789826,0.560692
bloomberg,0.58834,1.0,0.370086,0.609752,0.62916,0.446871
breitbart,0.507521,0.370086,1.0,0.553654,0.580424,0.472199
cnn,0.828452,0.609752,0.553654,1.0,0.798319,0.583096
fox,0.789826,0.62916,0.580424,0.798319,1.0,0.570893
natl review,0.560692,0.446871,0.472199,0.583096,0.570893,1.0


## Original

In [79]:
def read_text (file):
    with open(file,'r') as f:
        text = f.read()
    f.close()
    return text

In [80]:
bb = read_text('data/breitbart_pol_feb_20_26.txt')
nyt = read_text('data/nyt_pol_feb_20_26.txt')
hp = read_text('data/huffpost_pol_feb_20_26.txt')
fox = read_text('data/fox_pol_feb_20_26.txt')


In [81]:
d = [bb,nyt,fox,hp]
names = ["breitbart","nyt","fox","hp"]

In [82]:
stopWords = stopwords.words('english')
tokenize = lambda doc: list(filter(None,re.split(r"\W|\d",doc.lower())))


In [98]:
tfidf_vec = TfidfVectorizer(stop_words = stopWords,tokenizer=tokenize)
tfidf = tfidf_vec.fit_transform(d)
#print(td_mat.shape)
#count_vect.get_feature_names()

In [99]:
#tf and tf-idf
# tf_transformer = TfidfTransformer(use_idf=False)
# tf = tf_transformer.fit_transform(td_mat)
# tfidf_transformer = TfidfTransformer()
# tfidf = tfidf_transformer.fit_transform(td_mat)

In [100]:
from sklearn.metrics.pairwise import linear_kernel

In [12]:
count_vect = CountVectorizer(max_features = 50,stop_words = stopWords,tokenizer=tokenize)
td_mat = count_vect.fit_transform(d)
#count_vect.get_feature_names()
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(td_mat)

In [106]:
def jaccard_similarity(d1, d2):
    intersection = set(d1).intersection(set(d2))
    union = set(d1).union(set(d2))
    return len(intersection)/len(union)

In [107]:
from nltk.probability import FreqDist

In [108]:
d_tokenized =[tokenize(doc) for doc in d]
d_no_stop =[[w for w in d if w not in stopWords] for d in d_tokenized]
d_Freq =[FreqDist(doc) for doc in d_no_stop]
d_common_50 =[f.most_common(50) for f in d_Freq]

In [109]:
d_most_common_words = [list(zip(*d))[0] for d in d_common_50]

In [111]:
def jaccard_similarity_mat(m):
    n = len(m)
    array = np.zeros((n,n))
    for i in range(n):
        for j in range(i,n):
            sim = jaccard_similarity(m[i],m[j])
            array[i][j] = sim
            array[j][i] = sim
    return array

In [112]:
jaccard_similarity_mat(d_most_common_words)

array([[ 1.        ,  0.2987013 ,  0.36986301,  0.33333333],
       [ 0.2987013 ,  1.        ,  0.36986301,  0.35135135],
       [ 0.36986301,  0.36986301,  1.        ,  0.42857143],
       [ 0.33333333,  0.35135135,  0.42857143,  1.        ]])