In [1]:
import os
import re
import codecs
import pathlib

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import nltk

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# helper that tidies a single raw text string
def clean_text(text):
    """Return ``text`` with leading and trailing whitespace removed."""
    stripped = text.strip()
    return stripped

In [3]:
# Load the source metadata table.
# Fields are separated by one or two tab characters, hence the regex
# separator (which requires the python parsing engine).
source_columns = ["id", "n_words", "date", "country", "website", "url", "title"]
sources = pd.read_csv(
    "data/raw/sources.txt",
    sep="\t{1,2}",
    encoding="ISO-8859-1",
    skiprows=2,
    engine='python',
)
# replace whatever header was parsed with the names used in this notebook
sources.columns = source_columns

# convert the date column from "yy-mm-dd" strings to pandas datetimes
parsed_dates = pd.to_datetime(sources["date"], format="%y-%m-%d")
sources["date"] = parsed_dates

pd.set_option('display.max_colwidth', 40)
# print(sources.shape)
# sources.head()

In [4]:
# read in text data
# Lines that start with "@@" are parsed as "<digits><whitespace><rest of line>";
# the digits become the report id and the rest becomes the text.
# All other lines are ignored.
# Raw string: "\d" and "\s" are regex escapes, not Python string escapes.
ID_AND_TEXT = re.compile(r"(\d+)\s(.*)")

# NOTE(review): no encoding is passed here although sources.txt is read as
# ISO-8859-1 -- confirm the platform default decodes this file correctly.
with open("data/raw/text.txt", "r") as f:
    # iterate the file lazily; the `with` block closes it on exit
    text = pd.DataFrame(
        [ID_AND_TEXT.search(line[2:]).groups() for line in f if line.startswith("@@")],
        columns=["id", "text"],
    )

# id should be an integer
text["id"] = text["id"].astype(int)
# lower-case the text, then strip surrounding whitespace
text["text"] = text["text"].str.lower().apply(clean_text)

In [5]:
# combine the data: outer-join the source metadata with the texts on the report id
reports = sources.merge(text, on="id", how="outer")
# publication year as a 4-digit string
reports["year"] = reports["date"].dt.strftime("%Y")
# Keep only rows that actually have text. The explicit copy makes `reports`
# an independent frame, so later column assignments do not raise
# SettingWithCopyWarning.
reports = reports[reports['text'].notna()].copy()

In [6]:
# Ordered clean-up rules applied to every report text:
# (pattern, replacement, is_regex).
# The regex flag is explicit because the default of `Series.str.replace`
# differs between pandas versions: with regex=False as the default, the three
# real regular expressions below would silently be treated as literal text.
# Order matters, e.g. " <p>" must be handled before the bare "<p>".
TEXT_CLEANUP_RULES = [
    ("{", "", False),
    ("}", "", False),
    ("\n", "", False),                    # no newline survives this rule
    ("@ @ @ @ @ @ @ @ @ @ ", "", False),  # runs of ten "@" placeholders
    (" @", "", False),
    (" '", "'", False),                   # re-attach detached apostrophes
    ("\"", "", False),
    (",", "", False),
    ("(", "", False),
    (")", "", False),
    (" <p>", ".", False),                 # paragraph/heading tags become sentence ends
    (" <h>", ".", False),
    ("<p>", "", False),
    ("<h>", "", False),
    ("<", "", False),
    (">", "", False),
    (":", "", False),
    ("?", ".", False),
    ("!", ".", False),
    (r"\.\s[\.\s]+", ". ", True),         # converting ". ." to ". "
    (r"\.+", ".", True),                  # converting "..." to "."
    ("--", "", False),
    ("-", " ", False),
    (" +", " ", True),                    # collapse runs of spaces
    (" n't", "n't", False),               # re-attach detached "n't"
]

cleaned_text = reports['text']
for pattern, replacement, is_regex in TEXT_CLEANUP_RULES:
    cleaned_text = cleaned_text.str.replace(pattern, replacement, regex=is_regex)
reports['text'] = cleaned_text

In [7]:
# Limit displayed column width to 40 characters when previewing frames.
pd.set_option('display.max_colwidth', 40 )
# reports.head()

In [8]:
# article_list = ['Times of India', 'The Guardian', 'Toronto Star', 'The Nation Newspaper', 'Vanguard',
#  'Telegraph.co.uk', 'Irish Times', 'BBC News', 'Irish Examiner', 'News24', 'ABC Online',
#  'Independent Online', 'Stuff.co.nz', 'The Hindu', 'GhanaWeb', 'The Independent', 'Otago Daily Times',
#  'Toronto Star', 'Globe and Mail', 'The Independent', 'News24', 'ABC Online', 'The Nation Newspaper', 'Otago Daily Times', 'Vanguard', 'InterAksyon']

# article_list = ['Independent Online', 'Stuff.co.nz', 'The Hindu', 'GhanaWeb', 'The Independent', 'Otago Daily Times']

# article_list = list(reports['website'].value_counts().index)

In [9]:
# Number of reports per website, most frequent first.
article_val_cnt = reports['website'].value_counts()
article_df = pd.DataFrame({
    'article': article_val_cnt.index,
    'count': article_val_cnt.values,
})
# article_df.shape
display(article_df)

total_len = article_df['count'].sum()
print("total len: ", total_len)

# Share (in percent) of all reports that come from websites with exactly one
# report, and from websites with more than 1, 2, ... 5 reports.
site_counts = article_df['count']
print("percentage of article of 1: ", site_counts[site_counts == 1].sum() / total_len * 100)
for min_reports in range(1, 6):
    share = site_counts[site_counts > min_reports].sum() / total_len * 100
    print("percentage of article of above {}: ".format(min_reports), share)


Unnamed: 0,article,count
0,Times of India,91
1,Telegraph.co.uk,52
2,Independent Online,49
3,Daily Mail,44
4,Irish Independent,41
...,...,...
1075,TheTyee.ca,1
1076,Bucks Free Press,1
1077,Varsity Online,1
1078,Comcast SportsNet New England,1


total len:  2864
percentage of article of 1:  23.743016759776538
percentage of article of above 1:  76.25698324022346
percentage of article of above 2:  64.66480446927375
percentage of article of above 3:  57.437150837988824
percentage of article of above 4:  51.43156424581006
percentage of article of above 5:  46.892458100558656


### Look at 5-grams with a frequency of at least 10

In [10]:
# Work on a copy of the reports and add a column holding each text split
# into a list of sentences.
sentence_lists = reports['text'].apply(sent_tokenize)
reports_temp = reports.copy()
reports_temp['text_vect'] = sentence_lists
reports_temp.head(5)

Unnamed: 0,id,n_words,date,country,website,url,title,text,year,text_vect
0,11241,397.0,2013-01-06,US,Kotaku,http://kotaku.com/5973495/author-of-...,"Author of The Warriors, Cult Film Ad...",sol yurick the writer whose 1965 no...,2013,[ sol yurick the writer whose 1965 n...
1,11242,757.0,2013-01-06,US,Michigan Radio,http://michiganradio.org/post/thats-...,That's What They Say: Dialect Societ...,that's what they say dialect societ...,2013,[ that's what they say dialect socie...
2,11243,755.0,2013-01-06,US,New York Daily News,http://www.nydailynews.com/life-styl...,Best of New York: Croissant,a sublime croissant at french tart ...,2013,[ a sublime croissant at french tart...
3,11244,1677.0,2013-01-06,US,OregonLive.com,http://www.oregonlive.com/performanc...,Reflecting on a quarter-century of g...,reflecting on a quarter century of ...,2013,[ reflecting on a quarter century of...
4,21242,794.0,2013-01-11,US,Ars Technica,http://arstechnica.com/gadgets/2013/...,Ask Ars: Does Facebook auto-delete c...,ask ars does facebook auto delete c...,2013,[ ask ars does facebook auto delete ...


In [11]:
pd.set_option('display.max_colwidth', 80)

NGRAM_SIZE = 5             # length (in words) of the n-grams that are counted
MIN_PHRASE_FREQUENCY = 10  # an n-gram seen at least this often within one website is flagged
MAX_PHRASES_SHOWN = 69     # number of flagged n-grams printed per website


def _remove_sentences(report_text, sentences):
    """Return ``report_text`` (as str) with every string in ``sentences`` deleted, in order."""
    report_text = str(report_text)
    for sentence in sentences:
        report_text = report_text.replace(sentence, "")
    return report_text


# For every website: count word 5-grams over all of its sentences, flag the
# n-grams that repeat often, and delete every sentence containing a flagged
# n-gram (presumably site boilerplate such as comment guidelines).
article_list = list(article_df.article.values)

for article in article_list:
    temp_df = reports_temp[reports_temp['website'] == article][['id', 'text', 'text_vect']]
    # all sentences of this website in one flat list
    flatten = [item for sublist in temp_df['text_vect'] for item in sublist]

    word_vectorizer = CountVectorizer(ngram_range=(NGRAM_SIZE, NGRAM_SIZE), stop_words=[])
    try:
        sparse_matrix = word_vectorizer.fit_transform(flatten)
    except ValueError as err:
        # fit_transform raises ValueError when no n-gram can be built at all
        # (e.g. no sentence has enough tokens); skip instead of aborting the run.
        print("")
        print("==============================================")
        print(article)
        print("==============================================")
        print("skipped:", err)
        continue

    # Column sums give the corpus-wide count of each n-gram.
    frequency = np.asarray(sparse_matrix.sum(axis=0)).ravel()
    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer get_feature_names_out() and fall back for old installations.
    feature_names = (
        getattr(word_vectorizer, "get_feature_names_out", None)
        or word_vectorizer.get_feature_names
    )()
    frequency_df = pd.DataFrame(
        frequency, index=feature_names, columns=['frequency']
    ).sort_values(by=['frequency'], ascending=False)

    freq_above_10 = frequency_df[frequency_df['frequency'] >= MIN_PHRASE_FREQUENCY]
    phrase_list = list(freq_above_10.index)

    print("")
    print("==============================================")
    print(article)
    print("==============================================")
    print(freq_above_10.head(MAX_PHRASES_SHOWN))

    sentences_to_remove = set()
    for phrase in phrase_list:
        # The vectorizer tokenises the text, so a phrase may not occur verbatim
        # in any sentence; in that case the set is simply empty.
        removing_sent = set([sent for sent in flatten if phrase in sent])

        print(phrase, "-----", removing_sent)
        print("")

        sentences_to_remove.update(removing_sent)

    # never delete a bare full stop
    sentences_to_remove -= {".", " ."}
    # longest first, so a short sentence is not cut out of a longer one beforehand
    sentences_to_remove = sorted(sentences_to_remove, key=len, reverse=True)

    print("\n  ===============================================")
    print("Deleted sentence for this article: \n")
    print(sentences_to_remove)
    if sentences_to_remove:
        # NOTE(review): as before, the sentences are removed from the text of
        # ALL rows, not only this website's rows, and `text_vect` is not
        # refreshed -- confirm this is intended.
        reports_temp['text'] = reports_temp['text'].apply(
            lambda report_text: _remove_sentences(report_text, sentences_to_remove)
        )


Times of India
                                      frequency
from the times of india                     156
more from the times of                      154
guidelines by marking them offensive         82
that do not follow these                     81
follow these guidelines by marking           81
...                                         ...
till it gets response from                   10
admission in two california based            10
in two california based universities         10
to defer their departure till                10
from the us government on                    10

[69 rows x 1 columns]
from the times of india ----- {'more from the times of india.', 'what better than donating blood and saving lives gupta added from the times of india.', 'from the times of india.', 'sex ratio has improved from 1991 to 2001 and till now more from the times of india.'}

more from the times of ----- {'more from the times of india.', 'sex ratio has improved from 1991 to 2001 and till no


Telegraph.co.uk
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Independent Online
                                     frequency
until small triangle appears on             20
addresses all users on independent          20
appears on the right hand                   20
for more information please read            20
wait until small triangle appears           20
moderators will take action if              20
take action if need be                      20
and wait until small triangle               20
hover your mouse over the                   20
your mouse over the comment                 20
will take action if need                    20
on the right hand side                      20
email addresses all users on                20
verified email addresses all users          20
triangle appears on the right               20
our moderators will take action             20
comment and wait until small                20
small triangle appears on the


Daily Mail
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Irish Independent
                                              frequency
contributors and the moderator decision              15
enter into debate with individual                    15
with individual contributors and the                 15
debate with individual contributors and              15
individual contributors and the moderator            15
that are judged to be                                15
comments that are judged to                          15
moderator will not enter into                        15
concise and to the point                             15
into debate with individual contributors             15
the moderator will not enter                         15
will not enter into debate                           15
not enter into debate with                           15
are judged to be defamatory                          15
be concise and to the                    


Stuff.co.nz
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Irish Times
                                 frequency
cookies see our cookie policy           11
to our use of cookies                   11
using this website you consent          11
more information on cookies see         11
on cookies see our cookie               11
information on cookies see our          11
you consent to our use                  11
by using this website you               11
this website you consent to             11
consent to our use of                   11
website you consent to our              11
for more information on cookies         11
cookies see our cookie policy ----- {'for more information on cookies see our cookie policy .'}

to our use of cookies ----- {' by using this website you consent to our use of cookies .'}

using this website you consent ----- {' by using this website you consent to our use of cookies .'}

more information on cookies see -----


InterAksyon
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Vanguard
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

CBC.ca
                                          frequency
and publicize those comments or                  13
or any part thereof in                           13
comments or any part thereof                     13
that comments are moderated and                  13
publicize those comments or any                  13
published according to our submission            13
moderated and published according to             13
are moderated and published according            13
please note that comments are                    13
note that comments are moderated                 13
comments are moderated and published             13
thereof in any manner whatsoever                 13
any part thereof in any                          13
according to our submission guidelines           13
part thereof 


The Nation Newspaper
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Otago Daily Times
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

The42
                                   frequency
the office of the press                   30
office of the press ombudsman             30
media does not control and                25
journal media does not control            25
and is not responsible for                23
...                                      ...
method of dealing with complaints         15
cookies please refer to our               15
created content and their own             14
and to provide services and               14
articles that appear on our               14

[69 rows x 1 columns]
the office of the press ----- {' thejournal.ie is a full participating member of the press council of ireland and supports the office of the press ombudsman .', 'thejournal.ie is and supports the office of the pre


Mirror.co.uk
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

The Straits Times
                                            frequency
log ins and apologise for                          13
and apologise for the inconvenience                13
been experiencing some problems with               13
have been experiencing some problems               13
we have been experiencing some                     13
apologise for the inconvenience caused             13
follow st the straits times                        13
subscriber log ins and apologise                   13
problems with subscriber log ins                   13
experiencing some problems with subscriber         13
ins and apologise for the                          13
issues subscribers need not log                    13
subscribers need not log in                        13
the issues subscribers need not                    13
until we resolve the issues                        13
log in is still

Deccan Herald
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Irish Mirror
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Free Malaysia Today
                                  frequency
violate the letter or spirit             13
the letter or spirit of                  13
and do not use swear                     13
comments that violate the letter         13
please be polite and do                  13
...                                     ...
those of our users and                   11
expressed in the contents are            11
account to comment on this               11
words or crude or sexual                 11
not use swear words or                   11

[69 rows x 1 columns]
violate the letter or spirit ----- {'fmt also holds the right to remove comments that violate the letter or spirit of the general commenting rules .', 'please be polite and do not use swear fmt also holds the right to remove


Catch News
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

THISDAY Live
                                      frequency
this page once it has                        12
once it has been approved                    12
it has been approved by                      12
has been approved by moderator               12
appear on this page once                     12
appear next to your comment                  12
comment will appear on this                  12
keep you updated by email                    12
updated by email whenever someone            12
on this page once it                         12
by email whenever someone else               12
ll also keep you updated                     12
we ll also keep you                          12
also keep you updated by                     12
someone else comments on this                12
whenever someone else comments on            12
will appear next to your                     12
will appear on this pa


Channel News Asia
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Livemint
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

National Post
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Rediff
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

thejournal.ie
                                 frequency
media does not control and              19
office of the press ombudsman           19
the office of the press                 19
journal media does not control          18
does not control and is                 17
...                                    ...
such content and their ability          10
are fully responsible for their         10
journal media in relation to            10
1890 208 080 or go                      10
in relation to such content             10

[69 rows x 1 columns]
media does not control and ----- {'journal


Sydney Morning Herald
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

NDTV
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Huffington Post
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

TopNews
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Firstpost
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Economic Times
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Philippine Star
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

CTV News
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

News Ghana
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

RTE.ie
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this artic


BreakingNews.ie
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

TIME
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

SuperSport
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Entertainment.ie
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

The Star Online
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

TODAY.ng
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Peace FM Online
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Vancouver Sun
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

The News International
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

The Rakyat Post
Empty DataFrame
Columns: [frequency]
Index: []

Deleted


[]

The Standard
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Mail &amp; Guardian Online
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Montreal Gazette
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Business Recorder 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

New Zealand Herald
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

The News Journal
                             frequency
relative to its stock price         10
relative to its stock price ----- {"the current p/c value outlines the company's ability to generate cash relative to its stock price rather than what it records on earnings relative to its stock price ."}


Deleted sentence for this article: 

["the current p/c value outlines the company's ability to generate cash relative to its stock price rather than wha


Bella Naija
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

DigitalJournal.com
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Greater Kashmir
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Scoop.co.nz 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Eyewitness News
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

BBC Sport
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

IPPmedia
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Chicago Tribune
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Health24
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

sportal.co.nz
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence

Mayo News
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Daily Maverick
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

The Standard Digital News (satire) 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Scoop.co.nz
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Straits Times
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

NJ.com
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

New York Times
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Lankaweb
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Business Recorder (press release) (registration) (blog)
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

ABC News
Empty DataFrame



TVNZ
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

International Business Times, India Edition
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Western Star
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

BusinessWorld Online Edition
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Music Feeds
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Soccer Laduma
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

YorkRegion.com
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Taranaki Daily News
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Limerick Leader
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Stuff
Empty DataFrame
Columns:


Las Vegas Review-Journal
                                 frequency
see the inside of wayne                 41
the inside of wayne newton              41
can finally see the inside              40
inside of wayne newton estate           40
of wayne newton estate photos           40
finally see the inside of               39
you can finally see the                 39
casa de shenandoah on monday            38
for public tours on friday              38
de shenandoah on monday sept            38
at sunset and pecos roads               38
open for public tours on                38
scheduled to open for public            38
to open for public tours                38
ranch at sunset and pecos               37
at casa de shenandoah on                37
and pecos roads in las                  37
vegas is scheduled to open              37
las vegas is scheduled to               37
in las vegas is scheduled               37
sept 14 2015 in las                     37
14 2015 in las vegas        


KitGuru
                              frequency
vaf 221171 rating from votes         14
vaf 221171 rating from votes ----- set()


Deleted sentence for this article: 

[]

Slate Magazine
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

News Pakistan
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Hong Kong Standard (press release)
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

People Magazine
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Washington Times
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Bellevision
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Ars Technica
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Tempo
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this ar


Randfontein Herald
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

GlamSham
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

SaharaReporters.com
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

GSMArena.com
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

UN News Centre
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Coconuts Hong Kong
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

The Times 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

The Barrie Examiner
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

IndiaGlitz
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

The Standard Digital News
Empty DataFrame
Columns: [frequency


USA TODAY
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Yentha
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Scroll.in
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Jalopnik
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

DestinyMan
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

afaqs 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

P.M. News
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

rabble.ca
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

indiatvnews.com
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

The Verge
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

FilmiBeat


Donegal Now
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Ventures Africa
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Danville Commercial News
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

3News NZ
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

South Africa.info
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Ubyssey Online
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Community Newspaper Group
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Channel 24
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Gizmag
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

gearburn
Empty DataFrame
Columns: [frequency]
Index: []

De


Michigan Radio
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

PR.com 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

NOLA.com
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Victoria Times Colonist
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Westmeath Examiner
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Mid-Day
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Brock Press
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Business Recorder
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Edmonton Journal 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Know Your Mobile
Empty DataFrame
Columns: [frequency]
Index: []

De


The Irish World Newspaper
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

LNG Industry
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Pain In The Arsenal
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Australian Personal Computer
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

NME.com
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Minneapolis Star Tribune
                               frequency
on wednesday february 17 2016         10
on wednesday february 17 2016 ----- {"gallery dillon semolina's dog tags are seen on display at the home of his mother and stepfather mike and lisa de la cruz in bloomington on wednesday february 17 2016 .", 'gallery the dress blues that belonged to dillon semolina hang in the bloomington home of his mother and stepfather on wednesday february 17 20


Crawley Observer
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Financial Times
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

India Infoline.com
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Portage Daily Graphic
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Sunshine Coast Daily
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

The Republic
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Australia Business Review Weekly 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

ITProPortal
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Herald
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

SooToday.com
Empty DataFrame
Columns:


World Science
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Press-Register - al.com
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

CJOB
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Fox17
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

FOX 61
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Vancouver Courier
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

NBC 6 South Florida
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

San Antonio Express
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Wetinhappen Magazine 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Sportsnet.ca
Empty DataFrame
Columns: [frequency]
Index: []

D


Fusion
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

NPR 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

MINING.com
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

The Killeen Daily Herald
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

KNIA / KRLS Radio
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

KARE
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Valdosta Daily Times
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Leitrim Observer
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Reading Post
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

RealWire 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence f


Limerick Post
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Rip It Up
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

MovieWeb
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

ABN Newswire 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

The Press
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Poynter.org
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

PolitiFact
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Investor's Business Daily
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Cloverdale Reporter
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

WWLTV.com
Empty DataFrame
Columns: [frequency]
Index: []

Deleted senten


countylive.ca 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Android Police
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Farms.com
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

fox8.com
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Media Update
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

GameSpot
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Sudbury Star
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

BBC News 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Radio Canada International
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

On Cars India
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentenc


1888 Press Release 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Everest News
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

FOX Illinois
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

TamilEelamNews.com
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

K24 TV
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

CNBC.com
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Androinica 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Daily Echo
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

SBS - The World Game
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

CanIndia News
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sen

Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

The Sunday Business Post
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Cebu Daily News
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Calgary Sun
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Colombo Page
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Evening Echo Cork
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

The Online Citizen 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

New Zealand Herald 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Singletrack
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Thought Leader 
Empty DataFrame
Columns: [frequency]
Index: []



Biztech Africa
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Model D
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Kawartha Media Group
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

iPolitics.ca 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

News On 6
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Vox
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Institute for Defence Studies and Analyses
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Khabar India
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Spectator.co.uk 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

FootballliveNG 
Empty DataFrame
Columns: [frequency]


Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Portadown Times
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

The Business of Fashion
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

FOXSports.com
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

KOLO
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

11alive.com
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

The Southern
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Abbotsford News 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Northcliff Melville Times
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Mas Market News
                             frequency
shares of the com


U.S. News &amp; World Report 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

PR Web 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

PC Advisor
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Go Jamaica 
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

India TV
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Castanet.net
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Insurance Business
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Billboard
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Galway Bay FM
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

CBSSports.com
Empty DataFrame
Columns: [frequency]
Index: []

Deleted se

Business Insider
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

kwwl.com
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Paul Tan's Automotive News
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

VOCM
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Australian Macworld
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Hit The Floor Magazine
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Islington Gazette
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

GQ Magazine
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

PSFK
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Economy Lead (blog)
Empty DataFrame
Columns: [frequency]
Index:


Longford Leader
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Stony Plain Reporter
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Daily Cannon (satire) (blog)
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

ChannelLife NZ
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Finextra
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Linlithgow Journal &amp; Gazette
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Kotaku
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

The Province
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

CFJC Today Kamloops, British Columbia
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Drowned In Sound



Raptors Republic
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Indiantelevision.com
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

TheTyee.ca
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Bucks Free Press
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Varsity Online
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

Comcast SportsNet New England
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]

stv.tv
Empty DataFrame
Columns: [frequency]
Index: []

Deleted sentence for this article: 

[]


In [12]:
# Show the full article text of every row in `reports` whose website is
# 'Times of India', as a plain Python list.
reports.loc[reports['website'] == 'Times of India', 'text'].tolist()

[" indian railway's total earning up by 20%. the total approximate earnings of indian railways on originating basis during 1st april 2012 to 31st january 2013 was rs 101223.95 crore compared to rs 84083.74 crore during the same period last year . tnn feb 11 2013 03.34 pm ist. mumbai the total approximate earnings of indian railways on originating basis during 1st april 2012 to 31st january 2013 was rs 101223.95 crore compared to rs 84083.74 crore during the same period last year registering an increase of 20.38 per cent . a press release issued by the ministry of railways said the total goods earnings have gone up from rs 56163.30 crore during 1st april 2011 31st january 2012 to rs 70067.36 crore during 1st april 2012 31st january 2013 registering an increase of 24.76 per cent . the total passenger revenue earnings during 1st april 2012 31st january 2013 was rs 25924.29 crore compared to rs. 23344.42 crore during the same period last year registering an increase of 11.05 the revenue ea

In [13]:
# Same listing as for `reports`, but taken from `reports_temp`
# (presumably the frame produced by the sentence-removal step above -- verify).
reports_temp.loc[reports_temp['website'] == 'Times of India', 'text'].tolist()

[" indian railway's total earning up by 20%. the total approximate earnings of indian railways on originating basis during 1st april 2012 to 31st january 2013 was rs 101223.95 crore compared to rs 84083.74 crore during the same period last year . tnn feb 11 2013 03.34 pm ist. mumbai the total approximate earnings of indian railways on originating basis during 1st april 2012 to 31st january 2013 was rs 101223.95 crore compared to rs 84083.74 crore during the same period last year registering an increase of 20.38 per cent . a press release issued by the ministry of railways said the total goods earnings have gone up from rs 56163.30 crore during 1st april 2011 31st january 2012 to rs 70067.36 crore during 1st april 2012 31st january 2013 registering an increase of 24.76 per cent . the total passenger revenue earnings during 1st april 2012 31st january 2013 was rs 25924.29 crore compared to rs. 23344.42 crore during the same period last year registering an increase of 11.05 the revenue ea

In [14]:
# Take an independent (deep) copy of `reports_temp` under a new name, so that
# modifying the data of one frame is not reflected in the other.
reports_update = reports_temp.copy(deep=True)

In [15]:
# Display the text of the first 'Times of India' row in `reports_update`.
reports_update.loc[reports_update['website'] == 'Times of India', 'text'].iloc[0]

" indian railway's total earning up by 20%. the total approximate earnings of indian railways on originating basis during 1st april 2012 to 31st january 2013 was rs 101223.95 crore compared to rs 84083.74 crore during the same period last year . tnn feb 11 2013 03.34 pm ist. mumbai the total approximate earnings of indian railways on originating basis during 1st april 2012 to 31st january 2013 was rs 101223.95 crore compared to rs 84083.74 crore during the same period last year registering an increase of 20.38 per cent . a press release issued by the ministry of railways said the total goods earnings have gone up from rs 56163.30 crore during 1st april 2011 31st january 2012 to rs 70067.36 crore during 1st april 2012 31st january 2013 registering an increase of 24.76 per cent . the total passenger revenue earnings during 1st april 2012 31st january 2013 was rs 25924.29 crore compared to rs. 23344.42 crore during the same period last year registering an increase of 11.05 the revenue ear