In [1]:
import os
import re
import codecs
import pathlib

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import nltk

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer

### Preprocessing 'source' file

In [2]:
# read in sources data, data is separated by 1 or 2 tabs. 
sources = pd.read_csv("data/raw/sources.txt", 
                      sep="\t{1,2}", encoding="ISO-8859-1", skiprows=2, engine='python')
# rename columns
sources.columns = ["id", "n_words", "date", "country", "website", "url", "title"]

# date column -> pandas.DateTime"
sources["date"] = pd.to_datetime(sources["date"], format="%y-%m-%d")

pd.set_option('display.max_colwidth', 40 )
# print(sources.shape)
# sources.head()

### Preprocessing 'text' file

In [3]:
# import a clean function data
def clean_text(text):
    return text.strip()

In [4]:
# read in text data
# find lines that start with @@, extract id and text from them
with open("data/raw/text.txt", "r") as f:
    text = pd.DataFrame(
        [re.search("(\d+)\s(.*)", l[2:]).groups() for l in f.readlines() if l.startswith("@@")],
        columns=["id", "text"]     )
    f.close()

# id should be an integer
text["id"] = text["id"].astype(int)
text['text'] = text.text.str.lower()
text["text"] = text["text"].apply(clean_text)

# print(text.shape)
# text.head()

### Combining 'source' and 'text' file

In [5]:
# combine the data
reports = sources.merge(text, on="id", how="outer")
reports["year"] = reports["date"].dt.strftime("%Y")
reports = reports[reports['text'].notna()] # remove NaN values

### further preprocessing text fil on the resulting reports dataframe

In [6]:
# Puncutation preprocesing
reports['text'] = reports.text.str.replace('{', '')
reports['text'] = reports.text.str.replace('}', '')
reports['text'] = reports.text.str.replace("\n", '')
reports['text'] = reports.text.str.rstrip("\n") #remove empty lines
reports['text'] = reports.text.str.replace("@ @ @ @ @ @ @ @ @ @ ", '')
reports['text'] = reports.text.str.replace(" @", '')
reports['text'] = reports.text.str.replace(" '", "'")
reports['text'] = reports.text.str.replace("\"", "")
reports['text'] = reports.text.str.replace(",", "")
reports['text'] = reports.text.str.replace("(", "")
reports['text'] = reports.text.str.replace(")", "")
reports['text'] = reports.text.str.replace(" <p>", ".")
reports['text'] = reports.text.str.replace(" <h>", ".")
reports['text'] = reports.text.str.replace("<p>", "")
reports['text'] = reports.text.str.replace("<h>", "")
reports['text'] = reports.text.str.replace('<', '')
reports['text'] = reports.text.str.replace('>', '')
reports['text'] = reports.text.str.replace(":", "")
reports['text'] = reports.text.str.replace("?", ".")
reports['text'] = reports.text.str.replace("!", ".")
reports['text'] = reports.text.str.replace(r"\.\s[\.\s]+", ". ") #converting . . to .
reports['text'] = reports.text.str.replace(r"\.+", ".") #converting ... to .
reports['text'] = reports.text.str.replace("--", "") 
reports['text'] = reports.text.str.replace("-", " ")
reports['text'] = reports.text.str.replace(" +", " ")
reports['text'] = reports.text.str.replace(" n't", "n't")

### Frequency of news articles sources (i.e. # of papers by news articles)

In [7]:
article_val_cnt = reports['website'].value_counts()
article_df = pd.DataFrame({'article':article_val_cnt.index, 'count':article_val_cnt.values})

display(article_df)

total_len = article_df['count'].sum()
print("total len: ", total_len)

print("percentage of article of 1: ", article_df[article_df['count'] ==1]['count'].sum() / total_len *100)
print("percentage of article of above 1: ", article_df[article_df['count'] >1]['count'].sum() / total_len *100)
print("percentage of article of above 2: ", article_df[article_df['count'] >2]['count'].sum() / total_len *100)
print("percentage of article of above 3: ", article_df[article_df['count'] >3]['count'].sum() / total_len *100)
print("percentage of article of above 4: ", article_df[article_df['count'] >4]['count'].sum() / total_len *100)
print("percentage of article of above 5: ", article_df[article_df['count'] >5]['count'].sum() / total_len *100)


Unnamed: 0,article,count
0,Times of India,91
1,Telegraph.co.uk,52
2,Independent Online,49
3,Daily Mail,44
4,Irish Independent,41
...,...,...
1075,660 News,1
1076,InfoWorld,1
1077,News24 Nigeria,1
1078,Daily Cannon (satire) (blog),1


total len:  2864
percentage of article of 1:  23.743016759776538
percentage of article of above 1:  76.25698324022346
percentage of article of above 2:  64.66480446927375
percentage of article of above 3:  57.437150837988824
percentage of article of above 4:  51.43156424581006
percentage of article of above 5:  46.892458100558656


### Trimming text strings that are not necessary, i.e. useless information

#### Look at 5 gram, trim of senteces that contain certain phrases that appear with  frequency above 10

In [8]:
# temporaray dataframe, which later will be trimmed of removing useless text strings
reports_temp = reports.copy()
reports_temp['text_vect'] = reports_temp.text.apply(sent_tokenize)
reports_temp.head(5)



Unnamed: 0,id,n_words,date,country,website,url,title,text,year,text_vect
0,11241,397.0,2013-01-06,US,Kotaku,http://kotaku.com/5973495/author-of-...,"Author of The Warriors, Cult Film Ad...",sol yurick the writer whose 1965 no...,2013,[ sol yurick the writer whose 1965 n...
1,11242,757.0,2013-01-06,US,Michigan Radio,http://michiganradio.org/post/thats-...,That's What They Say: Dialect Societ...,that's what they say dialect societ...,2013,[ that's what they say dialect socie...
2,11243,755.0,2013-01-06,US,New York Daily News,http://www.nydailynews.com/life-styl...,Best of New York: Croissant,a sublime croissant at french tart ...,2013,[ a sublime croissant at french tart...
3,11244,1677.0,2013-01-06,US,OregonLive.com,http://www.oregonlive.com/performanc...,Reflecting on a quarter-century of g...,reflecting on a quarter century of ...,2013,[ reflecting on a quarter century of...
4,21242,794.0,2013-01-11,US,Ars Technica,http://arstechnica.com/gadgets/2013/...,Ask Ars: Does Facebook auto-delete c...,ask ars does facebook auto delete c...,2013,[ ask ars does facebook auto delete ...


In [9]:
sent_tokenized = reports.text.apply(sent_tokenize)
flatten = [item for sublist in sent_tokenized for item in sublist]

In [10]:
[a for a in flatten if len(a.split()) < 4]

['advertisement.',
 'sponsored.',
 'your two cents.',
 'steven nehl/the oregonian.',
 'in spanish .',
 '35366 .',
 ' published bystanford medicine.',
 ' the bay bridge.',
 'a surfboardetc .',
 'an iphonean opera.',
 'listen.',
 'no .',
 'hidden mural revealed.',
 'more posts about.',
 'read more.',
 'recent posts.',
 'but .',
 'mary patel.',
 'shares.',
 'send a story.',
 'advertising department.',
 'the editor.',
 'alastair machray.',
 'his .',
 'national post.',
 'related.',
 'recommended by colombia.',
 'recommended by colombia.',
 'comments.',
 'share on twitter.',
 'sign in with.',
 'facebookgoogleemail.',
 'read more.',
 'most popular.',
 'cp.',
 'book review.',
 'history.',
 'multimedia.',
 'hacking the brain.',
 'restrictions.',
 ' video.',
 'video.',
 'pharmaceutical companies .',
 'related articles.',
 'share this article.',
 'share.',
 'charlotte squire.',
 'charlotte squire/fairfax nz.',
 'relevant offers.',
 'east coast swamped.',
 'relevant offers.',
 'save &amp; share.',

In [11]:
len(flatten)

87761

In [12]:
[a for a in flatten]

[' sol yurick the writer whose 1965 novel the warriors was adapted into a film 14 years later which then became one of the best adapted works ever in video gaming died this weekend .',
 'he was 88 .',
 "yurick's work itself was a loose adaptation of a story told 2300 years before anabasis which chronicles the journey of greek mercenaries through hostile territory after the death of their leader .",
 "yurick's book and the warriors both open with a grand council of street gangs convened in the bronx and the murder of the leader who called for the gathering cyrus a direct reference to the leader of the greeks in anabasis .",
 'but the stories then diverge significantly .',
 "walter hill the director of the warriors strove to give a comic book depiction of the gang's flight from the bronx back to their coney island turf .",
 "indeed in yurick's book the gang's mascot junior reads a comic book version of each faction was given a name and a costume theme invoking it typified by the iconic b

In [15]:
def g(x):
    print(x)
    if("." in x):
        while("." in x):
            x.remove(".")
    if(" ." in x):
        while(" ." in x):
            x.remove(" .")

In [16]:
pd.set_option('display.max_colwidth', 80)

# article_list = list(article_df[article_df['count'] >0].article.values)
article_list = list(article_df.article.values)
#article_list = ['Times of India']
# article_list = ['Independent Online'] # temporary to check

for article in article_list:
    temp_df = reports_temp[reports_temp['website'] ==article][['id', 'text','text_vect']]
    word_vectorizer = CountVectorizer(ngram_range=(5,5), stop_words=[])
    flatten = [item for sublist in temp_df['text_vect'] for item in sublist]
    
    sparse_matrix = word_vectorizer.fit_transform(flatten)
    
    frequency = sum(sparse_matrix).toarray()[0]
    frequency_df = pd.DataFrame(frequency, index=word_vectorizer.get_feature_names(),columns = ['frequency']).sort_values(by=['frequency'],ascending=False)
    
    freq_above_10 = frequency_df[frequency_df['frequency'] >=10]
    phrase_list = list(freq_above_10.index)
       
    sentences_to_remove = []
    
    for phrase in phrase_list:
        removing_sent = set([sent for sent in flatten if ((phrase in sent) or (len(sent.split()) < 4))])

        for sent in removing_sent:
            sentences_to_remove.append(sent)
    
    if("." in sentences_to_remove):
        while("." in sentences_to_remove):
            sentences_to_remove.remove(".")
    if(" ." in sentences_to_remove):
        while(" ." in sentences_to_remove):
            sentences_to_remove.remove(" .")
        
#    print(sentences_to_remove)
    sentences_to_remove = list(set(sentences_to_remove))
    sentences_to_remove.sort(key= len, reverse = True)  # sort starting by largest sentence, in case smaller sentence get chosen beforehand
    
    for sent in sentences_to_remove:
        reports_temp.text = reports_temp.text.apply(lambda x: str(x).replace(sent, ""))  


 sol yurick the writer whose 1965 novel the warriors was adapted into a film 14 years later which then became one of the best adapted works ever in video gaming died this weekend . he was 88 . yurick's work itself was a loose adaptation of a story told 2300 years before anabasis which chronicles the journey of greek mercenaries through hostile territory after the death of their leader . yurick's book and the warriors both open with a grand council of street gangs convened in the bronx and the murder of the leader who called for the gathering cyrus a direct reference to the leader of the greeks in anabasis . but the stories then diverge significantly . walter hill the director of the warriors strove to give a comic book depiction of the gang's flight from the bronx back to their coney island turf . indeed in yurick's book the gang's mascot junior reads a comic book version of each faction was given a name and a costume theme invoking it typified by the iconic baseball furies the protago

AttributeError: 'str' object has no attribute 'remove'

### Testing whether it removed those sentences correctly

#### Can also refer to 'Preprocess-진우_4.ipynb'

In [None]:
list(reports[reports['website'] =='Times of India'].text.values)



In [None]:
list(reports_temp[reports_temp['website'] =='Times of India'].text.values)



### Updating reports datframe

In [None]:
reports_update = reports_temp.copy()


In [None]:
i = 2

In [None]:

reports[reports['website'] == 'Times of India']['text'].iloc[i]


In [None]:
reports_update[reports_update['website'] == 'Times of India']['text'].iloc[i]

