# Cleaning data

In [1]:
import pandas as pd
import pickle

In [53]:
# Open a client for the MongoDB server that the cells below query.
# NOTE(review): the hostname looks redacted ('##' placeholders) — substitute the real host before running.
from pymongo import MongoClient

MONGO_HOST = 'ec2-##-##-###-##.compute-1.amazonaws.com'
MONGO_PORT = 27017
client = MongoClient(MONGO_HOST, MONGO_PORT)

In [74]:
# Sanity check: number of documents in the `aws_0` collection of the `idetect` database.
# NOTE(review): `Collection.count()` is reported as deprecated in newer PyMongo releases —
# confirm against the installed version before re-running.
client.idetect.aws_0.count()

99292

In [75]:
# Create dataframe: `find()` with no filter returns every document in the collection;
# the cursor is materialised into a list and each document becomes one DataFrame row.
df = pd.DataFrame(list(client.idetect.aws_0.find()))

In [76]:
# (rows, columns) of the freshly loaded frame; the row count should match the collection count above.
df.shape

(99292, 9)

In [77]:
# Checkpoint the raw, unfiltered frame to disk so it can be reloaded without querying Mongo again.
# NOTE(review): the last cell of this notebook writes `final_0` to the same '0.pkl' path, so this
# raw snapshot is overwritten once the notebook runs to completion — confirm that is intended.
with open('0.pkl', "wb") as picklefile:
    pickle.dump(df, picklefile)

In [78]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,_id,authors,date,image,keywords,summary,text,url,valid
0,58c94fc7d897da23f9f5a246,[],NaT,,"[somalilandpress, meant, appear, javascript, r...",In order for you to see this page as it is mea...,Oops! It appears that you have disabled your J...,http://www.somalilandpress.com/iom-supports-dr...,True
1,58c94fc9d897da23f9f5a247,[],NaT,,"[requested, updated, return, unavailable, nort...",You may have followed an outdated link or have...,The page you requested is currently unavailabl...,http://www.9and10news.com/story/33699640/syria...,True
2,58c94fcad897da23f9f5a248,[],2016-12-19 19:32:57,https://c.tribune.com.pk/2016/12/1268397-saudi...,"[defence, bombs, sold, coalition, british, war...","“However, Saudi Arabia has now confirmed it wi...","The war has killed more than 10,000 people and...",http://tribune.com.pk/story/1268397/saudi-led-...,True
3,58c94fd4d897da23f9f5a249,[Reuters Staff Writers News Corp Australia Net...,2016-12-20 02:51:00,http://cdn.newsapi.com.au/image/v1/0152297c79d...,"[homes, oil, followed, quake, deadly, coast, e...",That quake flattened homes and buildings up an...,A 5.7 MAGNITUDE earthquake followed by 37 afte...,http://www.couriermail.com.au/news/world/ecuad...,True
4,58c94fd4d897da23f9f5a24a,[],NaT,,[file],,,http://journalstar.com/news/world/un-approves-...,True


In [80]:
# Per-column count of missing values.
df.isnull().sum()

_id             0
authors         0
date        62589
image           0
keywords        0
summary         0
text            0
url             0
valid           0
dtype: int64

In [81]:
# Despite the column name, this *encodes* each article text to ASCII; characters that
# cannot be represented are silently dropped ("ignore").
# NOTE(review): on Python 3 `.encode` yields bytes rather than str — the recorded `str`
# output below suggests this was run on Python 2; confirm before porting.
def _ascii_only(value):
    return value.encode("ascii", "ignore")

df['decode_text'] = df['text'].apply(_ascii_only)

In [82]:
# Spot-check the Python type of one encoded value (positional row 1).
type(df.iloc[1]['decode_text'])

str

In [83]:
# Keep only the rows whose ASCII-encoded text is not the empty string.
df2 = df.loc[df['decode_text'] != '']

In [84]:
# (rows, columns) remaining after dropping rows with empty text.
df2.shape

(84685, 10)

In [85]:
# Preview the filtered frame, including the new `decode_text` column.
df2.head()

Unnamed: 0,_id,authors,date,image,keywords,summary,text,url,valid,decode_text
0,58c94fc7d897da23f9f5a246,[],NaT,,"[somalilandpress, meant, appear, javascript, r...",In order for you to see this page as it is mea...,Oops! It appears that you have disabled your J...,http://www.somalilandpress.com/iom-supports-dr...,True,Oops! It appears that you have disabled your J...
1,58c94fc9d897da23f9f5a247,[],NaT,,"[requested, updated, return, unavailable, nort...",You may have followed an outdated link or have...,The page you requested is currently unavailabl...,http://www.9and10news.com/story/33699640/syria...,True,The page you requested is currently unavailabl...
2,58c94fcad897da23f9f5a248,[],2016-12-19 19:32:57,https://c.tribune.com.pk/2016/12/1268397-saudi...,"[defence, bombs, sold, coalition, british, war...","“However, Saudi Arabia has now confirmed it wi...","The war has killed more than 10,000 people and...",http://tribune.com.pk/story/1268397/saudi-led-...,True,"The war has killed more than 10,000 people and..."
3,58c94fd4d897da23f9f5a249,[Reuters Staff Writers News Corp Australia Net...,2016-12-20 02:51:00,http://cdn.newsapi.com.au/image/v1/0152297c79d...,"[homes, oil, followed, quake, deadly, coast, e...",That quake flattened homes and buildings up an...,A 5.7 MAGNITUDE earthquake followed by 37 afte...,http://www.couriermail.com.au/news/world/ecuad...,True,A 5.7 MAGNITUDE earthquake followed by 37 afte...
6,58c94fd6d897da23f9f5a24c,[],NaT,http://ftpcontent.worldnow.com/wncustom/custom...,"[requested, updated, removed, return, unavaila...",You may have followed an outdated link or have...,The page you requested is currently unavailabl...,http://www.nbc-2.com/story/32756524/the-latest...,True,The page you requested is currently unavailabl...


In [86]:
def server_error(text):
    """Label a page body as an error placeholder or as usable content.

    Returns "error" when `text` begins with one of the known error-page
    openings ("Server Error", "404", "Not Found", "E-mail address"),
    otherwise "all good".
    """
    error_prefixes = ("Server Error", "404", "Not Found", "E-mail address")
    return "error" if text.startswith(error_prefixes) else "all good"

In [1]:
# Tag every row as "error" or "all good" via `server_error`.
# `df2` was produced by boolean-indexing `df`, so writing a column into it in place triggers
# pandas' SettingWithCopyWarning; `assign` returns a new frame with the extra column instead,
# which also makes this cell safe to re-run.
df2 = df2.assign(error=df2['decode_text'].apply(server_error))

In [88]:
# Count how many rows were flagged as error pages versus usable text
# (the actual filtering happens in the next cell).
df2.error.value_counts()

all good    81358
error        3327
Name: error, dtype: int64

In [90]:
# Drop the rows flagged as error pages; keep only those labelled 'all good'.
df3 = df2.loc[df2['error'] == 'all good']

In [91]:
# (rows, columns) after removing error pages.
df3.shape

(81358, 11)

## Merge tags

In [92]:
# Load the input URL list. `DataFrame.from_csv` is deprecated and was removed in pandas 1.0;
# `read_csv` with `index_col=0, parse_dates=True` reproduces its defaults (first column as the
# index, with an attempt to parse that index as dates).
bigdata = pd.read_csv("idmc_uniteideas_input_url.csv", index_col=0, parse_dates=True)

In [93]:
# Align column names with the scraped frame so the two can be joined on `url`.
bigdata = bigdata.rename(columns={'DocumentIdentifier': 'url', 'DATE': "date"})

In [94]:
# Preview the renamed URL list.
bigdata.head()

Unnamed: 0_level_0,date,url
GKGRECORDID,Unnamed: 1_level_1,Unnamed: 2_level_1
20160512000000-2927,20160512000000,http://www.somalilandpress.com/iom-supports-dr...
20161114080000-854,20161114080000,http://www.9and10news.com/story/33699640/syria...
20161219200000-704,20161219200000,http://tribune.com.pk/story/1268397/saudi-led-...
20161219200000-177,20161219200000,http://www.tv360nigeria.com/nigerias-humanitar...
20161219200000-900,20161219200000,http://www.couriermail.com.au/news/world/ecuad...


In [95]:
# Inner-join the cleaned articles with the URL list on `url`; only URLs present in both survive.
both_df = df3.merge(bigdata, on='url', how='inner')

In [96]:
# (rows, columns) of the merged frame; compare the row count with `df3.shape` to see whether the join dropped or duplicated rows.
both_df.shape

(81358, 12)

## Detect and filter by language

In [97]:
from langdetect import detect

In [98]:
# Try the detector on the first row's text before applying it to the whole column.
detect(both_df.iloc[0]['decode_text'])

u'en'

In [99]:
def language_detect(x):
    """Return the language code `detect` reports for `x`, or "Dunno" on failure.

    Any ordinary exception raised by `detect` is converted into the sentinel
    string "Dunno", so applying this function over a whole column never aborts
    because of a single problematic row.
    """
    try:
        return detect(x)
    except Exception:
        # Deliberately broad to keep the best-effort behaviour, but unlike a bare
        # `except:` this lets KeyboardInterrupt/SystemExit propagate, so a long
        # `.apply` run can still be interrupted.
        return "Dunno"

In [100]:
# langdetect is non-deterministic by default, so short or ambiguous texts can be labelled
# differently from run to run. Pinning the factory seed makes the language column (and the
# counts below) reproducible across kernel restarts.
from langdetect import DetectorFactory

DetectorFactory.seed = 0
both_df['language'] = both_df['decode_text'].apply(language_detect)

In [101]:
# Distribution of detected languages; "Dunno" marks rows where detection raised an exception.
both_df.language.value_counts()

en       81140
fr         120
af          19
id          19
de          19
Dunno       18
es           5
ca           4
nl           4
it           3
da           2
cy           1
pl           1
tl           1
so           1
no           1
Name: language, dtype: int64

In [102]:
# Keep only the rows whose detected language is English.
final_0 = both_df.loc[both_df['language'] == 'en']

In [103]:
# (rows, columns) of the final English-only frame.
final_0.shape

(81140, 13)

In [105]:
# Persist the cleaned, English-only frame to '0.pkl' (this replaces whatever was stored there before).
with open('0.pkl', 'wb') as out_file:
    pickle.dump(final_0, out_file)
    