## Some URLs with errors can be further expanded

```
# Keep only rows whose last tab-separated field is exactly 3, then take the first field (the URL).
# The leading \t anchors the match to the whole field, so a line that merely ends in the digit 3 is not selected.
grep $'\t3$' url_expanded.full.txt | cut -f1 > error_urls.1.txt
# Run the expansion script again on just those URLs.
python download_expanded.py --input error_urls.1.txt --output url_expanded.error.1.txt --jobs 10 --batches 3
```

In [1]:
import pandas as pd

## Merge expanded URLs with URL categories

In [2]:
# Read the tab-separated expansion results; the file has no header row,
# so the columns are numbered 0..2 until they are named in a later cell.
full_path = "url_expanded.full.txt"
df = pd.read_csv(full_path, header=None, sep="\t")
df.shape

(97512, 3)

In [3]:
# Peek at the first five rows of the raw table.
df.head(n=5)

Unnamed: 0,0,1,2
0,http://www.investmentnews.com/article/20160801...,http://www.investmentnews.com/article/20160801...,0
1,http://ow.ly/3avNPe,https://www.reddit.com/r/cahideas/comments/42i...,0
2,http://stratcom.kma-assc.com/uncategorized/pre...,http://stratcom.kma-assc.com/uncategorized/pre...,3
3,http://ln.is/mabelsaveforschool.com/gbEtv,http://linkis.com/mabelsaveforschool.com/gbEtv,0
4,http://kiw.im/16LfJirkfzE,https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927,0


In [4]:
# Give the three positional columns descriptive names, then preview the result.
column_names = ["URL", "EXPANDED", "EXPANDED_STATUS"]
df.columns = column_names
df.head(n=5)

Unnamed: 0,URL,EXPANDED,EXPANDED_STATUS
0,http://www.investmentnews.com/article/20160801...,http://www.investmentnews.com/article/20160801...,0
1,http://ow.ly/3avNPe,https://www.reddit.com/r/cahideas/comments/42i...,0
2,http://stratcom.kma-assc.com/uncategorized/pre...,http://stratcom.kma-assc.com/uncategorized/pre...,3
3,http://ln.is/mabelsaveforschool.com/gbEtv,http://linkis.com/mabelsaveforschool.com/gbEtv,0
4,http://kiw.im/16LfJirkfzE,https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927,0


In [5]:
# How many rows fall under each expansion status code?
df["EXPANDED_STATUS"].value_counts()

0    92362
1     3651
3     1489
2       10
Name: EXPANDED_STATUS, dtype: int64

In [6]:
# Sample of rows with status code 1.
df.loc[df["EXPANDED_STATUS"] == 1].head(n=5)

Unnamed: 0,URL,EXPANDED,EXPANDED_STATUS
14,http://dailydose.topratedviral.com/article/wom...,http://dailydose.topratedviral.com/article/wom...,1
15,http://gvwy.io/v9h3w9l,http://mabelsaveforschool.com/contest-entry,1
23,http://s.einnews.com/tGmrKnfQ1C,http://s.einnews.com/tGmrKnfQ1C,1
30,http://gvwy.io/lygewah,http://mabelsaveforschool.com/contest-entry,1
59,http://gvwy.io/3ogfrpp,http://mabelsaveforschool.com/contest-entry,1


In [7]:
# First 100 rows with status code 3 (the rows selected for the retry run).
df.loc[df["EXPANDED_STATUS"] == 3].head(n=100)

Unnamed: 0,URL,EXPANDED,EXPANDED_STATUS
2,http://stratcom.kma-assc.com/uncategorized/pre...,http://stratcom.kma-assc.com/uncategorized/pre...,3
27,http://dlvr.it/KxCjYs,http://post/142016360553?utm_source=dlvr.it&ut...,3
64,http://soco.space/-m0zuC,http://soco.space/-m0zuC,3
120,http://seusnews.com/?p=1756,http://seusnews.com/?p=1756,3
145,http://deals.buycheap2day.com/US/lndng-st/twt/...,http://deals.buycheap2day.com/US/lndng-st/twt/...,3
240,http://sociably.me/L0pQLX,http://sociably.me/L0pQLX,3
257,http://j.mp/1Zyj2Lp,http://feeds.huffingtonpost.com/c/35496/f/6770...,3
536,https://videotube.livehost.fr/2016/11/28/learn...,https://videotube.livehost.fr/2016/11/28/learn...,3
548,http://dlvr.it/Kv4tfx,http://vulture.feedsportal.com/c/35348/f/66160...,3
560,http://www.fashionisme.us/2013/07/useful-foods...,http://www.fashionisme.us/2013/07/useful-foods...,3


In [8]:
# Count the hosts of the expanded URLs that have status code 3.
# Splitting "scheme://host/path" on "/" leaves the host at position 2. The vectorised
# `.str[2]` accessor replaces `.apply(lambda x: x[2])`: it gives NaN instead of raising
# when a value is missing or has fewer than three parts, and `value_counts()` skips NaN.
(
    df.loc[df["EXPANDED_STATUS"] == 3, "EXPANDED"]
    .str.split("/")
    .str[2]
    .value_counts()
)

reuters.us.feedsportal.com                   143
feeds.huffingtonpost.com                      92
soco.space                                    63
personalhealthdiary.co                        53
www.ynn.io                                    45
rss.feedsportal.com                           30
post                                          19
l.herald.ly                                   17
www.trendgizmo.com                            17
cnet.com.feedsportal.com                      16
zerohedge.feedsportal.com                     15
pumpkin-dukan-diet.7legend.net                15
zdnet.com.feedsportal.com                     15
healthlogics.press                            15
nydailynews.com.feedsportal.com               14
appleinsider.com.feedsportal.com              14
dailyeeuu.tusueldo.com                        13
politics.tusueldo.com                         13
telegraph.feedsportal.com                     13
www.techwens.com                              12
advertising-educatio

In [9]:
# Show status-3 rows whose expanded URL host is exactly "www.huffingtonpost.com".
# The host is element 2 of the URL split on "/"; `.str[2]` gives NaN (which compares
# unequal to the host string) instead of raising for missing or too-short values.
expanded_host = df["EXPANDED"].str.split("/").str[2]
df[(df["EXPANDED_STATUS"] == 3) & (expanded_host == "www.huffingtonpost.com")].head()

Unnamed: 0,URL,EXPANDED,EXPANDED_STATUS


In [10]:
# Read the output of the retry run: a headerless, tab-separated file.
err_path = "url_expanded.error.1.txt"
df_err = pd.read_csv(err_path, header=None, sep="\t")
df_err.shape

(1489, 3)

In [11]:
# Name the retry table's columns the same way as the full table, then preview it.
err_column_names = ["URL", "EXPANDED", "EXPANDED_STATUS"]
df_err.columns = err_column_names
df_err.head(n=5)

Unnamed: 0,URL,EXPANDED,EXPANDED_STATUS
0,http://ift.tt/1mBLaPF,http://reuters.us.feedsportal.com/c/35217/f/65...,3
1,http://logs.wsj.com/pharmalot/2015/06/08/merck...,http://logs.wsj.com/pharmalot/2015/06/08/merck...,3
2,http://bit.ly/1oRL1bE,http://rss.feedsportal.com/c/34793/f/641580/s/...,3
3,http://stratcom.kma-assc.com/uncategorized/pre...,http://stratcom.kma-assc.com/uncategorized/pre...,3
4,http://americagunban.com/moscow-says-usa-actio...,http://americagunban.com/moscow-says-usa-actio...,0


In [12]:
# Status code distribution after the retry run.
df_err["EXPANDED_STATUS"].value_counts()

3    1396
0      71
1      22
Name: EXPANDED_STATUS, dtype: int64

In [13]:
# Count the hosts of the retried URLs that still have status code 3.
# The host is element 2 of the URL split on "/". The vectorised `.str[2]` accessor
# replaces `.apply(lambda x: x[2])`: it gives NaN instead of raising when a value is
# missing or has fewer than three parts, and `value_counts()` skips NaN.
(
    df_err.loc[df_err["EXPANDED_STATUS"] == 3, "EXPANDED"]
    .str.split("/")
    .str[2]
    .value_counts()
)

reuters.us.feedsportal.com                   143
feeds.huffingtonpost.com                      92
soco.space                                    63
personalhealthdiary.co                        53
www.ynn.io                                    45
rss.feedsportal.com                           30
l.herald.ly                                   17
www.trendgizmo.com                            17
cnet.com.feedsportal.com                      16
zdnet.com.feedsportal.com                     15
pumpkin-dukan-diet.7legend.net                15
zerohedge.feedsportal.com                     15
appleinsider.com.feedsportal.com              14
nydailynews.com.feedsportal.com               14
politics.tusueldo.com                         13
telegraph.feedsportal.com                     13
dailyeeuu.tusueldo.com                        13
advertising-education.live-newsx.com          12
master-of-education.goitstar.com              12
www.techwens.com                              12
www.youthsnews.com  

In [14]:
# Index both tables by URL so the retried rows can be matched to the full table by label.
# The guards keep this cell re-runnable: once "URL" is the index it is no longer a
# column, so an unguarded second `set_index("URL")` would raise KeyError.
if df.index.name != "URL":
    df = df.set_index("URL")
if df_err.index.name != "URL":
    df_err = df_err.set_index("URL")
df.shape, df_err.shape

((97512, 2), (1489, 2))

In [15]:
# Preview the full table now that it is indexed by URL.
df.head(n=5)

Unnamed: 0_level_0,EXPANDED,EXPANDED_STATUS
URL,Unnamed: 1_level_1,Unnamed: 2_level_1
http://www.investmentnews.com/article/20160801/FREE/160809992/if-history-is-a-guide-market-volatility-is-about-to-spike,http://www.investmentnews.com/article/20160801...,0
http://ow.ly/3avNPe,https://www.reddit.com/r/cahideas/comments/42i...,0
http://stratcom.kma-assc.com/uncategorized/press-releases-visit-of-republic-of-korea-r-o-k-deputy-national-security-advisor-cho-tae-yong/,http://stratcom.kma-assc.com/uncategorized/pre...,3
http://ln.is/mabelsaveforschool.com/gbEtv,http://linkis.com/mabelsaveforschool.com/gbEtv,0
http://kiw.im/16LfJirkfzE,https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927,0


In [16]:
# Overwrite the rows of the full table for the retried URLs with the retry results.
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` is the label-based
# replacement, and the assignment is matched on the URL index and column names.
retry_cols = ["EXPANDED", "EXPANDED_STATUS"]
df.loc[df_err.index, retry_cols] = df_err[retry_cols]

In [17]:
# Sanity check: the status distribution of the retried URLs inside the full table
# should now equal the distribution in the retry table.
# A single `.loc[rows, column]` replaces the removed `.ix` indexer and the chained lookup.
df.loc[df_err.index, "EXPANDED_STATUS"].value_counts()

3    1396
0      71
1      22
Name: EXPANDED_STATUS, dtype: int64

In [18]:
df.to_csv("url_expanded.merged.txt", sep="\t")
! head url_expanded.merged.txt

URL	EXPANDED	EXPANDED_STATUS
http://www.investmentnews.com/article/20160801/FREE/160809992/if-history-is-a-guide-market-volatility-is-about-to-spike	http://www.investmentnews.com/article/20160801/FREE/160809992/if-history-is-a-guide-market-volatility-is-about-to-spike	0
http://ow.ly/3avNPe	https://www.reddit.com/r/cahideas/comments/42i3ew/w_farting_mid_rimjob/	0
http://stratcom.kma-assc.com/uncategorized/press-releases-visit-of-republic-of-korea-r-o-k-deputy-national-security-advisor-cho-tae-yong/	http://stratcom.kma-assc.com/uncategorized/press-releases-visit-of-republic-of-korea-r-o-k-deputy-national-security-advisor-cho-tae-yong/	3
http://ln.is/mabelsaveforschool.com/gbEtv	http://linkis.com/mabelsaveforschool.com/gbEtv	0
http://kiw.im/16LfJirkfzE	https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927	0
http://fb.me/241s7UtEJ	https://www.facebook.com/story.php?story_fbid=1251035921618693&id=100001368900242	0
http://owl.li/XkyUO	https://www.youtube.com/watch?v=xtspq5T7B44&feature=em-