# Extracting distinct urls from raw text

*This example shows how to use `ural` to extract distinct, normalized URLs from raw text or HTML stored in a column of a CSV file.*

In [1]:
# Append the parent directory to Python's module search path so that modules
# living one level above this notebook can be imported. Safe to skip when reading.
import sys

sys.path.append('..')

In [8]:
import pandas as pd
from ural import (
    urls_from_text,
    urls_from_html,
    normalize_url
)

In [3]:
# Read the sample comments dataset shipped alongside this notebook,
# then preview its first rows.
comments_csv_path = './data/comments.csv'
df = pd.read_csv(comments_csv_path)
df.head()

Unnamed: 0,user,comment_text,comment_html
0,George,This is the media I most read: http://www.lemo...,"<p>This is the media I most read: <a href=""htt..."
1,Judy,I don't care about anything else than http://l...,<p>I don't care about anything else than <a hr...
2,Philip,Whatever...,<p>Whatever...</p>
3,Albert,You should check out https://www.lefigaro.fr a...,"<p>You should check out <a href=""https://www.l..."
4,Maria,I think http://lemonde.fr is better actually :3.,"<p>I think <a href=""http://lemonde.fr"">http://..."


In [4]:
# Print every comment as "<user>: <text>", each followed by an empty line
for user, comment in zip(df['user'], df['comment_text']):
    print("%s: %s" % (user, comment), end="\n\n")

George: This is the media I most read: http://www.lemonde.fr. It is accurate enough.

Judy: I don't care about anything else than http://lefigaro.fr?utm_campaign=4. What do you make of it? https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&cad=rja&uact=8&ved=2ahUKEwiCjrb7hpb2AhVOx4UKHXGeBS8QFnoECAwQAQ&url=https%3A%2F%2Fwww.lemonde.fr%2F&usg=AOvVaw1mI-cPIQr3docEINuU9JU2 seems better.

Philip: Whatever...

Albert: You should check out https://www.lefigaro.fr at all cost.

Maria: I think http://lemonde.fr is better actually :3.



In [9]:
# Find the urls mentioned in each comment's plain text with ural.
# NOTE: `urls_from_text` returns an iterator, so it is wrapped in `list(...)`
# to materialize the urls of each comment before storing them in the frame.
text_url_lists = df['comment_text'].apply(lambda text: list(urls_from_text(text)))
df['urls'] = text_url_lists

# Flatten to one row per url; comments without any url are dropped
exploded_urls = df['urls'].explode()
urls = exploded_urls.dropna()
urls

0                                http://www.lemonde.fr
1                    http://lefigaro.fr?utm_campaign=4
1    https://www.google.com/url?sa=t&rct=j&q=&esrc=...
3                              https://www.lefigaro.fr
4                                    http://lemonde.fr
Name: urls, dtype: object

In [10]:
# When the original HTML of the comments is available, the urls can be
# extracted from the markup instead of from the plain text.
# This replaces the `urls` column computed from `comment_text`.
html_url_lists = df['comment_html'].apply(lambda html: list(urls_from_html(html)))
df['urls'] = html_url_lists

# Flatten to one row per url; comments without any url are dropped
exploded_urls = df['urls'].explode()
urls = exploded_urls.dropna()
urls

0                                http://www.lemonde.fr
1                    http://lefigaro.fr?utm_campaign=4
1    https://www.google.com/url?sa=t&rct=j&q=&esrc=...
3                              https://www.lefigaro.fr
4                                    http://lemonde.fr
Name: urls, dtype: object

In [12]:
# Count how many times each url appears, exactly as written in the comments
raw_url_counts = urls.value_counts()
raw_url_counts

http://www.lemonde.fr                                                                                                                                                                                 1
https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&cad=rja&uact=8&ved=2ahUKEwiCjrb7hpb2AhVOx4UKHXGeBS8QFnoECAwQAQ&url=https%3A%2F%2Fwww.lemonde.fr%2F&usg=AOvVaw1mI-cPIQr3docEINuU9JU2    1
https://www.lefigaro.fr                                                                                                                                                                               1
http://lemonde.fr                                                                                                                                                                                     1
http://lefigaro.fr?utm_campaign=4                                                                                                                                                                     1


In [14]:
# The raw counts above treat every spelling of a url as a different value.
# Normalizing the urls first lets variants of the same address be counted together,
# which is what you want for accurate stats.
normalized_urls = urls.apply(normalize_url)
normalized_urls.value_counts()

lemonde.fr     3
lefigaro.fr    2
Name: urls, dtype: int64