In [1]:
import pandas as pd
import re

# Read the Hacker News export once; later cells work off `hn` or `titles`.
DATA_PATH = "hacker_news.csv"
hn = pd.read_csv(DATA_PATH)

# Keep the title column handy as a Series for the regex exercises.
titles = hn["title"]

In [2]:

# Count the titles that mention "sql" anywhere, regardless of letter case.
pattern = r"sql"
mentions_sql = titles.str.contains(pattern, flags=re.I)
sql_counts = mentions_sql.sum()

In [3]:
# Build hn_sql: only the rows whose title mentions a SQL "flavor", defined as
# "SQL" preceded by one or more word characters (e.g. MySQL, PostgreSQL),
# ignoring all case variation. The pattern is defined once and reused for both
# the filter and the extraction so the two can never drift apart.
sql_flavor = r"\w+SQL"

# na=False treats a missing title as "no match" so the boolean mask stays valid.
# .copy() yields an independent frame, so adding a column below does not touch hn.
hn_sql = hn[hn["title"].str.contains(sql_flavor, flags=re.I, na=False)].copy()

# New column "flavor": the extracted flavor mention, lowercased so that case
# variants of the same flavor are aggregated together.
# expand=False makes the single capture group come back as a Series rather
# than a one-column DataFrame.
hn_sql["flavor"] = (
    hn_sql["title"]
    .str.extract(f"({sql_flavor})", flags=re.I, expand=False)
    .str.lower()
)

# Pivot table indexed by flavor; values are the mean of num_comments per flavor.
sql_pivot = hn_sql.pivot_table(index="flavor", values="num_comments", aggfunc="mean")

In [5]:
# Capture the version number that follows "Python " in a title
# (e.g. "Python 3.5.0" -> "3.5.0"). A version is digits optionally followed by
# dot-separated digit groups, so a sentence-ending dot ("Python 3.") is not
# swallowed into the version the way the looser class [\d\.]+ would allow.
# Letter case is handled by re.I, so no [Pp] class is needed.
pattern = r"python (\d+(?:\.\d+)*)"

# expand=False returns a Series when the pattern has exactly one capture group.
# It is passed explicitly because the default differs between pandas versions.
py_versions = titles.str.extract(pattern, flags=re.I, expand=False)

# Frequency table: version string -> number of titles mentioning it.
py_versions_freq = dict(py_versions.value_counts())

In [8]:
def first_10_matches(pattern, source=None):
    """
    Return the first 10 story titles that match
    the provided regular expression.

    Parameters
    ----------
    pattern : str
        Regular expression handed to ``Series.str.contains``.
    source : pandas.Series, optional
        Strings to search. Defaults to the notebook-level ``titles`` Series.

    Returns
    -------
    pandas.Series
        At most 10 matching entries, keeping their original order and index.
    """
    if source is None:
        source = titles
    # na=False: a missing title counts as "no match" instead of putting NaN
    # into the mask, which boolean indexing would reject.
    is_match = source.str.contains(pattern, na=False)
    return source[is_match].head(10)

# Use a negated character set after the word-bounded C/c to reject matches that
# are followed by the + character or the . character (cases C++ or C.E.O.).
# Note: the set has to consume one character, so a C at the very end of a
# title is not matched by this pattern.
pattern = r"\b[Cc]\b[^.+]"
first_ten = first_10_matches(pattern)

In [9]:
### Using Lookarounds

test_cases = ['Red_Green_Blue',
              'Yellow_Green_Red',
              'Red_Green_Red',
              'Yellow_Green_Blue',
              'Green']


def run_test_cases(pattern, cases=None):
    """
    Print the first match of `pattern` in each test string, or "NO MATCH".

    Parameters
    ----------
    pattern : str
        Regular expression passed to ``re.search``.
    cases : iterable of str, optional
        Strings to search. Defaults to the notebook-level ``test_cases`` list.
    """
    if cases is None:
        cases = test_cases
    for tc in cases:
        result = re.search(pattern, tc)
        # re.search returns None when nothing matches, so `or` supplies the fallback.
        print(result or "NO MATCH")


# Positive lookahead: match "Green" only when it is immediately followed by
# "_Blue"; the lookahead text itself is not part of the match.
run_test_cases(r"Green(?=_Blue)")

<re.Match object; span=(4, 9), match='Green'>
NO MATCH
NO MATCH
<re.Match object; span=(7, 12), match='Green'>
NO MATCH


In [10]:
# Count the titles that mention the C language. Pattern pieces:
#   \b[Cc]\b       a lone "C" or "c" with a word boundary on each side
#   (?![\+\.])     negative lookahead: not followed by "+" or "."; a lookahead
#                  consumes nothing, so a C at the end of the title still matches
#   (?<!Series\s)  negative lookbehind: not immediately preceded by "Series"
#                  plus one whitespace character
pattern = r"(?<!Series\s)\b[Cc]\b(?![\+\.])"

is_c_mention = titles.str.contains(pattern)
c_mentions = is_c_mention.sum()

In [12]:
# Identify story titles that contain a repeated word.
#   \b(\w+)   capture a run of word characters starting at a word boundary
#   \s\1\b    one whitespace character, then the captured text again
#             (backreference), ending at a word boundary
pattern = r"\b(\w+)\s\1\b"

# na=False: a missing title counts as "no repeat", keeping the mask strictly
# boolean so it can be used for indexing.
repeated_words = titles[titles.str.contains(pattern, na=False)]

In [13]:
email_variations = pd.Series(['email', 'Email', 'e Mail',
                        'e mail', 'E-mail', 'e-mail',
                        'eMail', 'E-Mail', 'EMAIL'])

# "e", an optional hyphen or whitespace character, then "mail";
# letter case is handled by the re.I flag.
pattern = r"e[\-\s]?mail"

# regex=True must be explicit: since pandas 2.0 Series.str.replace defaults to
# regex=False, which treats the pattern as a literal string, so the variations
# would not be normalized.
email_uniform = email_variations.str.replace(pattern, "email", flags=re.I, regex=True)

# Normalize email spellings in the story titles. regex=True is explicit because
# Series.str.replace defaults to literal (non-regex) matching since pandas 2.0.
titles_clean = titles.str.replace(pattern, "email", flags=re.I, regex=True)

In [14]:
test_urls = pd.Series([
 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429',
 'http://www.interactivedynamicvideo.com/',
 'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0',
 'http://evonomics.com/advertising-cannot-maintain-internet-heres-solution/',
 'HTTPS://github.com/keppel/pinn',
 'Http://phys.org/news/2015-09-scale-solar-youve.html',
 'https://iot.seeed.cc',
 'http://www.bfilipek.com/2016/04/custom-deleters-for-c-smart-pointers.html',
 'http://beta.crowdfireapp.com/?beta=agnipath',
 'https://www.valid.ly?param'
])

# Capture the domain that follows "http://" or "https://".
# The character class includes "-" so hyphenated hosts such as
# "my-site.example.com" are captured whole instead of being cut at the hyphen.
pattern = r"https?://([\w\.\-]+)"

# expand=False: a single capture group comes back as a Series.
test_urls_clean = test_urls.str.extract(pattern, flags=re.I, expand=False)
# Pull the domain out of every submission URL, then keep the 20 most frequent.
url_column = hn["url"]
domains = url_column.str.extract(pattern, flags=re.I, expand=False)
domain_counts = domains.value_counts()
top_domains = domain_counts.head(20)

In [16]:
# Capture three groups:  protocol://domain/path
#   (.+?)        protocol; lazy, so it stops at the FIRST "://" even when the
#                URL embeds another URL further along
#   ([\w\.\-]+)  domain; word characters, dots and hyphens
#   /?(.*)       optional slash, then everything else as the path (may be empty)
pattern = r"(.+?)://([\w\.\-]+)/?(.*)"

# With several capture groups str.extract returns a DataFrame, one column per group.
test_url_parts = test_urls.str.extract(pattern, flags=re.I, expand=False)

url_parts = hn['url'].str.extract(pattern, flags=re.I, expand=False)

In [17]:
# Using named capture groups: each (?P<name>...) group becomes a column with
# that name in the extracted DataFrame.
#   protocol  lazy (.+?), so it stops at the FIRST "://" even when the URL
#             embeds another URL further along
#   domain    word characters, dots and hyphens
#   path      whatever follows the optional slash (may be empty)
pattern = r"(?P<protocol>.+?)://(?P<domain>[\w\.\-]+)/?(?P<path>.*)"


url_parts = hn['url'].str.extract(pattern, flags=re.I, expand=False)

print(url_parts)

      protocol                           domain  \
0         http  www.interactivedynamicvideo.com   
1         http                   hueniverse.com   
2         http                  www.thewire.com   
3        https                   www.amazon.com   
4         http                  www.nytimes.com   
...        ...                              ...   
20095    https                          puri.sm   
20096    https                       medium.com   
20097     http               blog.darknedgy.net   
20098    https                       medium.com   
20099    https                       github.com   

                                                    path  
0                                                         
1      2016/01/26/how-to-use-open-source-and-shut-the...  
2      entertainment/2013/04/florida-djs-april-fools-...  
3      Technology-Ventures-Enterprise-Thomas-Byers/dp...  
4                    2007/11/07/movies/07stein.html?_r=0  
...                              