In [1]:
import os.path
from os import path
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
import re
from collections import defaultdict
from gensim import corpora
from gensim.models import TfidfModel
import nltk
import pickle

# Scrape Datacamp project page
Every time I finished a [DataCamp project](https://www.datacamp.com/projects) I added it to my [DataCamp_projects repo](https://github.com/mrbarkis/DataCamp_projects). Unfortunately, I forgot to add descriptions and keywords, which annoys me to no end: I remember using some technique in one of the projects but can't remember in which one. Luckily, we can now use what DataCamp taught us: 1) we can scrape the descriptions from the DataCamp projects page; 2) we can use NLP techniques to determine the keywords for each project.

## Plan:
    1. Store relevant DataCamp and GitHub pages as local html-files
    2. Find out which projects are listed on GitHub (my completed projects)
    3. Find every project on DataCamp, and collect their descriptions.
    4. Join the data from GitHub and DataCamp
    5. Extract texts, functions, methods, attributes, imports from the notebooks
        5.1. Find urls to the notebooks from GitHub
        5.2. Define helper functions that can parse code, text, etc from the notebooks.
        5.3. Download and parse all the notebooks
    6. Analyze which words, and functions are most descriptive
        6.1. Tokenize the imports and texts, which are still sentences
        6.2. Sort the tokens using the tf-idf, or term frequency–inverse document frequency, metric.
    7. Extract long descriptions from DataCamp
    8. Generate markdown description of each project.
    
If nothing works, you can load the backed-up data
```python
#joined = pd.read_csv("./data/data_backup.csv")
joined = pd.read_csv("./data/raw_data_backup.csv")
```
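and start from section 6. Note that the list-valued columns (`imports`, `attributes`, `methods`, `functions`) are read back from the CSV as plain strings, so convert them back to lists (for example with `ast.literal_eval`) before tokenizing.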

## 1. Store relevant DataCamp and GitHub pages as local html-files

In [2]:
# Urls to scrape:
# `datacamp_url` is still used as the prefix of each project's "datacamp_url" field.
datacamp_url = "https://www.datacamp.com/projects" # Old listing page; can no longer be scraped
dc_json_url = "https://static.datacamp.com/page-data/projects/page-data.json" # The new page is rendered from this json
repo_url = "https://github.com/mrbarkis/DataCamp_projects"

# Local copies of the pages (so that nothing is downloaded more than once)
datacamp_file = "./localdata/datacamp_projects.html"
repo_file = "./localdata/repo.html"
dc_json_file = "./localdata/dc.json"

# Set to True to force a fresh download even if the local file exists
update_datacamp_data = False
update_repo_data = False
# When True, the joined raw data is written to ./data/raw_data_backup.csv
backup = True

def downloadPage(url, save_as, overwrite=False, binary=False):
    """Download `url` and store it at `save_as` unless a local copy exists.

    Parameters
    ----------
    url : str
        Address of the page or file to fetch.
    save_as : str
        Local path the content is written to.
    overwrite : bool, optional
        If True, download even when `save_as` already exists.
    binary : bool, optional
        If True, write the raw response bytes; otherwise write the response
        text decoded as UTF-8.

    Returns
    -------
    None
        Progress is reported with `print`; nothing is returned.
    """
    # Use boolean operators: `~` on a bool is an integer bitwise inversion
    # (~False == -1, ~True == -2) and is deprecated since Python 3.12.
    if path.exists(save_as) and not overwrite:
        print("Using an old file: " + save_as)
        return

    print("Downloading from: " + url)
    r = requests.get(url)
    r.encoding = 'utf-8'

    # Never cache an error page: later runs would silently reuse it.
    if not r.ok:
        print(f"Download failed with HTTP status {r.status_code}: " + url)
        return

    if binary:
        flag = 'wb'
        content = r.content
    else:
        flag = 'w'
        content = r.text

    print("Storing locally to: " + save_as)
    # Create the target folder on first use so a fresh checkout works.
    target_dir = path.dirname(save_as)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    with open(save_as, flag) as file:
        file.write(content)

# Fetch (or reuse) every page the notebook needs: (url, local file, force refresh)
for page_url, page_file, force_refresh in (
        (datacamp_url, datacamp_file, update_datacamp_data),
        (dc_json_url, dc_json_file, update_datacamp_data),
        (repo_url, repo_file, update_repo_data)):
    downloadPage(page_url, page_file, overwrite=force_refresh)


Using an old file: ./localdata/datacamp_projects.html
Using an old file: ./localdata/dc.json
Using an old file: ./localdata/repo.html


## 2. Find out which projects are listed on GitHub (my completed projects)
On my [GitHub repo](https://github.com/mrbarkis/DataCamp_projects), the completed projects are listed as follows
![folder links in github](img/repo.png "GitHub repo")
Here, the folders are links that are defined with the following tags

```html
<a class="js-navigation-open link-gray-dark" title="Introduction to DataCamp Projects" id="81072776ec78e14d9ae418bc284e3ff2-26df00cc7b5540117f4a7a790b33cae72fe611a1" href="/mrbarkis/DataCamp_projects/tree/master/Introduction%20to%20DataCamp%20Projects">Introduction to DataCamp Projects</a>
```

We can use BeautifulSoup to find and process these tags. Let's also build a DataFrame that contains the titles and hrefs for later use.

In [3]:
# Parse the locally stored GitHub repo page.
# The parser is named explicitly so the result does not depend on which
# optional parsers happen to be installed.
with open(repo_file, 'r') as html_file:
    html_repo = BeautifulSoup(html_file, "html.parser")

# Collect one record per folder link and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
completed_records = []
for tag in html_repo.find_all("a", {"class": "js-navigation-open link-gray-dark"}):
    title = tag.text
    link = tag.get('href')
    if title not in ["README.md", ".gitignore"]:
        completed_records.append(
            {"repo_url": "https://github.com" + link,
             "title": title.strip()})

# Explicit columns keep the layout stable even when no link was found.
completed_projects = pd.DataFrame(completed_records, columns=["repo_url", "title"])

completed_projects["language"] = "python" # For now, I have completed only Python projects
completed_projects.head(3)

Unnamed: 0,repo_url,title,language
0,https://github.com/mrbarkis/DataCamp_projects/...,A Network Analysis of Game of Thrones,python
1,https://github.com/mrbarkis/DataCamp_projects/...,A New Era of Data Analysis in Baseball,python
2,https://github.com/mrbarkis/DataCamp_projects/...,A Visual History of Nobel Prize Winners,python


## 3. Find every project on DataCamp, and collect their descriptions
On the [DataCamp projects page](https://www.datacamp.com/projects), the projects are listed as follows

![project wrappers](img/datacamp_projects.png "DataCamp projects")

In the past, each project had a wrapper tag that contained the following 
```html
    <div class="dc-project-block-wrapper">
        <h5 class="dc-project-block__title">Visualizing COVID-19</h5>
        <small class="dc-project-block__description">
            Visualize the rise of COVID-19 cases globally withggplot2. 
        </small>
        <p class="dc-project-block__author-name">Richie Cotton</p>
        <p class="dc-project-block__author-bio">Curriculum Architect at DataCamp</p>
        <img alt="R icon" class="dc-project-block__technology" src="___.svg"/>
        <a class="shim ds-snowplow-link-project-block" href="/projects/870"></a>
    </div>
```
The commented-out code below uses BeautifulSoup to find and process these tags. Unfortunately, the pages have changed, so it no longer works. Currently, the info is stored in a json file,

"https://static.datacamp.com/page-data/projects/page-data.json"

which we can download and read easily.

In [4]:
#with open(datacamp_file, 'r') as html_file:
#    html = BeautifulSoup(html_file)
#
##print("### TYPICAL PROJECT WRAPPER LOOKS LIKE THIS ###")
##print(html.prettify())
#
#project = html.find("div", {"class": "dc-project-block-wrapper"})
#print(project.prettify())
#
#def scrape_info(project):
#    """Scrape relevant info from bs4 tag element, such as
#    project = html.find("div", {"class": "dc-project-block-wrapper"})
#    """
#    language = project.find('img', {"class": "dc-project-block__technology"}).get('alt')[0]
#    title = project.find('h5').text.strip()
#    description = project.find('small').text.strip()
#    author = project.find("p", {"class": "dc-project-block__author-name"}).text.strip()
#   bio = project.find("p", {"class": "dc-project-block__author-bio"}).text.strip()
#    url = ("https://www.datacamp.com"
#                + project.find("a", {"class": "shim ds-snowplow-link-project-block"}).get('href'))
#     return {"language": language,
#             "title": title,
#             "description": description,
#             "datacamp_url": url,
#             "author": author,
#             "bio": bio}


# all_projects = pd.DataFrame()

# for project in html.findAll("div", {"class": "dc-project-block-wrapper"}):
#     all_projects = all_projects.append(scrape_info(project), ignore_index=True)

# #print(projects_df.info())
# all_projects.head(3)

In [5]:
with open(dc_json_file, 'r') as file:
    j = json.load(file)

# Explore the dictionary
#print(j['result']['data'].keys())
#print(j['result']['data']['allProject']['nodes'][0])

# Fields that are copied from the json node without modification
select_as_is = {'title', 'description', 'language'}

# Collect one record per project and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
datacamp_records = []
for p in j['result']['data']['allProject']['nodes']: # p in projects
    info = {key: p[key] for key in p if key in select_as_is}
    # The project number is the last dash-separated part of the node id
    info["datacamp_url"] = datacamp_url+ "/" + p["id"].split('-')[-1]
    # Only the first listed instructor is recorded
    info["author"] = p["instructors"][0]["fullName"]
    info["bio"] = p["instructors"][0]["marketingBiography"]
    datacamp_records.append(info)

all_projects = pd.DataFrame(
    datacamp_records,
    columns=["author", "bio", "datacamp_url", "description", "language", "title"])

all_projects.head(3)

Unnamed: 0,author,bio,datacamp_url,description,language,title
0,Rasmus Bååth,Senior Data Scientist at King (Activision Bliz...,https://www.datacamp.com/projects/33,"If you've never done a DataCamp project, this ...",python,Introduction to DataCamp Projects
1,Rasmus Bååth,Senior Data Scientist at King (Activision Bliz...,https://www.datacamp.com/projects/41,"If you have never done a DataCamp project, thi...",r,Introduction to DataCamp Projects
2,Richie Cotton,Curriculum Architect at DataCamp,https://www.datacamp.com/projects/870,Visualize the rise of COVID-19 cases globally ...,r,Visualizing COVID-19


## 4. Join the data from GitHub and DataCamp
We can now add info from "all_projects" to the "completed_projects" by "left-joining" them together.

In [6]:
# Left join: keep every completed project and attach the DataCamp info that
# matches on both title and language.
joined = pd.merge(completed_projects, all_projects,
                  on=['title', 'language'],
                  how='left')

# There might be missing values if DataCamp has changed their project names
missing = joined[joined.isna().any(axis=1)]
if len(missing) != 0:
    print("Some info is missing!")
    # A bare expression inside an if-block is not rendered by the notebook,
    # so the affected titles have to be displayed explicitly.
    display(missing.title)
else:
    display(joined.head(3))

Unnamed: 0,repo_url,title,language,author,bio,datacamp_url,description
0,https://github.com/mrbarkis/DataCamp_projects/...,A Network Analysis of Game of Thrones,python,Mridul Seth,Data science enthusiast,https://www.datacamp.com/projects/76,Analyze the network of characters in Game of T...
1,https://github.com/mrbarkis/DataCamp_projects/...,A New Era of Data Analysis in Baseball,python,David Venturi,Curriculum Manager at DataCamp,https://www.datacamp.com/projects/250,Use MLB's Statcast data to compare New York Ya...
2,https://github.com/mrbarkis/DataCamp_projects/...,A Visual History of Nobel Prize Winners,python,Rasmus Bååth,Senior Data Scientist at King (Activision Bliz...,https://www.datacamp.com/projects/441,Explore a dataset from Kaggle containing a cen...


## 5. Extract texts, functions, methods, attributes, imports from the notebooks
This step is a bit more complicated. We want to know which words and functions describe each project best. The plan is to:
    1. Find urls to the notebooks from GitHub.
    2. Define helper functions that can parse code, text, etc from the notebooks.
    3. Download and parse all the notebooks
    

### 5.1. Find urls to the notebooks from GitHub.
At each "repo_url", the notebook.ipynb is shown as a link, for example
```html
<a class="js-navigation-open link-gray-dark" title="notebook.ipynb" id="2c23a4371ad3651c48a34c44792ac573-32f3c1c2ba85a153ed852a192a62d11b8d6cd9a9" href="/mrbarkis/DataCamp_projects/blob/master/A%20Network%20Analysis%20of%20Game%20of%20Thrones/notebook.ipynb">notebook.ipynb</a>
```
Here, the href cannot be used for downloading, because it opens the notebook in the browser instead. To get the download link, we can modify the url by removing the "/blob" part and replacing the host "github.com" with "raw.githubusercontent.com".


In [7]:
# Cache of the notebook urls, so GitHub is scraped only once
urls_file = "localdata/urls.p"
if not path.exists(urls_file) or update_repo_data:
    print("Scraping from GitHub:")
    urls = []
    dl_urls = []
    for i, row in joined.iterrows():
        print(f"Url: {row.repo_url}")
        r = requests.get(row.repo_url)
        r.encoding = 'utf-8'
        page = BeautifulSoup(r.text, "html.parser")
        notebook_link = page.find("a", {"title": "notebook.ipynb"})
        if notebook_link is None:
            # Fail with a message that names the page instead of an
            # AttributeError on None.
            raise ValueError(f"No notebook.ipynb link found at {row.repo_url}")
        href = notebook_link.get('href')
        url = "https://www.github.com" + href
        # Raw download link: different host and no "/blob" path segment
        dl_url = "https://raw.githubusercontent.com" + href.replace("/blob", "")
        urls.append(url)
        dl_urls.append(dl_url)

    # Context managers make sure the cache file is flushed and closed
    with open(urls_file, "wb") as cache:
        pickle.dump((urls, dl_urls), cache)
else:
    print(f"Loading from local picle: {urls_file}")
    # NOTE: pickle can execute arbitrary code on load; only load a cache
    # file that this notebook wrote itself.
    with open(urls_file, "rb") as cache:
        urls, dl_urls = pickle.load(cache)

joined["notebook_url"] = pd.Series(urls)
joined["notebook_dl_url"] = pd.Series(dl_urls)
joined.head(1)

Loading from local picle: localdata/urls.p


Unnamed: 0,repo_url,title,language,author,bio,datacamp_url,description,notebook_url,notebook_dl_url
0,https://github.com/mrbarkis/DataCamp_projects/...,A Network Analysis of Game of Thrones,python,Mridul Seth,Data science enthusiast,https://www.datacamp.com/projects/76,Analyze the network of characters in Game of T...,https://www.github.com/mrbarkis/DataCamp_proje...,https://raw.githubusercontent.com/mrbarkis/Dat...


### 5.2. Define helper functions that can parse code, text, etc from the notebooks.

In [8]:
def extract_text_and_code(file_name):
    """Read a notebook file and separate its markdown text from its code.

    Parameters
    ----------
    file_name : str
        Path to a notebook (.ipynb) file, which is JSON.

    Returns
    -------
    txt : str
        The text of all markdown cells with markup stripped by BeautifulSoup;
        every cell is followed by a blank line.
    code : list of str
        The source lines of all code cells, in notebook order.
    """
    # Notebook files are UTF-8 encoded JSON; do not rely on the platform default.
    with open(file_name, 'r', encoding='utf-8') as file:
        notebook = json.load(file)

    txt = ""
    code = []
    for cell in notebook["cells"]:
        source = cell.get("source", [])
        # The notebook format allows "source" to be a single string as well as
        # a list of lines; normalise it so that `code` always holds lines and
        # never single characters.
        if isinstance(source, str):
            source = source.splitlines(keepends=True)

        if cell["cell_type"] == "markdown":
            txt += BeautifulSoup("".join(source)).text +"\n\n"
        elif cell["cell_type"] == "code":
            code.extend(source)

    return txt, code


def parse_lines(pattern, lines, suffix=""):
    """Return every match of `pattern` found in `lines` as one flat list.

    Each line is searched separately with `re.findall`; the hits are kept in
    the order they are found and `suffix` is appended to each of them.
    """
    regex = re.compile(pattern)
    return [hit + suffix
            for line in lines
            for hit in regex.findall(line)]

In [9]:
# Testing the functions:
# Load a notebook:
row = joined.loc[0]
file_url = row["notebook_dl_url"]
file_name = "./localdata/" + row["title"].replace(" ", "_") + ".ipynb"
downloadPage(file_url, file_name, overwrite=False, binary=True)

# Separate the text from the code:
txt, code = extract_text_and_code(file_name)

# Test parsing using regex:
# Lines that start with "import" or "from"
imports = parse_lines(r"^import.*$", code)
imports.extend(parse_lines(r"^from.*$", code))
# A dot, a name and an opening parenthesis
methods = parse_lines(r"\.\w+\(", code,  suffix=")")
# A name and an opening parenthesis that is not preceded by a dot
functions = parse_lines(r"(?<!\.)\b\w+\(", code, suffix=")")
# A dot and a name that is not followed by an opening parenthesis
# (this also picks up dotted text inside strings, such as ".csv")
attributes = parse_lines(r"\.\w+\b(?!\()",code)


print("\n## The first five lines of code are:\n" + "".join(code[:5]))

print("\n## The parsed methods are:")
print(methods)

print("\n## The parsed imports are:")
print(imports)

# Heading fixed to match the other four ("##T he" -> "## The")
print("\n## The parsed functions are:")
print(functions)

print("\n## The parsed attributes are:")
print(attributes)


Using an old file: ./localdata/A_Network_Analysis_of_Game_of_Thrones.ipynb

## The first five lines of code are:
# Importing modules
import pandas as pd

# Reading in datasets/book1.csv
book1 = pd.read_csv('datasets/book1.csv')


## The parsed methods are:
['.read_csv()', '.head()', '.set_xticks()', '.arange()', '.set_xticklabels()', '.arange()', '.set_xlabel()', '.Graph()', '.iterrows()', '.add_edge()', '.read_csv()', '.Graph()', '.iterrows()', '.add_edge()', '.append()', '.degree_centrality()', '.degree_centrality()', '.items()', '.items()', '.degree_centrality()', '.from_records()', '.head()', '.plot()', '.set_ylabel()', '.set_title()', '.betweenness_centrality()', '.from_records()', '.fillna()', '.sort_values()', '.plot()', '.set_ylabel()', '.set_title()', '.pagerank()', '.from_records()', '.sort_values()', '.plot()', '.set_ylabel()', '.set_title()', '.pagerank()', '.betweenness_centrality()', '.degree_centrality()', '.from_records()', '.corr()', '.idxmax()']

## The parsed imports

### 5.3. Download and parse all the notebooks

In [10]:
# One entry per project, in the row order of `joined`
imports, attributes, methods, functions, texts = [], [], [], [], []

for _, row in joined.iterrows():
    # Fetch the notebook (or reuse the local copy):
    file_url = row["notebook_dl_url"]
    file_name = "./localdata/" + row["title"].replace(" ", "_") + ".ipynb"
    downloadPage(file_url, file_name, overwrite=False, binary=True)

    # Split the notebook into markdown text and code lines:
    txt, code = extract_text_and_code(file_name)

    # "import ..." lines first, then "from ..." lines
    # (using ^(import|from) causes headache)
    import_lines = parse_lines(r"^import.*$", code) + parse_lines(r"^from.*$", code)
    method_calls = parse_lines(r"\.\w+\(", code,  suffix=")")
    function_calls = parse_lines(r"(?<!\.)\b\w+\(", code, suffix=")")
    attribute_refs = parse_lines(r"\.\w+\b(?!\()",code)

    # Keep the results of this project:
    texts.append(txt)
    imports.append(import_lines)
    attributes.append(attribute_refs)
    methods.append(method_calls)
    functions.append(function_calls)

# Attach the collected lists to the DataFrame as new columns:
for column_name, column_values in (("imports", imports),
                                   ("attributes", attributes),
                                   ("methods", methods),
                                   ("functions", functions),
                                   ("texts", texts)):
    joined[column_name] = pd.Series(column_values)

joined.head(3)

Using an old file: ./localdata/A_Network_Analysis_of_Game_of_Thrones.ipynb
Using an old file: ./localdata/A_New_Era_of_Data_Analysis_in_Baseball.ipynb
Using an old file: ./localdata/A_Visual_History_of_Nobel_Prize_Winners.ipynb
Using an old file: ./localdata/Analyze_Your_Runkeeper_Fitness_Data.ipynb
Using an old file: ./localdata/Bad_passwords_and_the_NIST_guidelines.ipynb
Using an old file: ./localdata/Book_Recommendations_from_Charles_Darwin.ipynb
Using an old file: ./localdata/Comparing_Cosmetics_by_Ingredients.ipynb
Using an old file: ./localdata/Disney_Movies_and_Box_Office_Success.ipynb
Using an old file: ./localdata/Do_Left-handed_People_Really_Die_Young?.ipynb
Using an old file: ./localdata/Dr._Semmelweis_and_the_Discovery_of_Handwashing.ipynb
Using an old file: ./localdata/Exploring_the_Bitcoin_Cryptocurrency_Market.ipynb
Using an old file: ./localdata/Exploring_the_Evolution_of_Linux.ipynb
Using an old file: ./localdata/Exploring_the_History_of_Lego.ipynb
Using an old file: .

Unnamed: 0,repo_url,title,language,author,bio,datacamp_url,description,notebook_url,notebook_dl_url,imports,attributes,methods,functions,texts
0,https://github.com/mrbarkis/DataCamp_projects/...,A Network Analysis of Game of Thrones,python,Mridul Seth,Data science enthusiast,https://www.datacamp.com/projects/76,Analyze the network of characters in Game of T...,https://www.github.com/mrbarkis/DataCamp_proje...,https://raw.githubusercontent.com/mrbarkis/Dat...,"[import pandas as pd, import networkx as nx, i...","[.csv, .csv, .csv, .csv, .csv, .csv, .DataFram...","[.read_csv(), .head(), .set_xticks(), .arange(...","[setBookAxes(), sorted(), sorted(), print(), p...",## 1. Winter is Coming. Let's load the dataset...
1,https://github.com/mrbarkis/DataCamp_projects/...,A New Era of Data Analysis in Baseball,python,David Venturi,Curriculum Manager at DataCamp,https://www.datacamp.com/projects/250,Use MLB's Statcast data to compare New York Ya...,https://www.github.com/mrbarkis/DataCamp_proje...,https://raw.githubusercontent.com/mrbarkis/Dat...,"[import pandas as pd, import matplotlib.pyplot...","[.pyplot, .csv, .csv, .max_columns, .str, .str...","[.read_csv(), .read_csv(), .set_option(), .tai...","[print(), print(), print(), print(), assign_x_...",## 1. The Statcast revolution\n\nThis is Aaron...
2,https://github.com/mrbarkis/DataCamp_projects/...,A Visual History of Nobel Prize Winners,python,Rasmus Bååth,Senior Data Scientist at King (Activision Bliz...,https://www.datacamp.com/projects/441,Explore a dataset from Kaggle containing a cen...,https://www.github.com/mrbarkis/DataCamp_proje...,https://raw.githubusercontent.com/mrbarkis/Dat...,"[import pandas as pd, import seaborn as sns, i...","[.csv, .pyplot, .rcParams, .figsize, .ticker, ...","[.read_csv(), .head(), .value_counts(), .value...","[display(), len(), display(), PercentFormatter...",## 1. The most Nobel of Prizes\n\nThe Nobel Pr...


In [11]:
# Store the scraped and parsed data so that the analysis can be redone offline.
if backup:
    # to_csv does not create missing folders
    os.makedirs("./data", exist_ok=True)
    joined.to_csv("./data/raw_data_backup.csv", index=False)

 ## 6. Analyse which words, and functions are the most descriptive
    1) Tokenize the imports and texts, which are still sentences
    2) Sort the tokens using the tf-idf, or term frequency–inverse document frequency, metric.

### 6.1 Tokenize the imports and texts, which are still sentences

In [12]:
# Tokenize the import statements
print_index = 1
imports = list(joined.imports)

print(f"# Original import statements:\n{imports[print_index]}")
imports = [" ".join(i) for i in imports]

print(f"\n# As a string:\n{imports[print_index]}")
# Treat commas as separators and split on any run of whitespace. Splitting on
# a single space would keep tokens such as "path," (from "import a, b") and
# produce empty tokens for repeated spaces.
imports = [i.replace(",", " ").split() for i in imports]

print(f"\n# As a list of tokens:\n{imports[print_index]}")

# Keywords of the import statement and common aliases carry no information
stoplist = set('import as from pd np sns nx plt sm ,'.split())

imports = [[token for token in tokens if token not in stoplist]
           for tokens in imports]
print(f"\n# Without stopwords:\n{imports[print_index]}")
joined["imports"]=pd.Series(imports)
joined.head(2)

# Original import statements:
['import pandas as pd', 'import matplotlib.pyplot as plt', 'import seaborn as sns']

# As a string:
import pandas as pd import matplotlib.pyplot as plt import seaborn as sns

# As a list of tokens:
['import', 'pandas', 'as', 'pd', 'import', 'matplotlib.pyplot', 'as', 'plt', 'import', 'seaborn', 'as', 'sns']

# Without stopwords:
['pandas', 'matplotlib.pyplot', 'seaborn']


Unnamed: 0,repo_url,title,language,author,bio,datacamp_url,description,notebook_url,notebook_dl_url,imports,attributes,methods,functions,texts
0,https://github.com/mrbarkis/DataCamp_projects/...,A Network Analysis of Game of Thrones,python,Mridul Seth,Data science enthusiast,https://www.datacamp.com/projects/76,Analyze the network of characters in Game of T...,https://www.github.com/mrbarkis/DataCamp_proje...,https://raw.githubusercontent.com/mrbarkis/Dat...,"[pandas, networkx, numpy]","[.csv, .csv, .csv, .csv, .csv, .csv, .DataFram...","[.read_csv(), .head(), .set_xticks(), .arange(...","[setBookAxes(), sorted(), sorted(), print(), p...",## 1. Winter is Coming. Let's load the dataset...
1,https://github.com/mrbarkis/DataCamp_projects/...,A New Era of Data Analysis in Baseball,python,David Venturi,Curriculum Manager at DataCamp,https://www.datacamp.com/projects/250,Use MLB's Statcast data to compare New York Ya...,https://www.github.com/mrbarkis/DataCamp_proje...,https://raw.githubusercontent.com/mrbarkis/Dat...,"[pandas, matplotlib.pyplot, seaborn]","[.pyplot, .csv, .csv, .max_columns, .str, .str...","[.read_csv(), .read_csv(), .set_option(), .tai...","[print(), print(), print(), print(), assign_x_...",## 1. The Statcast revolution\n\nThis is Aaron...


In [13]:
# Tokenize texts
texts = list(joined.texts)
print_index = 3

print("# A snippet from the the original text:")
print(texts[print_index][:100])

print("\n# First tokens:")
# Raw string: "\w" in a normal string literal is an invalid escape sequence
tokenizer = nltk.tokenize.RegexpTokenizer(pattern=r'\w+')
all_tokens = [tokenizer.tokenize(text=text) for text in texts]
print(all_tokens[print_index][:10])

print("\n# Tokens to lower case:")
all_tokens = [[token.lower() for token in tokens]
              for tokens in all_tokens]
print(all_tokens[print_index][:10])

print("\n# First non-generic tokens:")
#nltk.download('stopwords')
# A set makes the membership test below constant time per token
sw = set(nltk.corpus.stopwords.words('english'))
all_tokens = [[token for token in tokens if token not in sw]
              for tokens in all_tokens]
print(all_tokens[print_index][:10])

print("\n# Stem the tokens:")
porter = nltk.stem.PorterStemmer()
all_tokens = [[porter.stem(token) for token in tokens]
             for tokens in all_tokens]
print(all_tokens[print_index][:10])


# A snippet from the the original text:
## 1. Obtain and review raw data
One day, my old running friend and I were chatting about our runnin

# First tokens:
['1', 'Obtain', 'and', 'review', 'raw', 'data', 'One', 'day', 'my', 'old']

# Tokens to lower case:
['1', 'obtain', 'and', 'review', 'raw', 'data', 'one', 'day', 'my', 'old']

# First non-generic tokens:
['1', 'obtain', 'review', 'raw', 'data', 'one', 'day', 'old', 'running', 'friend']

# Stem the tokens:
['1', 'obtain', 'review', 'raw', 'data', 'one', 'day', 'old', 'run', 'friend']


In [14]:
# Replace the raw texts with their stemmed tokens
joined["texts"] = pd.Series(all_tokens)
joined.head(2)

Unnamed: 0,repo_url,title,language,author,bio,datacamp_url,description,notebook_url,notebook_dl_url,imports,attributes,methods,functions,texts
0,https://github.com/mrbarkis/DataCamp_projects/...,A Network Analysis of Game of Thrones,python,Mridul Seth,Data science enthusiast,https://www.datacamp.com/projects/76,Analyze the network of characters in Game of T...,https://www.github.com/mrbarkis/DataCamp_proje...,https://raw.githubusercontent.com/mrbarkis/Dat...,"[pandas, networkx, numpy]","[.csv, .csv, .csv, .csv, .csv, .csv, .DataFram...","[.read_csv(), .head(), .set_xticks(), .arange(...","[setBookAxes(), sorted(), sorted(), print(), p...","[1, winter, come, let, load, dataset, asap, he..."
1,https://github.com/mrbarkis/DataCamp_projects/...,A New Era of Data Analysis in Baseball,python,David Venturi,Curriculum Manager at DataCamp,https://www.datacamp.com/projects/250,Use MLB's Statcast data to compare New York Ya...,https://www.github.com/mrbarkis/DataCamp_proje...,https://raw.githubusercontent.com/mrbarkis/Dat...,"[pandas, matplotlib.pyplot, seaborn]","[.pyplot, .csv, .csv, .max_columns, .str, .str...","[.read_csv(), .read_csv(), .set_option(), .tai...","[print(), print(), print(), print(), assign_x_...","[1, statcast, revolut, aaron, judg, judg, one,..."


### 6.2 Sort the tokens using the tf-idf

In [15]:
def sort_by_frequency(all_tokens):
    """Order the distinct tokens of each document by how often they occur.

    `all_tokens` holds one list of tokens per document. The result holds, for
    each document, its distinct tokens with the most frequent one first.
    """
    vocabulary = corpora.Dictionary(all_tokens)
    by_frequency = []
    for tokens in all_tokens:
        # Bag of words: (token id, count) pairs for this document
        counts = vocabulary.doc2bow(tokens)
        counts.sort(key=lambda pair: pair[1], reverse=True)
        by_frequency.append([vocabulary[token_id] for token_id, _ in counts])
    return by_frequency

def sort_by_tfidf(all_tokens):
    """Rank the tokens of every document by their tf-idf weight.

    A dictionary and a ``TfidfModel`` are fitted on the whole collection, and
    each document is then scored with that model.

    Parameters
    ----------
    all_tokens : sequence of list of str
        One token list per document. It is traversed more than once, so it
        must be re-iterable.

    Returns
    -------
    list of list of str
        For each document, the tokens the model returned for it, ordered from
        the highest to the lowest tf-idf weight. A token that the model leaves
        out of its output for a document does not appear in that document's
        list.
    """
    vocabulary = corpora.Dictionary(all_tokens)
    corpus = [vocabulary.doc2bow(document) for document in all_tokens]
    tfidf_model = TfidfModel(corpus)

    def rank(bow):
        # (token_id, weight) pairs, heaviest first; ties keep model order.
        weighted = sorted(tfidf_model[bow], key=lambda pair: pair[1], reverse=True)
        return [vocabulary[token_id] for token_id, _ in weighted]

    return [rank(bow) for bow in corpus]

In [16]:
# Number of top-ranked text tokens kept as keywords for each project.
N_KEYWORDS = 20

# Sort all the tokens based on the tfidf score.
# NOTE: the columns are overwritten in place, so re-running this cell ranks
# the already ranked lists again instead of the original token lists.
columns = ["functions", "methods", "attributes", "imports"]
for col in columns:
    tokens = list(joined[col])
    # Pass joined's own index explicitly: a bare pd.Series(list) gets a fresh
    # 0..n-1 index and is aligned by label on assignment, which would produce
    # NaN rows whenever joined does not have that default index.
    joined[col] = pd.Series(sort_by_tfidf(tokens), index=joined.index)

# Keywords are the N_KEYWORDS highest-ranked tokens of each notebook's text.
ranked_texts = sort_by_tfidf(list(joined["texts"]))
joined["keywords"] = pd.Series(
    [keys[:N_KEYWORDS] for keys in ranked_texts],
    index=joined.index,
)

joined.head(3)

Unnamed: 0,repo_url,title,language,author,bio,datacamp_url,description,notebook_url,notebook_dl_url,imports,attributes,methods,functions,texts,keywords
0,https://github.com/mrbarkis/DataCamp_projects/...,A Network Analysis of Game of Thrones,python,Mridul Seth,Data science enthusiast,https://www.datacamp.com/projects/76,Analyze the network of characters in Game of T...,https://www.github.com/mrbarkis/DataCamp_proje...,https://raw.githubusercontent.com/mrbarkis/Dat...,"[networkx, numpy, pandas]","[.DataFrame, .T, .index, .csv]","[.degree_centrality(), .from_records(), .Graph...","[setBookAxes(), list(), set(), sorted(), range...","[1, winter, come, let, load, dataset, asap, he...","[book, network, central, charact, throne, fift..."
1,https://github.com/mrbarkis/DataCamp_projects/...,A New Era of Data Analysis in Baseball,python,David Venturi,Curriculum Manager at DataCamp,https://www.datacamp.com/projects/250,Use MLB's Statcast data to compare New York Ya...,https://www.github.com/mrbarkis/DataCamp_proje...,https://raw.githubusercontent.com/mrbarkis/Dat...,"[seaborn, matplotlib.pyplot, pandas]","[.zone, .loc, .max_columns, .str, .pyplot, .csv]","[.set_visible(), .gca(), .set_title(), .apply(...","[assign_x_coord(), assign_y_coord(), print()]","[1, statcast, revolut, aaron, judg, judg, one,...","[pitch, home, stanton, statcast, judg, ball, v..."
2,https://github.com/mrbarkis/DataCamp_projects/...,A Visual History of Nobel Prize Winners,python,Rasmus Bååth,Senior Data Scientist at King (Activision Bliz...,https://www.datacamp.com/projects/441,Explore a dataset from Kaggle containing a cen...,https://www.github.com/mrbarkis/DataCamp_proje...,https://raw.githubusercontent.com/mrbarkis/Dat...,"[PercentFormatter, matplotlib.ticker, seaborn,...","[.ticker, .yaxis, .figsize, .rcParams, .year, ...","[.nsmallest(), .lineplot(), .lmplot(), .set_ma...","[PercentFormatter(), display(), len()]","[1, nobel, prize, nobel, prize, perhap, world,...","[prize, winner, nobel, 1901, imbal, peac, chem..."


## 7. Extract long descriptions from DataCamp
The project pages at "datacamp_url" look like this:
![project description](img/project_description.png "descriptions on datacamp_url")
The underlying HTML looks like this:
```html
<h4>Project Description</h4>
<div>
    <p>Description ...</p>
    <p>Description ...</p>
    <p>Description ...</p>
</div>
```
The description is the element that follows the `<h4>` heading, so we can search for the tag right after the first `<h4>` tag.

In [17]:
# Collect, for every project, the HTML element that follows the first <h4>
# heading of its (locally cached) DataCamp page.
txts = []
for _, row in joined.iterrows():
    url = row["datacamp_url"]
    file_name = "./localdata/" + row["title"].replace(" ", "_") + ".html"
    # overwrite=False: downloadPage is defined earlier in the notebook;
    # presumably it keeps an already saved page instead of fetching it again.
    downloadPage(url, file_name, overwrite=False)

    with open(file_name, 'r') as html_file:
        # Name the parser explicitly; otherwise BeautifulSoup picks whichever
        # parser happens to be installed and the result can differ per machine.
        html = BeautifulSoup(html_file, "html.parser")

    description_heading = html.find('h4')
    description_tag = (
        description_heading.find_next() if description_heading is not None else None
    )
    if description_tag is None:
        # Fail with the offending file instead of an opaque
        # "'NoneType' object has no attribute ..." error.
        raise ValueError(
            f"No element following an <h4> heading found in {file_name} ({url})"
        )
    txts.append(description_tag.prettify())

# Use joined's own index so the values are matched row by row, not by the
# default 0..n-1 labels of a bare pd.Series.
joined["long_description"] = pd.Series(txts, index=joined.index)
joined.head()

Using an old file: ./localdata/A_Network_Analysis_of_Game_of_Thrones.html
Using an old file: ./localdata/A_New_Era_of_Data_Analysis_in_Baseball.html
Using an old file: ./localdata/A_Visual_History_of_Nobel_Prize_Winners.html
Using an old file: ./localdata/Analyze_Your_Runkeeper_Fitness_Data.html
Using an old file: ./localdata/Bad_passwords_and_the_NIST_guidelines.html
Using an old file: ./localdata/Book_Recommendations_from_Charles_Darwin.html
Using an old file: ./localdata/Comparing_Cosmetics_by_Ingredients.html
Using an old file: ./localdata/Disney_Movies_and_Box_Office_Success.html
Using an old file: ./localdata/Do_Left-handed_People_Really_Die_Young?.html
Using an old file: ./localdata/Dr._Semmelweis_and_the_Discovery_of_Handwashing.html
Using an old file: ./localdata/Exploring_the_Bitcoin_Cryptocurrency_Market.html
Using an old file: ./localdata/Exploring_the_Evolution_of_Linux.html
Using an old file: ./localdata/Exploring_the_History_of_Lego.html
Using an old file: ./localdata/Ex

Unnamed: 0,repo_url,title,language,author,bio,datacamp_url,description,notebook_url,notebook_dl_url,imports,attributes,methods,functions,texts,keywords,long_description
0,https://github.com/mrbarkis/DataCamp_projects/...,A Network Analysis of Game of Thrones,python,Mridul Seth,Data science enthusiast,https://www.datacamp.com/projects/76,Analyze the network of characters in Game of T...,https://www.github.com/mrbarkis/DataCamp_proje...,https://raw.githubusercontent.com/mrbarkis/Dat...,"[networkx, numpy, pandas]","[.DataFrame, .T, .index, .csv]","[.degree_centrality(), .from_records(), .Graph...","[setBookAxes(), list(), set(), sorted(), range...","[1, winter, come, let, load, dataset, asap, he...","[book, network, central, charact, throne, fift...","<div>\n <p>\n Jon Snow, Daenerys Targaryen, o..."
1,https://github.com/mrbarkis/DataCamp_projects/...,A New Era of Data Analysis in Baseball,python,David Venturi,Curriculum Manager at DataCamp,https://www.datacamp.com/projects/250,Use MLB's Statcast data to compare New York Ya...,https://www.github.com/mrbarkis/DataCamp_proje...,https://raw.githubusercontent.com/mrbarkis/Dat...,"[seaborn, matplotlib.pyplot, pandas]","[.zone, .loc, .max_columns, .str, .pyplot, .csv]","[.set_visible(), .gca(), .set_title(), .apply(...","[assign_x_coord(), assign_y_coord(), print()]","[1, statcast, revolut, aaron, judg, judg, one,...","[pitch, home, stanton, statcast, judg, ball, v...",<div>\n <p>\n There's a new era of data analy...
2,https://github.com/mrbarkis/DataCamp_projects/...,A Visual History of Nobel Prize Winners,python,Rasmus Bååth,Senior Data Scientist at King (Activision Bliz...,https://www.datacamp.com/projects/441,Explore a dataset from Kaggle containing a cen...,https://www.github.com/mrbarkis/DataCamp_proje...,https://raw.githubusercontent.com/mrbarkis/Dat...,"[PercentFormatter, matplotlib.ticker, seaborn,...","[.ticker, .yaxis, .figsize, .rcParams, .year, ...","[.nsmallest(), .lineplot(), .lmplot(), .set_ma...","[PercentFormatter(), display(), len()]","[1, nobel, prize, nobel, prize, perhap, world,...","[prize, winner, nobel, 1901, imbal, peac, chem...",<div>\n <p>\n The Nobel Prize is perhaps the ...
3,https://github.com/mrbarkis/DataCamp_projects/...,Analyze Your Runkeeper Fitness Data,python,Andrii Pavlenko,Project Instructor,https://www.datacamp.com/projects/727,"Import, clean, and analyze seven years worth o...",https://www.github.com/mrbarkis/DataCamp_proje...,https://raw.githubusercontent.com/mrbarkis/Dat...,"[statsmodels.api, warnings, matplotlib.pyplot,...","[.5, .api, .figure, .observed, .trend, .tsa, ....","[.resample(), .set(), .axhspan(), .mean(), .fi...","[display(), int(), range(), print(), len()]","[1, obtain, review, raw, data, one, day, old, ...","[km, run, heart, train, miss, shoe, distanc, a...",<div>\n <p>\n With the explosion in fitness t...
4,https://github.com/mrbarkis/DataCamp_projects/...,Bad passwords and the NIST guidelines,python,Rasmus Bååth,Senior Data Scientist at King (Activision Bliz...,https://www.datacamp.com/projects/141,Check what passwords fail to conform to the Na...,https://www.github.com/mrbarkis/DataCamp_proje...,https://raw.githubusercontent.com/mrbarkis/Dat...,[pandas],"[.str, .txt, .csv]","[.extract(), .sum(), .isin(), .any(), .len(), ...","[print(), len()]","[1, nist, special, public, 800, 63b, 50, year,...","[password, nist, user, secret, 63b, flag, repe...",<div>\n <p>\n Almost every web service you jo...


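## 8. Generate markdown entries

For each project, let's generate an entry of the following form (in the generated file, a table of keywords, imports, methods, functions, and attributes is placed between the description and the source line):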

## [Title of project](https://www.datacamp.com/projects/76 "To repo url")

Description of the project. Description of the project. Description of the project. Description of the project. Description of the project. Description of the project. Description of the project. Description of the project. Description of the project.

[Original project](https://www.datacamp.com/projects/76 "To datacamp url") by John Doe, Data science enthusiast


In [18]:
# Build project_list.md: an introduction followed by one entry per project
# (linked heading, long description, token table, source line).
text_parts = ["""# My completed DataCamp projects
The below descriptions have been scraped from
[DataCamp projects page](https://www.datacamp.com/projects).
The keywords, imports, methods, functions, and attributes, in turn, have been scraped from the notebooks themselves,
and sorted using the tf-idf, i.e. term frequency–inverse document frequency, metric. """]

# Columns shown in the table, in the order of the table header below.
table_columns = ("keywords", "imports", "methods", "functions", "attributes")

for _, r in joined.iterrows():
    # Bracket access avoids clashes between column names and Series attributes.
    title, author, long, repo_url, url, bio = (r["title"],
                                               r["author"],
                                               r["long_description"],
                                               r["notebook_url"],
                                               r["datacamp_url"],
                                               r["bio"])
    heading = f"\n## [{title}]({repo_url})"
    info = f"\n{long} \n"
    source = f"\n[Original project]({url}) by {author}, {bio}\n"

    # One space-separated cell per column; a literal "|" inside a token is
    # escaped so that it cannot end the table cell early.
    cells = [" ".join(r[column]).replace("|", "\\|") for column in table_columns]

    table = "\n\n| Keywords | Imports | Methods | Functions | Attributes/Extensions|"
    table += "\n| --- |--- | --- | --- | --- |"
    table += "\n|" + "|".join(cells) + "|\n\n"

    text_parts.append(heading + info + table + source)

text = "".join(text_parts)

# The text contains non-ASCII characters (e.g. the en dash in the intro and
# author names), so fix the encoding instead of using the platform default.
with open("project_list.md", 'w', encoding="utf-8") as file:
    file.write(text)

In [19]:
# Save the enriched table so that a later session can reload it from disk.
# NOTE(review): the list-valued columns are presumably written as their string
# representation - confirm that they are parsed back into lists after loading.
if backup:
    backup_path = "./data/data_backup.csv"
    joined.to_csv(backup_path, index=False)