In [1]:
import pandas as pd
from requests import get
from bs4 import BeautifulSoup
import os
import path
import re

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

from wordcloud import WordCloud


In [4]:
# Read the HTML from a file in the current working directory (e.g. 'GitHubTopStars_repos.html')
def read_GitHub_most_stars(filename, encoding='utf-8'):
    """Return the full text of a saved HTML page.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to UTF-8 so the
        result does not depend on the platform's locale encoding.

    Returns
    -------
    str
        The decoded contents of the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file is not valid text in ``encoding``.
    """
    with open(filename, encoding=encoding) as f:
        return f.read()

def parse_urls(contents):
    """Extract the link targets of the repo anchors on a saved listing page.

    Parameters
    ----------
    contents : str
        HTML source of the page.

    Returns
    -------
    list of str
        The ``href`` value of every ``<a>`` element carrying the
        ``v-align-middle`` class, in document order.
    """
    soup = BeautifulSoup(contents, 'html.parser')
    anchors = soup.find_all('a', class_='v-align-middle')
    # Read the attribute straight from each parsed tag rather than
    # regex-matching the re-serialised HTML: this cannot pick up 'href="'
    # text that appears elsewhere in the markup, and it returns the
    # unescaped attribute value.
    return [anchor['href'] for anchor in anchors if anchor.has_attr('href')]



# Grabs the language and body for each git repo as a dictionary and adds to a list
def get_git_info(star_list):
    """Scrape the language label and README body for each repository.

    For every entry of ``star_list`` the repository page is requested and
    searched for a ``<span class="lang">`` element. Only when that element is
    present is the README page requested as well.

    Parameters
    ----------
    star_list : iterable of str
        Repository paths; each one is appended to ``https://github.com``.

    Returns
    -------
    list of dict
        One ``{'Language': str, 'Body': tag-or-None}`` dict per repository
        that has a language label. ``'Body'`` is whatever ``soup.find``
        returns for the README article, i.e. ``None`` when it is not found.
    """
    github_base_url = 'https://github.com'
    # NOTE(review): assumes the README lives at master/README.md; repos using
    # a different default branch or README file name will yield Body=None.
    readme_end = '/blob/master/README.md'
    # Loop-invariant, so built once and shared by both requests.
    headers = {'User-Agent': 'Codeup Ada Data Science'}

    list_of_git_info = []

    for repo in star_list:
        repo_url = github_base_url + repo

        response = get(repo_url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        language = soup.find('span', class_='lang')

        # Repos without a language label are ignored, so there is no point
        # in spending a second request on their README.
        if language is None:
            continue

        response = get(repo_url + readme_end, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        body = soup.find('article', class_="markdown-body entry-content p-3 p-md-6")

        # Take the label from the parsed tag; regex-matching str(tag) raised
        # IndexError when the span's text was spread over several lines.
        list_of_git_info.append({'Language': language.get_text(strip=True),
                                 'Body': body})

    return list_of_git_info


def drop_empty_readmes(repos):
    """Return the repos whose ``'Body'`` entry is not ``None``.

    Parameters
    ----------
    repos : iterable of dict
        Dicts that each contain a ``'Body'`` key.

    Returns
    -------
    list of dict
        The same dict objects, in their original order, minus those whose
        ``'Body'`` is ``None``. Other falsy bodies (e.g. ``''``) are kept.

    Raises
    ------
    KeyError
        If a dict has no ``'Body'`` key.
    """
    # Identity test: ``!= None`` would defer to the body's own __ne__.
    return [repo for repo in repos if repo['Body'] is not None]

def find_nunique(string):
    """Count the distinct whitespace-separated words in ``string``.

    Parameters
    ----------
    string : str
        Text to analyse.

    Returns
    -------
    int
        Number of different words; ``0`` for an empty or all-whitespace
        string.
    """
    # split() with no argument splits on any run of whitespace and never
    # yields empty strings, so '' is not counted as a word and repeated
    # spaces, tabs or newlines do not distort the count.
    return len(set(string.split()))


def basic_clean(repo):
    """Normalise ``repo`` text to lowercase ASCII letters, digits, apostrophes and spaces.

    Steps, in order: every whitespace character becomes a plain space, the
    text is lowercased, NFKD-normalised and stripped of anything that cannot
    be encoded as ASCII, and finally every character other than ``a-z``,
    ``0-9``, ``'`` or whitespace is removed.
    """
    single_spaced = re.sub(r'\s', ' ', repo)
    lowered = single_spaced.lower()
    decomposed = unicodedata.normalize('NFKD', lowered)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return re.sub(r"[^a-z0-9'\s]", '', ascii_only)


def tokenize(repo):
    """Run ``repo`` through NLTK's ToktokTokenizer.

    The tokenizer is called with ``return_str=True`` and its result is
    returned unchanged.
    """
    return nltk.tokenize.ToktokTokenizer().tokenize(repo, return_str=True)
    

def stem(repo):
    """Porter-stem every whitespace-separated word of ``repo``.

    Returns the stemmed words joined back together with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    return ' '.join(map(stemmer.stem, repo.split()))


def lemmatize(repo):
    """Lemmatize every whitespace-separated word of ``repo`` with WordNet.

    Returns the lemmas joined back together with single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in repo.split())


def remove_stopwords(repo, extra_words=None, exclude_words=None):
    """Lowercase ``repo`` and drop its stop words.

    Parameters
    ----------
    repo : str
        Text to filter.
    extra_words : iterable of str, optional
        Additional words to treat as stop words. ``None`` means no extras.
    exclude_words : iterable of str, optional
        Words to keep even though they are stop words. ``None`` means no
        exclusions. Exclusions are applied after ``extra_words``, so a word
        present in both is kept.

    Returns
    -------
    str
        The remaining words joined with single spaces.
    """
    # A set gives constant-time membership tests for the filter below and
    # replaces the list comprehensions that were run only for side effects.
    stopword_set = set(stopwords.words('english'))
    stopword_set.update(extra_words or ())
    stopword_set.difference_update(exclude_words or ())

    filtered_words = [w for w in repo.lower().split() if w not in stopword_set]
    return ' '.join(filtered_words)


def prep_repo(dictionary_repo, extra_words = [], exclude_words = []):
    """Build the prepared record for one scraped repo.

    ``dictionary_repo`` must provide ``'Language'`` and ``'Body'``. The
    result has the keys ``'language'``, ``'original'``, ``'stemmed'``,
    ``'lemmatized'`` and ``'clean'``; ``extra_words`` and ``exclude_words``
    are forwarded to ``remove_stopwords``.
    """
    language = dictionary_repo['Language']
    body = dictionary_repo['Body']

    cleaned_dict = {}
    cleaned_dict['language'] = language
    cleaned_dict['original'] = body
    # NOTE(review): the stemmed and lemmatized versions are derived from the
    # raw body, not from the cleaned text - confirm this is intended.
    cleaned_dict['stemmed'] = stem(body)
    cleaned_dict['lemmatized'] = lemmatize(body)
    cleaned_dict['clean'] = remove_stopwords(basic_clean(body), extra_words, exclude_words)
    return cleaned_dict

def prepare_repo_data(dictionary_repo, extra_words = [], exclude_words = []):
    """Apply ``prep_repo`` to every repo in ``dictionary_repo``.

    Returns a list with one prepared dict per input repo, in input order.
    ``extra_words`` and ``exclude_words`` are passed through unchanged.
    """
    return [prep_repo(single_repo, extra_words, exclude_words)
            for single_repo in dictionary_repo]

def get_df(bulk):
    """Prepare every repo in ``bulk`` and return the records as a DataFrame."""
    prepared_records = prepare_repo_data(bulk)
    return pd.DataFrame(prepared_records)

In [6]:
# Load the cached CSV instead of re-running the (commented-out) scraping
# cells below, to save time.
# NOTE(review): presumably this file holds a saved copy of get_df(bulk) - confirm.

df = pd.read_csv('git_repo_nlp.csv')

In [None]:
# #  turn off the web access for now and just read the cached files from the working directory
# #  contents = get_GitHub_most_stars()
# list_of_html_files = ['GitHubTopStars_repos-page1.html','GitHubTopStars_repos-page2.html',\
#                       'GitHubTopStars_repos-page3.html','GitHubTopStars_repos-page4.html',\
#                       'GitHubTopStars_repos-page5.html','GitHubTopStars_repos-page6.html',\
#                       'GitHubTopStars_repos-page7.html','GitHubTopStars_repos-page8.html',\
#                       'GitHubTopStars_repos-page9.html','GitHubTopStars_repos-page10.html',\
#                       'GitHubTopStars_repos-page11.html','GitHubTopStars_repos-page12.html',\
#                       'GitHubTopStars_repos-page13.html']

# star_list = []
# for html_file in list_of_html_files:
#     contents = read_GitHub_most_stars(html_file)
#     next_list = parse_urls(contents)
#     star_list = star_list + next_list

In [None]:
# github_base_url = 'https://github.com'

# git_repo_url = []

# for repo in star_list:
#     git_repo_url.append(github_base_url + repo)
        

In [None]:
# bulk = get_git_info(star_list)

In [None]:
# bulk = drop_empty_readmes(bulk)


In [None]:
# for repo in bulk:
#     repo['Body'] = repo['Body'].text.strip()
    

In [None]:
# df = get_df(bulk)

In [7]:
# Number of rows (repos) per value of the 'language' column
df.language.value_counts()

JavaScript     40
Python         10
C++             8
Java            7
TypeScript      7
Go              7
CSS             7
Shell           4
Vue             3
Objective-C     2
Ruby            2
C               1
Dart            1
Rust            1
Name: language, dtype: int64

In [8]:
def _joined_clean_text(frame, language=None):
    """Join the 'clean' text of ``frame`` into one space-separated string.

    When ``language`` is given, only rows whose 'language' column equals it
    are included; otherwise every row is used. A language with no rows
    yields an empty string.
    """
    if language is None:
        selected = frame['clean']
    else:
        selected = frame.loc[frame['language'] == language, 'clean']
    return ' '.join(selected)


# One long string of cleaned README text overall and per language
all_words = _joined_clean_text(df)
js_words = _joined_clean_text(df, 'JavaScript')
python_words = _joined_clean_text(df, 'Python')
cpp_words = _joined_clean_text(df, 'C++')
typeScript_words = _joined_clean_text(df, 'TypeScript')
go_words = _joined_clean_text(df, 'Go')
java_words = _joined_clean_text(df, 'Java')
css_words = _joined_clean_text(df, 'CSS')
shell_words = _joined_clean_text(df, 'Shell')
vue_words = _joined_clean_text(df, 'Vue')
ruby_words = _joined_clean_text(df, 'Ruby')
objc_words = _joined_clean_text(df, 'Objective-C')
dart_words = _joined_clean_text(df, 'Dart')
rust_words = _joined_clean_text(df, 'Rust')
c_words = _joined_clean_text(df, 'C')



In [9]:
def _word_frequencies(text):
    """Count how often each whitespace-separated word occurs in ``text``.

    Returns the ``value_counts()`` Series of the words, indexed by word.
    """
    return pd.Series(text.split()).value_counts()


# Word frequencies overall and per language
all_freq = _word_frequencies(all_words)
js_freq = _word_frequencies(js_words)
python_freq = _word_frequencies(python_words)
cpp_freq = _word_frequencies(cpp_words)
typeScript_freq = _word_frequencies(typeScript_words)
go_freq = _word_frequencies(go_words)
java_freq = _word_frequencies(java_words)
css_freq = _word_frequencies(css_words)
shell_freq = _word_frequencies(shell_words)
vue_freq = _word_frequencies(vue_words)
ruby_freq = _word_frequencies(ruby_words)
objc_freq = _word_frequencies(objc_words)
dart_freq = _word_frequencies(dart_words)
rust_freq = _word_frequencies(rust_words)
c_freq = _word_frequencies(c_words)



In [10]:
# Combine the frequency Series into one table: one row per word, one column
# per Series, with 0 where a word does not occur.
#
# The column labels are supplied through `keys=` instead of
# `.set_axis(labels, axis=1, inplace=False)`: the `inplace` argument of
# `set_axis` no longer exists in pandas 2.x and made this cell raise TypeError.
#
# NOTE(review): 'scc' looks like a typo for 'css' (it labels css_freq). It is
# kept unchanged here because the recorded output uses this label - confirm
# before renaming.
word_counts = (pd.concat([all_freq, js_freq, python_freq, cpp_freq, typeScript_freq,
                          go_freq, java_freq, css_freq, shell_freq, vue_freq,
                          ruby_freq, objc_freq, dart_freq, rust_freq, c_freq],
                         axis=1, sort=True,
                         keys=['all', 'js', 'python', 'cpp', 'ts', 'go', 'java',
                               'scc', 'shell', 'vue', 'ruby', 'objc', 'dart',
                               'rust', 'c'])
                .fillna(0)
                .astype(int))



What are the most common words in READMEs?


In [11]:
# Rows with the highest overall ('all') count first; head() shows the top rows
word_counts.sort_values(by='all', ascending=False).head()


Unnamed: 0,all,js,python,cpp,ts,go,java,scc,shell,vue,ruby,objc,dart,rust,c
use,1096,613,204,21,6,55,9,8,152,9,0,12,0,0,7
go,910,14,14,1,1,871,0,0,1,0,2,0,0,2,4
yes,604,11,586,0,0,0,1,0,0,0,0,2,0,0,4
const,545,542,0,3,0,0,0,0,0,0,0,0,0,0,0
unknown,534,0,534,0,0,0,0,0,0,0,0,0,0,0,0


Does the length of the README vary by language?


In [13]:
# README length is measured as the number of characters in the cleaned text
df['readme_length'] = df['clean'].str.len()

# Mean length per language, longest first
(df.groupby('language')[['readme_length']]
   .mean()
   .sort_values('readme_length', ascending=False))


Unnamed: 0_level_0,readme_length
language,Unnamed: 1_level_1
Python,26752.0
C,20854.0
Go,18900.714286
Shell,14148.0
Objective-C,8872.0
JavaScript,8644.0
Rust,3132.0
Ruby,2952.0
C++,2746.875
Java,2729.857143


Do different languages use a different number of unique words?

In [14]:
# Number of distinct words in each repo's cleaned text
df['unique_words'] = df['clean'].apply(find_nunique)

# Mean distinct-word count per language, largest first
(df.groupby('language')[['unique_words']]
   .mean()
   .sort_values('unique_words', ascending=False))


Unnamed: 0_level_0,unique_words
language,Unnamed: 1_level_1
Python,1191.6
C,1182.0
Go,903.142857
Shell,700.25
Objective-C,481.0
JavaScript,423.95
Rust,296.0
Vue,216.0
Ruby,214.0
C++,204.375
