In [1]:
import pandas as pd
from requests import get
from bs4 import BeautifulSoup
import os
import path
import re

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

from wordcloud import WordCloud


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


In [2]:
# read the html file from a file in the curent working directory ('GitHubTopStars_repos.html')
def read_GitHub_most_stars(filename):
    with open(filename) as f:
        contents = f.read()
    return contents

def parse_urls(contents):
    soup = BeautifulSoup(contents, 'html.parser')
    title = soup.title
    orig_text = soup.text
    stars = soup.find_all('a', class_= 'v-align-middle')
    #  find all the "hrefs", which are the trending titles
    return(re.findall(r'href="(.*?)"', str(stars)))



# Grabs the language and body for each git repo as a dictionary and adds to a list
def get_git_info(star_list):
#   Github base url
    github_base_url = 'https://github.com'
    git_repo_url = []
    
#   Adds the repo portion of address
    for repo in star_list:
        git_repo_url.append(github_base_url + repo)

    list_of_git_info = []

    for repo_address in git_repo_url:
#       Grabbs the language from the repo website  
        git_repo_url = repo_address 
        headers = {'User-Agent': 'Codeup Ada Data Science'}
        response = get(git_repo_url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        language = soup.find('span', class_='lang')

#       This adds the part to get to the readme and scrapes the body  
        readme_end = '/blob/master/README.md'
        readme_url = git_repo_url + readme_end
        headers = {'User-Agent': 'Codeup Ada Data Science'}
        response = get(readme_url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        body = soup.find('article', class_="markdown-body entry-content p-3 p-md-6")

#       Combines the language and body to a dictionary, if no language on the repo it is ignored
        if language != None:
            language = str(language)
            list_of_git_info.append({'Language': re.findall(r'>(.*?)<', language)[0],
                                     'Body': body})

    return list_of_git_info


# If the readme is empty it is dropped
def drop_empty_readmes(repos):
    output = []
    for repo in repos:
        if repo['Body'] != None:
            output.append(repo)
    return output

# Finds the number of unique words 
def find_nunique(string):
    words = string.split(" ")
    unique_words = set(words)
    return len(unique_words)

def regex_it(string):
    return ' '.join(re.findall(r'[a-z]+', string))

def basic_clean(repo):
    repo = re.sub(r'\s', ' ', repo).lower()
    repo = unicodedata.normalize('NFKD', repo)\
        .encode('ascii', 'ignore')\
        .decode('utf-8', 'ignore')
    repo = re.sub(r"[^a-z0-9'\s]", '', repo)
    return repo


def tokenize(repo):
    tokenizer = nltk.tokenize.ToktokTokenizer()
    repo = tokenizer.tokenize(repo, return_str=True)
    return repo
    

def stem(repo):
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in repo.split()]
    repo_stemmed = ' '.join(stems)
    return repo_stemmed


def lemmatize(repo):
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in repo.split()]
    repo_lemmatized = ' '.join(lemmas)
    return repo_lemmatized


def remove_stopwords(repo, extra_words = [], exclude_words = []):
    stopword_list = stopwords.words('english')
    [stopword_list.append(word) for word in extra_words if word not in stopword_list]
    [stopword_list.remove(word) for word in exclude_words if word in stopword_list]
    words = repo.lower().split()
    filtered_words = [w for w in words if w not in stopword_list]
    repo_without_stopwords = ' '.join(filtered_words)
    return repo_without_stopwords


def prep_repo(dictionary_repo, extra_words = [], exclude_words = []):
    cleaned_dict = {
    'language': dictionary_repo['Language'],
    'original': dictionary_repo['Body'],
    'stemmed': stem(dictionary_repo['Body']),
    'lemmatized': lemmatize(dictionary_repo['Body']),
    'clean': remove_stopwords(basic_clean(dictionary_repo['Body']), extra_words, exclude_words),
        }

    return cleaned_dict

def prepare_repo_data(dictionary_repo, extra_words = [], exclude_words = []):
    clean_dict_list = []
    
    for repo in dictionary_repo:
        clean_dict_list.append(prep_repo(repo, extra_words, exclude_words))
        
    return clean_dict_list

def get_df(bulk):
    return pd.DataFrame(prepare_repo_data(bulk))

In [3]:
# Can read csv to save time!

df = pd.read_csv('git_repo_nlp.csv')
df2 = pd.read_csv('github_data.csv', index_col=0)

In [4]:
df2.rename(columns={'readme': 'original'}, inplace=True)
df2['clean'] = df2.original.apply(remove_stopwords, basic_clean)


In [5]:
df = df.append(df2)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [6]:
# #  turn off the web access for now and just read the cached files from the working directory
# #  contents = get_GitHub_most_stars()
# list_of_html_files = ['GitHubTopStars_repos-page1.html','GitHubTopStars_repos-page2.html',\
#                       'GitHubTopStars_repos-page3.html','GitHubTopStars_repos-page4.html',\
#                       'GitHubTopStars_repos-page5.html','GitHubTopStars_repos-page6.html',\
#                       'GitHubTopStars_repos-page7.html','GitHubTopStars_repos-page8.html',\
#                       'GitHubTopStars_repos-page9.html','GitHubTopStars_repos-page10.html',\
#                       'GitHubTopStars_repos-page11.html','GitHubTopStars_repos-page12.html',\
#                       'GitHubTopStars_repos-page13.html']

# star_list = []
# for html_file in list_of_html_files:
#     contents = read_GitHub_most_stars(html_file)
#     next_list = parse_urls(contents)
#     star_list = star_list + next_list

In [7]:
# github_base_url = 'https://github.com'

# git_repo_url = []

# for repo in star_list:
#     git_repo_url.append(github_base_url + repo)
        

In [8]:
# bulk = get_git_info(star_list)

In [9]:
# bulk = drop_empty_readmes(bulk)


In [10]:
# for repo in bulk:
#     repo['Body'] = repo['Body'].text.strip()
    

In [11]:
# df = get_df(bulk)

In [12]:
df.language.value_counts().tail(13)

Kotlin              4
Dart                3
Rust                3
Objective-C         3
Swift               2
TeX                 1
Jupyter Notebook    1
C#                  1
PHP                 1
Dockerfile          1
Assembly            1
Vim script          1
Clojure             1
Name: language, dtype: int64

In [13]:
df = df[df.language != 'Kotlin']
df = df[df.language != 'Rust']
df = df[df.language != 'Dart']
df = df[df.language != 'Objective-C']
df = df[df.language != 'Swift']
df = df[df.language != 'Clojure']
df = df[df.language != 'Jupyter Notebook']
df = df[df.language != 'TeX']
df = df[df.language != 'C#']
df = df[df.language != 'Vim script']
df = df[df.language != 'Assembly']
df = df[df.language != 'Dockerfile']
df = df[df.language != 'PHP']



In [14]:
# Getting only words and stemming them

df.clean = df.clean.apply(regex_it)
df.clean = df.clean.apply(stem)

In [15]:
# getting the words for each language

all_words = ' '.join(df['clean'])
js_words = ' '.join(df['clean'][df.language == 'JavaScript'])
python_words = ' '.join(df['clean'][df.language == 'Python'])
cpp_words = ' '.join(df['clean'][df.language == 'C++'])
typeScript_words = ' '.join(df['clean'][df.language == 'TypeScript'])
go_words = ' '.join(df['clean'][df.language == 'Go'])
java_words = ' '.join(df['clean'][df.language == 'Java'])
css_words = ' '.join(df['clean'][df.language == 'CSS'])
shell_words = ' '.join(df['clean'][df.language == 'Shell'])
vue_words = ' '.join(df['clean'][df.language == 'Vue'])
ruby_words = ' '.join(df['clean'][df.language == 'Ruby'])
objc_words = ' '.join(df['clean'][df.language == 'Objective-C'])
dart_words = ' '.join(df['clean'][df.language == 'Dart'])
rust_words = ' '.join(df['clean'][df.language == 'Rust'])
c_words = ' '.join(df['clean'][df.language == 'C'])



In [16]:
# finding the frequency for each word for each language

all_freq = pd.Series(all_words.split()).value_counts()
js_freq = pd.Series(js_words.split()).value_counts()
python_freq = pd.Series(python_words.split()).value_counts()
cpp_freq = pd.Series(cpp_words.split()).value_counts()
typeScript_freq = pd.Series(typeScript_words.split()).value_counts()
go_freq = pd.Series(go_words.split()).value_counts()
java_freq = pd.Series(java_words.split()).value_counts()
css_freq = pd.Series(css_words.split()).value_counts()
shell_freq = pd.Series(shell_words.split()).value_counts()
vue_freq = pd.Series(vue_words.split()).value_counts()
ruby_freq = pd.Series(ruby_words.split()).value_counts()
objc_freq = pd.Series(objc_words.split()).value_counts()
dart_freq = pd.Series(dart_words.split()).value_counts()
rust_freq = pd.Series(rust_words.split()).value_counts()
c_freq = pd.Series(c_words.split()).value_counts()



In [17]:
# Bag of words

word_counts = (pd.concat([all_freq, js_freq, python_freq, cpp_freq,typeScript_freq,
                          go_freq, java_freq, css_freq, shell_freq, vue_freq,
                          ruby_freq, objc_freq, dart_freq, rust_freq, c_freq], axis=1, sort=True)
                .set_axis(['all', 'js', 'python', 'cpp', 'ts', 'go', 'java',
                          'scc', 'shell', 'vue', 'ruby', 'objc', 'dart',
                          'rust', 'c'], axis=1, inplace=False)
                .fillna(0)
                .apply(lambda s: s.astype(int)))



What are the most common words in READMEs?


In [18]:
word_counts.sort_values(by='all', ascending=False).head(20)


Unnamed: 0,all,js,python,cpp,ts,go,java,scc,shell,vue,ruby,objc,dart,rust,c
use,6060,3154,1041,134,21,532,139,51,774,28,19,0,0,0,119
go,2452,174,90,3,1,2134,7,1,20,0,3,0,0,0,11
code,2312,1384,417,64,30,205,41,22,39,10,20,0,0,0,30
data,2155,619,1001,13,0,196,45,5,154,6,8,0,0,0,61
react,2066,2006,8,0,7,0,1,30,2,9,0,0,0,0,0
file,2021,787,615,27,6,294,20,19,168,2,10,0,0,0,50
function,1971,1577,163,8,0,109,21,9,51,0,0,0,0,0,29
s,1951,675,526,35,3,215,78,28,347,1,7,0,0,0,31
nativ,1940,1812,49,8,2,46,3,0,8,0,0,0,0,0,7
sourc,1928,1100,368,61,15,128,39,15,103,0,5,0,0,0,30


Does the length of the README vary by language?


In [19]:
df['readme_length'] = df.clean.str.len()
df[['language','readme_length']].groupby('language').mean().sort_values('readme_length', ascending=False)


Unnamed: 0_level_0,readme_length
language,Unnamed: 1_level_1
Shell,32015.0
Python,16558.825
Go,14339.272727
JavaScript,9536.5625
C,7920.375
HTML,6435.428571
Java,2867.24
C++,2757.85
Ruby,2138.0
Vue,1991.0


Do different languages use a different number of unique words?

In [20]:
df['unique_words'] = df.clean.apply(find_nunique)
df[['language','unique_words']].groupby('language').mean().sort_values('unique_words', ascending=False)


Unnamed: 0_level_0,unique_words
language,Unnamed: 1_level_1
Shell,827.571429
Python,667.175
Go,606.636364
C,450.875
JavaScript,403.53125
HTML,336.0
C++,205.9
Ruby,187.666667
Java,182.72
Vue,177.857143


In [22]:
X = df.clean
y = df['language']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

