In [1]:
import pandas as pd
from requests import get
from bs4 import BeautifulSoup
import os
import path
import re

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords



In [2]:
# Grabs the language and body for each git repo as a dictionary and adds to a list
def get_git_info(git_list):
    """Scrape the language label and README text for each repository URL.

    Parameters
    ----------
    git_list : iterable of str
        Base repository URLs (e.g. 'https://github.com/owner/repo').
        '/blob/master/README.md' is appended to each to reach the README page.

    Returns
    -------
    list of dict
        One ``{'Language': str, 'Body': str}`` entry per repository for which
        both the language span and the README article were found on the
        fetched pages. Repositories missing either one are skipped.
    """
    headers = {'User-Agent': 'Codeup Ada Data Science'}
    body_end = '/blob/master/README.md'
    readme_class = "markdown-body entry-content p-3 p-md-6"

    list_of_git_info = []

    for git in git_list:
        # The repository landing page carries the language label.
        response = get(git, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        language = soup.find('span', class_='lang')

        # No language on the repo: ignore it, and do so *before* requesting
        # the README so no request is spent on a repo that will be dropped.
        if language is None:
            continue

        # Append the README path to the base url and scrape the rendered body.
        response = get(git + body_end, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        article = soup.find('article', class_=readme_class)

        # find() returns None when the README page (or the expected markup)
        # is absent; skip the repo instead of failing on `None.text`.
        if article is None:
            continue

        # Read the tag's text directly rather than regex-matching its HTML,
        # which raised IndexError when the span content spanned several lines.
        list_of_git_info.append({'Language': language.get_text().strip(),
                                 'Body': article.text.strip()})

    return list_of_git_info


In [3]:
# Repository URLs to scrape, assembled from their owner/repo slugs.
gits = ['https://github.com/' + slug
        for slug in ('microsoft/Terminal',
                     'kkuchta/css-only-chat',
                     'microsoft/PowerToys',
                     'jolaleye/cssfx',
                     'flutter/flutter_web')]


In [None]:
# url = 'https://github.com/search?q=stars%3A%3E0&s=stars&type=Repositories'
# headers = {'User-Agent': 'Codeup Ada Data Science'} # send a custom user-agent instead of the requests default, which the site may reject
# response = get(url, headers=headers)

# soup = BeautifulSoup(response.content, 'html.parser')
# title = soup.title
# #  pull the repository entries from the star-sorted search results page
# trending = soup.find_all('div', class_= 'd-inline-block')
# #  find all the "hrefs", which are the relative repository paths
# repo_names = re.findall(r'href="(.*?)"', str(trending))


In [None]:
# git_base_address = 'https://github.com'

# repo_address_list = []

# for repo in repo_names:
#     repo_address_list.append(git_base_address + repo)

In [4]:
def basic_clean(repo):
    """Normalize text to lowercase ASCII letters, digits, apostrophes and spaces.

    Parameters
    ----------
    repo : str
        Raw text to clean.

    Returns
    -------
    str
        The text with every whitespace character replaced by a single space,
        Unicode decomposed (NFKD) and reduced to ASCII, lowercased, and all
        characters other than ``a-z``, ``0-9``, ``'`` and whitespace removed.
    """
    # Turn each whitespace character (newline, tab, ...) into a plain space.
    repo = re.sub(r'\s', ' ', repo)
    # Lowercase only AFTER the NFKD/ASCII step: some compatibility characters
    # decompose into uppercase ASCII (e.g. the trade mark sign becomes "TM"),
    # and lowercasing first would leave those capitals to be stripped below.
    repo = unicodedata.normalize('NFKD', repo)\
        .encode('ascii', 'ignore')\
        .decode('utf-8', 'ignore')\
        .lower()
    repo = re.sub(r"[^a-z0-9'\s]", '', repo)
    return repo


def tokenize(repo):
    """Run `repo` through NLTK's ToktokTokenizer and return the result as a string."""
    toktok = nltk.tokenize.ToktokTokenizer()
    return toktok.tokenize(repo, return_str=True)
    

def stem(repo):
    """Return `repo` with each whitespace-separated word replaced by its Porter stem.

    The stems are re-joined with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in repo.split())


def lemmatize(repo):
    """Return `repo` with each whitespace-separated word passed through WordNetLemmatizer.

    The lemmas are re-joined with single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in repo.split())


def remove_stopwords(repo, extra_words = [], exclude_words = []):
    """Drop English stopwords from `repo`.

    Parameters
    ----------
    repo : str
        Text to filter; it is lowercased and split on whitespace.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Words to keep even if they appear in the stopword list.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # A set gives constant-time membership tests for the filter below and
    # replaces the list comprehensions that were used only for side effects.
    # Neither default list is mutated, so the shared [] defaults are safe.
    stopword_set = set(stopwords.words('english'))
    stopword_set.update(extra_words)
    stopword_set.difference_update(exclude_words)

    filtered_words = [w for w in repo.lower().split() if w not in stopword_set]
    return ' '.join(filtered_words)


def prep_repo(dictionary_repo, extra_words = [], exclude_words = []):
    """Build the prepared-text record for one scraped repository.

    `dictionary_repo` must provide the keys 'Language' and 'Body'. The result
    holds the language, the original body, and its stemmed, lemmatized and
    clean (basic_clean + stopword removal) variants.
    """
    body = dictionary_repo['Body']

    # NOTE(review): 'stemmed' and 'lemmatized' are computed from the raw body,
    # not from the basic_clean output — confirm this is intended.
    return {
        'language': dictionary_repo['Language'],
        'original': body,
        'stemmed': stem(body),
        'lemmatized': lemmatize(body),
        'clean': remove_stopwords(basic_clean(body), extra_words, exclude_words),
    }

def prepare_repo_data(dictionary_repo, extra_words = [], exclude_words = []):
    """Apply prep_repo to every repository record and return the results as a list.

    Despite its name, `dictionary_repo` is iterated over; each item is passed
    to prep_repo together with `extra_words` and `exclude_words`.
    """
    return [prep_repo(record, extra_words, exclude_words)
            for record in dictionary_repo]
        

In [None]:
# Scrape every repo in `gits` (get_git_info issues HTTP requests) and run the
# text-preparation pipeline over the scraped records.
data = prepare_repo_data(get_git_info(gits))

In [None]:
# One row per prepared repo record, indexed by its language label.
df = pd.DataFrame(data).set_index('language')

In [None]:
# Preview the first rows of the prepared frame.
df.head()

In [7]:
def get_git_df(gits):
    """Scrape, prepare and tabulate the repositories listed in `gits`.

    Chains get_git_info and prepare_repo_data, then returns the prepared
    records as a DataFrame indexed by the 'language' column.
    """
    scraped = get_git_info(gits)
    prepared = prepare_repo_data(scraped)
    return pd.DataFrame(prepared).set_index('language')
    
    

In [8]:
# Run the full scrape-and-prepare pipeline on `gits`; the returned frame is the cell output.
get_git_df(gits)

Unnamed: 0_level_0,clean,lemmatized,original,stemmed
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C++,welcome repository contains source code window...,Welcome! This repository contains the source c...,Welcome! This repository contains the source c...,welcome! thi repositori contain the sourc code...
Ruby,cssonly chat truly monstrous async web chat us...,CSS-Only Chat A truly monstrous async web chat...,CSS-Only Chat\nA truly monstrous async web cha...,css-onli chat A truli monstrou async web chat ...
Vue,beautifully simple clicktocopy css effects htt...,Beautifully simple click-to-copy CSS effect ht...,Beautifully simple click-to-copy CSS effects\n...,beauti simpl click-to-copi css effect https://...
Dart,welcome code repository flutter web repository...,Welcome to the code repository for Flutter for...,Welcome to the code repository for Flutter for...,welcom to the code repositori for flutter for ...
