In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import json
import unicodedata
import re
from bs4 import BeautifulSoup
from typing import Dict, List, Optional, Union, cast
from time import strftime, sleep

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')

import requests
from requests import get

from prepare import *
from acquire import *

from env import github_token, github_username

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hector/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/hector/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
#run acquire.py

In [2]:
# Load the repo data (repo, language, readme_contents) from data.json into a
# DataFrame and preview the first rows.
df = pd.read_json('data.json')
df.head()

Unnamed: 0,repo,language,readme_contents
0,thedaviddias/Front-End-Checklist,,"<h1 align=""center"">\n<br>\n <img src=""https:/..."
1,onevcat/Kingfisher,Swift,"<p align=""center"">\n<img src=""https://raw.gith..."
2,FallibleInc/security-guide-for-developers,,# 实用性 WEB 开发人员安全须知 \n\n### 目标读者 \n\n安全问题主要由以...
3,tailwindlabs/tailwindcss,JavaScript,"<p>\n <a href=""https://tailwindcss.com/"" ta..."
4,codepath/android_guides,,# CodePath Android Cliffnotes\n\nWelcome to th...


In [3]:
# (rows, columns) of the freshly loaded frame
df.shape

(560, 3)

In [4]:
# Number of repos per language; dropna=False also counts repos with no language (NaN)
df.language.value_counts(dropna = False)

JavaScript          141
Python               59
NaN                  54
Java                 46
TypeScript           45
Go                   36
C++                  30
C                    20
Swift                13
Shell                13
HTML                 12
Rust                 11
Kotlin                9
C#                    9
Ruby                  9
PHP                   6
Jupyter Notebook      6
Vue                   6
Vim script            6
CSS                   5
Objective-C           3
Haskell               2
CoffeeScript          2
Elixir                2
TeX                   2
Lua                   2
Batchfile             1
Standard ML           1
Emacs Lisp            1
Makefile              1
Crystal               1
Dockerfile            1
OCaml                 1
Clojure               1
Rascal                1
Dart                  1
AsciiDoc              1
Name: language, dtype: int64

In [5]:
# Count the missing values in each column
df.isnull().sum()

repo                0
language           54
readme_contents     0
dtype: int64

In [6]:
# Drop every row that contains a missing value (per the counts above, only
# `language` has any), then check the new shape.
df = df.dropna()
df.shape

(506, 3)

In [7]:
# Renumber the rows after the drop; drop=True discards the old index instead of
# keeping it as a column. Then preview the result.
df = df.reset_index(drop = True)
df.head()

Unnamed: 0,repo,language,readme_contents
0,onevcat/Kingfisher,Swift,"<p align=""center"">\n<img src=""https://raw.gith..."
1,tailwindlabs/tailwindcss,JavaScript,"<p>\n <a href=""https://tailwindcss.com/"" ta..."
2,github/fetch,JavaScript,# window.fetch polyfill\n\nThe `fetch()` funct...
3,ianstormtaylor/slate,TypeScript,"<p align=""center"">\n <a href=""#""><img src=""./..."
4,Kong/insomnia,JavaScript,# Insomnia REST Client\n\n[![Slack Channel](ht...


In [None]:
#functions used in our current work.
#pulled it out to work with it.
def basic_clean(string):
    '''
    Normalize a raw string for NLP use.

    Steps, in order:
      1. lowercase the text
      2. NFKD-normalize it and drop every character that cannot be encoded as ASCII
      3. remove everything that is not a lowercase letter, a digit, an apostrophe
         or whitespace

    Apostrophes are kept so that contractions such as "don't" are not collapsed
    into "dont"; the NLTK English stopword list stores contractions with their
    apostrophe.

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    '''
    lowered = string.lower()

    # NFKD splits accented characters into base letter + combining mark; encoding
    # to ASCII with 'ignore' then drops the mark (and any other non-ASCII character)
    ascii_only = (
        unicodedata.normalize('NFKD', lowered)
        .encode('ascii', 'ignore')
        .decode('utf-8')
    )

    # keep letters, digits, apostrophes and whitespace; strip everything else
    return re.sub(r"[^a-z0-9'\s]", '', ascii_only)


def tokenize(string):
    '''
    Tokenize a string with NLTK's ToktokTokenizer.

    The tokens are handed back as one string (``return_str = True``),
    not as a list.
    '''
    toktok = nltk.tokenize.ToktokTokenizer()
    return toktok.tokenize(string, return_str = True)

def stem(text):
    '''
    Apply NLTK's Porter stemmer to every whitespace-separated word in ``text``.

    Returns the stemmed words joined back together with single spaces.
    '''
    porter = nltk.porter.PorterStemmer()
    stemmed_tokens = (porter.stem(token) for token in text.split())
    return ' '.join(stemmed_tokens)


def lemmatize(text):
    '''
    Lemmatize every whitespace-separated word in ``text`` with NLTK's
    WordNetLemmatizer.

    Returns the lemmas joined back together with single spaces.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(map(lemmatizer.lemmatize, text.split()))


def remove_stopwords(string, extra_words=None, exclude_words=None):
    '''
    Remove English stopwords from a string.

    Parameters
    ----------
    string : str
        Text whose whitespace-separated words are filtered.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Words to take out of the stopword list so they are kept in the text.
        Words that are not in the stopword list are simply ignored.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    '''
    # Reach the corpus through the nltk package rather than the bare name
    # `stopwords`, so the function keeps working even if that name is rebound
    # to something else in the notebook namespace.
    # The corpus fileid is lowercase 'english'; 'English' only resolves on
    # case-insensitive filesystems.
    stopword_set = set(nltk.corpus.stopwords.words('english'))

    # exclusions are applied first, then the extra words are added
    stopword_set -= set(exclude_words or [])
    stopword_set |= set(extra_words or [])

    return ' '.join(word for word in string.split() if word not in stopword_set)


################## ~~~~~~ Mother Prep Function ~~~~~~ ##################

def prepare_nlp_data(df, content = 'content', extra_words=[], exclude_words=[]):
    '''
    Add cleaned, stemmed and lemmatized versions of a text column to ``df``.

    df: dataframe holding the text
    content: name of the column that holds the text
    extra_words: additional stopwords to remove
    exclude_words: words to keep even though they are stopwords

    Three columns are written onto the frame that was passed in, and that same
    frame is returned:
      clean      - basic_clean -> tokenize -> remove_stopwords
      stemmed    - stem applied to 'clean'
      lemmatized - lemmatize applied to 'clean'
    '''
    normalized = df[content].apply(basic_clean)
    tokenized = normalized.apply(tokenize)
    df['clean'] = tokenized.apply(
        remove_stopwords,
        extra_words=extra_words,
        exclude_words=exclude_words,
    )

    for column, transform in (('stemmed', stem), ('lemmatized', lemmatize)):
        df[column] = df['clean'].apply(transform)

    return df


def is_chinese(texts):
    '''
    Report whether a string contains at least one CJK ideograph.

    The check looks for any character in the Unicode range U+4E00-U+9FFF
    (the CJK Unified Ideographs block), so text written with Japanese kanji
    is flagged as well.

    Parameters
    ----------
    texts : str
        The text to scan (a single document, not a dataframe).

    Returns
    -------
    bool
        True if such a character is found, otherwise False.
    '''
    # Always return a real bool (never None) so the result can be used
    # directly as a boolean mask.
    return re.search("[\u4e00-\u9FFF]", texts) is not None



def get_top_4_languages(df, n=4):
    '''
    Keep only the rows whose language is among the most frequent ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``language`` column.
    n : int, default 4
        How many of the most frequent languages to keep.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``language`` is one of the ``n`` most common
        values. The original index labels are preserved.
    '''
    # value_counts sorts by frequency (and skips NaN), so head(n) is the top n
    top_languages = df.language.value_counts().head(n).index
    return df[df.language.isin(top_languages)]


def drop_unneeded_data(df):
    '''
    Reduce the repo dataframe to the rows used for modeling.

    Steps, in order:
      1. drop every row that contains a null
      2. drop every row whose readme_contents is flagged by is_chinese
      3. keep only the rows in the top 4 languages (get_top_4_languages)
      4. renumber the index from 0

    Returns the filtered dataframe; the frame passed in is not modified.
    '''
    df = df.dropna()

    # `!= True` keeps every row for which is_chinese did not return True
    df = df[df.readme_contents.apply(is_chinese) != True]

    df = get_top_4_languages(df)

    # drop=True discards the old index directly; turning it into a column and
    # dropping it by the name 'index' fails when the index is named or when an
    # 'index' column already exists
    return df.reset_index(drop=True)


def split_data(df):
    '''
    Split a dataframe into train, validate and test frames, stratified on the
    ``language`` column.

    20% of the rows are held out as test; 30% of the remaining rows become
    validate and the rest train. Both splits use random_state=123.
    The shape of each frame is printed before (train, validate, test) is returned.

    NOTE(review): train_test_split is not imported in this notebook's import
    cell - presumably it arrives through one of the star imports; confirm.
    '''
    seed = 123

    remainder, test = train_test_split(
        df, test_size=.2, random_state=seed, stratify=df.language
    )
    train, validate = train_test_split(
        remainder, test_size=.3, random_state=seed, stratify=remainder.language
    )

    for label, subset in (('train--->', train),
                          ('validate--->', validate),
                          ('test--->', test)):
        print(label, subset.shape)

    return train, validate, test

In [8]:
# Walk-through of the cleaning steps on a single README (the first row of df).
# The result, `speech`, is the cleaned and prepared text.
text = df['readme_contents'].iloc[0]  # raw README text of the first row
text = text.lower()  # lowercase everything
soup = BeautifulSoup(text, 'html.parser')  # parse any HTML markup in the README
text = soup.get_text()  # keep only the text, dropping the tags
# NFKD-normalize, then drop every character that cannot be encoded as ASCII
text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
text = re.sub(r"[^a-z0-9'\s]", '', text)  # keep letters, digits, apostrophes and whitespace
wnl = nltk.stem.WordNetLemmatizer()
lemmas = [wnl.lemmatize(word) for word in text.split()]  # lemmatize word by word
text_lemma = ' '.join(lemmas)
# Use a dedicated name for the list: assigning it to `stopwords` would replace the
# `stopwords` corpus reader imported from nltk.corpus and break any later call
# to `stopwords.words(...)`.
stopword_list = nltk.corpus.stopwords.words('english')
newStopWords = ['u','ha','wa']  # placeholder extras left over from an earlier exercise; adjust as needed
stopword_list.extend(newStopWords)
words = text_lemma.split()
filtered_words = [w for w in words if w not in stopword_list]  # drop the stopwords
speech = ' '.join(filtered_words)  # join it all back together

# TODO: to run this over every README instead of only the first one, wrap the
# steps above in a function and call it with df['readme_contents'].apply(...).


In [9]:
# Run after the cell above.
# Displays `speech`, the cleaned and prepared text of the first README.
speech



In [10]:
# Preview df; the walk-through above only read from it, so it is unchanged
df.head()

Unnamed: 0,repo,language,readme_contents
0,onevcat/Kingfisher,Swift,"<p align=""center"">\n<img src=""https://raw.gith..."
1,tailwindlabs/tailwindcss,JavaScript,"<p>\n <a href=""https://tailwindcss.com/"" ta..."
2,github/fetch,JavaScript,# window.fetch polyfill\n\nThe `fetch()` funct...
3,ianstormtaylor/slate,TypeScript,"<p align=""center"">\n <a href=""#""><img src=""./..."
4,Kong/insomnia,JavaScript,# Insomnia REST Client\n\n[![Slack Channel](ht...


In [None]:
# Experiment: collect the cleaned text of every README (500+ documents) into one list.
# Requires the 'clean' column, which prepare_nlp_data adds to df.
our_list = df['clean'].to_list()
our_list[:5]  # preview only - displaying the full list would flood the output

In [None]:
# Preview the first rows of df again
df.head()

In [None]:
#### Notes below. Some possible useful code to get rid of the chinese documents below....

In [None]:
# Same single-README cleaning walk-through as earlier in the notebook.
text = df['readme_contents'].iloc[0]
text = text.lower()
soup = BeautifulSoup(text, 'html.parser')
text = soup.get_text()
text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
text = re.sub(r"[^a-z0-9'\s]", '', text)
wnl = nltk.stem.WordNetLemmatizer()
lemmas = [wnl.lemmatize(word) for word in text.split()]
text_lemma = ' '.join(lemmas)
# Use a dedicated name for the list: assigning it to `stopwords` would replace the
# `stopwords` corpus reader imported from nltk.corpus and break any later call
# to `stopwords.words(...)`.
stopword_list = nltk.corpus.stopwords.words('english')
newStopWords = ['u','ha','wa']
stopword_list.extend(newStopWords)
words = text_lemma.split()
filtered_words = [w for w in words if w not in stopword_list]
speech = ' '.join(filtered_words)

In [None]:
url = 'https://github.com/hoppscotch/hoppscotch'
def stuff(url, timeout=10):
    '''
    Fetch a web page and return the text of the paragraphs inside its first
    `.markdown-body` element.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, default 10
        Seconds to wait for the server before giving up.

    Returns
    -------
    list of str
        One entry per <p> element found; an empty list when the page has no
        `.markdown-body` element.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    '''
    # without a timeout a stalled connection would hang the notebook indefinitely
    response = get(url, headers = {'User-Agent': 'Codeup Data Science'}, timeout=timeout)
    # fail loudly on 4xx/5xx instead of parsing an error page
    response.raise_for_status()

    soup = BeautifulSoup(response.text, features='lxml')
    containers = soup.select('.markdown-body')
    if not containers:
        return []

    return [paragraph.text for paragraph in containers[0].select('p')]

In [None]:
# Scrape the paragraph texts from the page at `url`
words = stuff(url)

In [None]:
# Display the scraped paragraph texts
words

In [None]:
def clean_speech(words):
    '''
    Turn a list of text fragments into one cleaned, lemmatized string.

    The fragments are joined with spaces, lowercased, reduced to ASCII, stripped
    of everything except letters, digits, apostrophes and whitespace, lemmatized
    word by word, and finally filtered against the NLTK English stopwords plus
    'u', 'ha' and 'wa'.

    Returns the surviving words joined by single spaces.
    '''
    joined = ' '.join(words).lower()
    ascii_text = (
        unicodedata.normalize('NFKD', joined)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
    )
    stripped = re.sub(r"[^a-z0-9'\s]", '', ascii_text)

    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemma_tokens = [lemmatizer.lemmatize(token) for token in stripped.split()]

    excluded = nltk.corpus.stopwords.words('english') + ['u','ha','wa']
    kept = [token for token in lemma_tokens if token not in excluded]

    return ' '.join(kept)

In [None]:
# Clean the scraped paragraphs and display the resulting string
speech = clean_speech(words)
speech

In [None]:
#This gets rid of the chinese documents
def is_chinese(texts):
    '''
    Report whether a string contains at least one CJK ideograph.

    The check looks for any character in the Unicode range U+4E00-U+9FFF
    (the CJK Unified Ideographs block), so text written with Japanese kanji
    is flagged as well.

    Parameters
    ----------
    texts : str
        The text to scan (a single document, not a dataframe).

    Returns
    -------
    bool
        True if such a character is found, otherwise False.
    '''
    # Always return a real bool (never None) so the result can be used
    # directly as a boolean mask.
    return re.search("[\u4e00-\u9FFF]", texts) is not None

In [None]:
# Keep only the rows whose README is_chinese does not flag as True
df = df[df.readme_contents.apply(is_chinese) !=True]

In [None]:
def basic_clean(string):
    '''
    Normalize a raw string for NLP use.

    Steps, in order:
      1. lowercase the text
      2. NFKD-normalize it and drop every character that cannot be encoded as ASCII
      3. remove everything that is not a lowercase letter, a digit, an apostrophe
         or whitespace

    Apostrophes are kept so that contractions such as "don't" are not collapsed
    into "dont"; the NLTK English stopword list stores contractions with their
    apostrophe.

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    '''
    lowered = string.lower()

    # NFKD splits accented characters into base letter + combining mark; encoding
    # to ASCII with 'ignore' then drops the mark (and any other non-ASCII character)
    ascii_only = (
        unicodedata.normalize('NFKD', lowered)
        .encode('ascii', 'ignore')
        .decode('utf-8')
    )

    # keep letters, digits, apostrophes and whitespace; strip everything else
    return re.sub(r"[^a-z0-9'\s]", '', ascii_only)


def tokenize(string):
    '''
    Tokenize a string with NLTK's ToktokTokenizer.

    The tokens are handed back as one string (``return_str = True``),
    not as a list.
    '''
    toktok = nltk.tokenize.ToktokTokenizer()
    return toktok.tokenize(string, return_str = True)

def stem(text):
    '''
    Apply NLTK's Porter stemmer to every whitespace-separated word in ``text``.

    Returns the stemmed words joined back together with single spaces.
    '''
    porter = nltk.porter.PorterStemmer()
    stemmed_tokens = (porter.stem(token) for token in text.split())
    return ' '.join(stemmed_tokens)


def lemmatize(text):
    '''
    Lemmatize every whitespace-separated word in ``text`` with NLTK's
    WordNetLemmatizer.

    Returns the lemmas joined back together with single spaces.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(map(lemmatizer.lemmatize, text.split()))


def remove_stopwords(string, extra_words=None, exclude_words=None):
    '''
    Remove English stopwords from a string.

    Parameters
    ----------
    string : str
        Text whose whitespace-separated words are filtered.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Words to take out of the stopword list so they are kept in the text.
        Words that are not in the stopword list are simply ignored.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    '''
    # Reach the corpus through the nltk package rather than the bare name
    # `stopwords`, so the function keeps working even if that name is rebound
    # to something else in the notebook namespace.
    # The corpus fileid is lowercase 'english'; 'English' only resolves on
    # case-insensitive filesystems.
    stopword_set = set(nltk.corpus.stopwords.words('english'))

    # exclusions are applied first, then the extra words are added
    stopword_set -= set(exclude_words or [])
    stopword_set |= set(extra_words or [])

    return ' '.join(word for word in string.split() if word not in stopword_set)


################## ~~~~~~ Mother Prep Function ~~~~~~ ##################

def prepare_nlp_data(df, content = 'content', extra_words=[], exclude_words=[]):
    '''
    Add cleaned, stemmed and lemmatized versions of a text column to ``df``.

    df: dataframe holding the text
    content: name of the column that holds the text
    extra_words: additional stopwords to remove
    exclude_words: words to keep even though they are stopwords

    Three columns are written onto the frame that was passed in, and that same
    frame is returned:
      clean      - basic_clean -> tokenize -> remove_stopwords
      stemmed    - stem applied to 'clean'
      lemmatized - lemmatize applied to 'clean'
    '''
    normalized = df[content].apply(basic_clean)
    tokenized = normalized.apply(tokenize)
    df['clean'] = tokenized.apply(
        remove_stopwords,
        extra_words=extra_words,
        exclude_words=exclude_words,
    )

    for column, transform in (('stemmed', stem), ('lemmatized', lemmatize)):
        df[column] = df['clean'].apply(transform)

    return df


def is_chinese(texts):
    '''
    Report whether a string contains at least one CJK ideograph.

    The check looks for any character in the Unicode range U+4E00-U+9FFF
    (the CJK Unified Ideographs block), so text written with Japanese kanji
    is flagged as well.

    Parameters
    ----------
    texts : str
        The text to scan (a single document, not a dataframe).

    Returns
    -------
    bool
        True if such a character is found, otherwise False.
    '''
    # Always return a real bool (never None) so the result can be used
    # directly as a boolean mask.
    return re.search("[\u4e00-\u9FFF]", texts) is not None



def get_top_4_languages(df, n=4):
    '''
    Keep only the rows whose language is among the most frequent ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``language`` column.
    n : int, default 4
        How many of the most frequent languages to keep.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``language`` is one of the ``n`` most common
        values. The original index labels are preserved.
    '''
    # value_counts sorts by frequency (and skips NaN), so head(n) is the top n
    top_languages = df.language.value_counts().head(n).index
    return df[df.language.isin(top_languages)]


def drop_unneeded_data(df):
    '''
    Reduce the repo dataframe to the rows used for modeling.

    Steps, in order:
      1. drop every row that contains a null
      2. drop every row whose readme_contents is flagged by is_chinese
      3. keep only the rows in the top 4 languages (get_top_4_languages)
      4. renumber the index from 0

    Returns the filtered dataframe; the frame passed in is not modified.
    '''
    df = df.dropna()

    # `!= True` keeps every row for which is_chinese did not return True
    df = df[df.readme_contents.apply(is_chinese) != True]

    df = get_top_4_languages(df)

    # drop=True discards the old index directly; turning it into a column and
    # dropping it by the name 'index' fails when the index is named or when an
    # 'index' column already exists
    return df.reset_index(drop=True)


def split_data(df):
    '''
    Split a dataframe into train, validate and test frames, stratified on the
    ``language`` column.

    20% of the rows are held out as test; 30% of the remaining rows become
    validate and the rest train. Both splits use random_state=123.
    The shape of each frame is printed before (train, validate, test) is returned.

    NOTE(review): train_test_split is not imported in this notebook's import
    cell - presumably it arrives through one of the star imports; confirm.
    '''
    seed = 123

    remainder, test = train_test_split(
        df, test_size=.2, random_state=seed, stratify=df.language
    )
    train, validate = train_test_split(
        remainder, test_size=.3, random_state=seed, stratify=remainder.language
    )

    for label, subset in (('train--->', train),
                          ('validate--->', validate),
                          ('test--->', test)):
        print(label, subset.shape)

    return train, validate, test

In [None]:
# Add the 'clean', 'stemmed' and 'lemmatized' columns built from readme_contents.
# NOTE(review): the extra stopwords ('customer', 'customers', '1', '2') look like
# they were carried over from a different dataset - confirm they are wanted here.
df = prepare_nlp_data(df, content = 'readme_contents', extra_words=['customer', 'customers', '1', '2'])

In [None]:
# Preview df; after prepare_nlp_data it carries the 'clean', 'stemmed' and 'lemmatized' columns
df.head()

#### prep function

In [None]:
def prep(df=None, n_languages=5):
    '''
    Filter the repo dataframe down to the rows used for analysis.

    Steps, in order:
      1. drop every row that contains a null
      2. drop every row whose readme_contents is flagged by is_chinese
      3. keep only the rows in the ``n_languages`` most frequent languages
      4. renumber the index from 0

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with ``language`` and ``readme_contents`` columns. When omitted,
        the notebook-level variable ``df`` is used, so a bare ``prep()`` call
        still works.
    n_languages : int, default 5
        How many of the most frequent languages to keep.

    Returns
    -------
    pandas.DataFrame
        The filtered frame; the input frame is not modified.
    '''
    if df is None:
        # Read the module-level frame explicitly. Assigning to `df` inside the
        # function makes the name local, so it cannot be read implicitly.
        df = globals()['df']

    # drop nulls
    df = df.dropna()

    # drop written chinese
    df = df[df.readme_contents.apply(is_chinese) != True]

    # keep the most frequent languages
    top_languages = df.language.value_counts().head(n_languages).index
    df = df[df.language.isin(top_languages)]

    # reindex and discard the old index
    return df.reset_index(drop=True)

In [None]:
# Cleaning steps applied to `text`, which must already exist from an earlier cell.
text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
text = re.sub(r"[^a-z0-9'\s]", '', text)
wnl = nltk.stem.WordNetLemmatizer()
lemmas = [wnl.lemmatize(word) for word in text.split()]
text_lemma = ' '.join(lemmas)
# Use a dedicated name for the list: assigning it to `stopwords` would replace the
# `stopwords` corpus reader imported from nltk.corpus and break any later call
# to `stopwords.words(...)`.
stopword_list = nltk.corpus.stopwords.words('english')
newStopWords = ['u','ha','wa']
stopword_list.extend(newStopWords)
words = text_lemma.split()
filtered_words = [w for w in words if w not in stopword_list]
speech = ' '.join(filtered_words)

In [None]:
# Display the cleaned text produced by the cell above
speech