In [150]:
import os
from urllib.request import urlopen

import markdownify
import pandas as pd
from bs4 import BeautifulSoup

In [151]:
def openFile(url):
    """Download ``url`` and return its HTML parsed by BeautifulSoup, minus scripts.

    Parameters
    ----------
    url : str
        Address handed straight to ``urllib.request.urlopen``.

    Returns
    -------
    BeautifulSoup
        The document parsed with the ``html.parser`` backend, with every
        ``<script>`` element removed.
    """
    # The context manager closes the response even when read() raises;
    # previously it was only closed on the success path.
    with urlopen(url) as file:
        filebytes = file.read()

    # get full html of the url with BeautifulSoup
    html = BeautifulSoup(filebytes, features="html.parser")

    # Remove <script> elements from the tree before returning it.
    for script in html(["script"]):
        script.extract()

    return html

def convertHTMLtoDF(htmlText):
    """Render ``htmlText`` as Markdown and return it one line per DataFrame row.

    The argument is stringified, converted with ``markdownify.markdownify``
    and split on line boundaries; the resulting list of lines is passed to
    the DataFrame constructor.
    """
    markdown_lines = markdownify.markdownify(str(htmlText)).splitlines()
    return pd.DataFrame(markdown_lines)

def getCodeSnippet(df):
    """Collect the contents of every fenced code block in a Markdown DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        One Markdown line per row, stored in column ``0``.

    Returns
    -------
    list of pandas.DataFrame
        One slice of ``df`` per pair of fences, with the fence rows themselves
        excluded. Only lines that are exactly three backticks count as fences.
        A trailing fence with no closing partner is ignored.
    """
    # A frame built from zero lines has no column 0 at all.
    if 0 not in df.columns:
        return []

    # Row *positions* of the fence lines, so slicing stays correct even when
    # the frame does not carry the default 0..n-1 index.
    fence_positions = (df[0] == "```").to_numpy().nonzero()[0]

    # Pair fences as (opening, closing). zip() stops at the shorter slice, so
    # an odd number of fences no longer raises IndexError.
    codesnippet = []
    for start, stop in zip(fence_positions[0::2], fence_positions[1::2]):
        codesnippet.append(df.iloc[start + 1:stop])

    return codesnippet

def exportScrapOutput(url, df, codesnippet):
    """Write the scraped snippets to ``output/<parent>-<leaf>.cs``.

    Parameters
    ----------
    url : str
        Source URL; its last two path segments form the file name.
    df : pandas.DataFrame
        Unused here; kept so existing callers keep working.
    codesnippet : iterable of pandas.DataFrame
        Snippets whose column ``0`` holds one line of code per row.

    Returns
    -------
    str
        Path of the file that was written.
    """
    # default output filename: last two segments of the url path.
    # Trailing slashes are dropped first so ".../a/b/" names the file "a-b"
    # instead of "b-"; join() also copes with a url that has no "/" at all.
    urlsplit = url.rstrip('/').split('/')
    filename = '-'.join(urlsplit[-2:])
    fileextension = '.cs'
    filepath = 'output/' + filename + fileextension

    # Snippets are separated by a single newline, with none after the last.
    text = '\n'.join(code[0].str.cat(sep='\n') for code in codesnippet)

    # Create the target directory on first use rather than failing in open().
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    # `with` guarantees the handle is closed even if the write fails.
    with open(filepath, 'w') as outfile:
        outfile.write(text)

    return filepath

In [152]:
def handleFile(filepath):
    """Read a text file into a DataFrame with one line per row in column ``0``.

    Parameters
    ----------
    filepath : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (column label ``0``) holding the file's lines
        without their line terminators. An empty file yields an empty frame
        that still has column ``0``.
    """
    # `with` closes the handle even if read() raises.
    with open(filepath, 'r') as file:
        lines = file.read().splitlines()

    # Naming the column explicitly keeps column 0 present for an empty file,
    # so later `df[0]` lookups do not raise KeyError.
    return pd.DataFrame(lines, columns=[0])
    
# search code per line
def search(desiredCode, df):
    """Return the position of the first row of ``df[0]`` containing ``desiredCode``.

    ``desiredCode`` is stripped of surrounding whitespace before the lookup.
    Blank input and input containing ``{`` or ``}`` is never searched for and
    yields ``None``, as does input that matches no row. Matching is a plain
    substring test against each row, top to bottom.
    """
    needle = desiredCode.strip()

    # Blank lines and brace lines are skipped outright.
    if not needle or '{' in needle or '}' in needle:
        return None

    hits = (position for position, line in enumerate(df[0]) if needle in line)
    return next(hits, None)

def compareDF(df1, df2):
    """Find the lines of ``df2`` that also occur in ``df1``.

    Each row of ``df2[0]`` is looked up in ``df1`` with ``search``.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames holding one line of code per row in column ``0``.

    Returns
    -------
    tuple of (list, list)
        Two parallel lists: the positions found in ``df1`` and the positions
        of the corresponding rows in ``df2``. The same ``df1`` position can
        appear more than once when several ``df2`` rows match it.
    """
    df1_same_index = []
    df2_same_index = []

    for index, code in enumerate(df2[0]):
        found_index = search(code, df1)
        # Identity test, not truthiness: position 0 is a valid match.
        # (The per-row debug print that used to sit here was removed.)
        if found_index is not None:
            df1_same_index.append(found_index)
            df2_same_index.append(index)

    return df1_same_index, df2_same_index

def commentSameCode(df1, df1_same_index, df2, df2_same_index):
    """Return copies of ``df1`` and ``df2`` with the listed rows prefixed by ``//``.

    The inputs are left untouched. Each prefixed value is built from the
    original frame, so a row listed more than once is still prefixed once.
    """
    def _commented(source, rows):
        # Write into a copy while always reading from the original frame.
        result = source.copy()
        for row in rows:
            result.loc[row, 0] = '//' + source[0][row]
        return result

    return _commented(df1, df1_same_index), _commented(df2, df2_same_index)

def exportCompareOutput(filename, outdf, index):
    """Write ``outdf`` to ``output/comparator/out<index>_<filename>``.

    Parameters
    ----------
    filename : str
        Base name appended after the ``out<index>_`` prefix.
    outdf : pandas.DataFrame
        Frame whose column ``0`` holds one line of text per row.
    index : str or int
        Tag placed after ``out`` in the file name.

    Returns
    -------
    str
        Path of the file that was written.
    """
    # str() lets callers pass 1 as well as '1'.
    filepath = 'output/comparator/out' + str(index) + '_' + filename
    codetext = outdf[0].str.cat(sep='\n')

    # Create the target directory on first use rather than failing in open().
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    # `with` guarantees the handle is closed even if the write fails.
    with open(filepath, 'w') as outfile:
        outfile.write(codetext)

    return filepath

In [153]:
def scrapper(url):
    """Scrape the code snippets found at ``url`` into a file and return its path.

    Pipeline: ``openFile`` -> ``convertHTMLtoDF`` -> ``getCodeSnippet`` ->
    ``exportScrapOutput``.
    """
    page_df = convertHTMLtoDF(openFile(url))
    snippets = getCodeSnippet(page_df)
    return exportScrapOutput(url, page_df, snippets)

In [154]:
def comparator(filepath1, filepath2):
    """Comment out the lines two files share and write both results to disk.

    Both files are loaded with ``handleFile``, matched with ``compareDF`` and
    marked with ``commentSameCode``. Each result is written through
    ``exportCompareOutput`` under the input's base name, tagged ``'1'`` and
    ``'2'`` respectively.

    Returns the two output paths, first file first.
    """
    left = handleFile(filepath1)
    right = handleFile(filepath2)
    left_hits, right_hits = compareDF(left, right)
    left_out, right_out = commentSameCode(left, left_hits, right, right_hits)

    # Export in input order; the base name is whatever follows the last '/'.
    jobs = ((filepath1, left_out, '1'), (filepath2, right_out, '2'))
    written = tuple(
        exportCompareOutput(path.split('/')[-1], frame, tag)
        for path, frame, tag in jobs
    )
    return written

In [155]:
def main(urlFileList):
    """Scrape each URL and compare the result with its paired input file.

    ``urlFileList`` is an iterable of ``(url, input file name)`` pairs. The
    first time a file name appears it is read from ``input/``. On later
    appearances the previous comparison output for that name is used instead,
    so the comments accumulate across URLs.
    """
    latest_output = {}

    for url, input_name in urlFileList:
        if input_name in latest_output:
            # Continue from the last comparison result for this input.
            print("--", latest_output[input_name])
            input_path = latest_output[input_name]
        else:
            input_path = 'input/' + input_name

        outpath = scrapper(url)
        _, compared_input = comparator(outpath, input_path)

        print(outpath, compared_input)

        latest_output[input_name] = compared_input

In [156]:
# Work list for main(): [url, input file name] pairs. main() reads the input
# file from 'input/' and scrapes the url over the network.
# NOTE(review): '<<url-link>>' looks like a placeholder — replace it with a real
# URL before running this cell.
fileList = [['<<url-link>>', 'input2.cs']]

main(fileList)

output/module-6-add-leaderboard-menu.cs output/comparator/out2_LeaderboardsMenu.cs
output/module-6-add-leaderboard-menu.cs output/comparator/out2_LeaderboardsPeriodMenu.cs
output/module-6-use-the-sdk-to-get-leaderboard.cs output/comparator/out2_LeaderboardEssentialsWrapper.cs
-- output/comparator/out2_LeaderboardsMenu.cs
output/module-6-put-it-all-together.cs output/comparator/out2_out2_LeaderboardsMenu.cs
-- output/comparator/out2_LeaderboardsPeriodMenu.cs
output/module-6-put-it-all-together.cs output/comparator/out2_out2_LeaderboardsPeriodMenu.cs
output/module-6-put-it-all-together.cs output/comparator/out2_IndividualLeaderboardMenu.cs
output/module-7-add-a-login-menu.cs output/comparator/out2_SinglePlatformAuthWrapper.cs
-- output/comparator/out2_SinglePlatformAuthWrapper.cs
output/module-7-use-sdk-to-login.cs output/comparator/out2_out2_SinglePlatformAuthWrapper.cs
-- output/comparator/out2_out2_SinglePlatformAuthWrapper.cs
output/module-7-put-it-all-together.cs output/comparator/o