In [1]:
# imports
import requests
import json
import os
import sys
import time
import pandas as pd
import numpy as np
import platform
import datetime
import re

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options

In [3]:
# generic store data to file function
def store_data(data, file, mode='w', toJson=False):
    """Write ``data`` to ``file`` as UTF-8 text.

    Args:
        data: Text to write, or any JSON-serialisable object when
            ``toJson`` is true.
        file: Path of the target file.
        mode: Mode passed to ``open`` ('w' overwrites, 'a' appends).
        toJson: Serialise ``data`` with ``json.dumps`` before writing.

    Returns:
        The value returned by ``fp.write`` (number of characters written).
    """
    payload = json.dumps(data) if toJson else data
    with open(file, mode, encoding='utf-8') as handle:
        return handle.write(payload)
    
# generic load data from file function
def load_data(file, fromJson=False):
    """Read ``file`` as UTF-8 text, optionally parsing it as JSON.

    Undecodable bytes are ignored while reading.

    Args:
        file: Path of the file to read.
        fromJson: Parse the file content with ``json.loads``.

    Returns:
        The file content (str), the parsed JSON value when ``fromJson`` is
        true, or the sentinel string 'file not found' when ``file`` is not
        an existing regular file.
    """
    # Callers test for this sentinel, so it is returned rather than raised.
    if not os.path.isfile(file):
        return 'file not found'

    with open(file, 'r', encoding='utf-8', errors="ignore") as handle:
        content = handle.read()

    return json.loads(content) if fromJson else content

# test text
#print(store_data('Hello', '../data/repositories/mlart/test.txt'))
#print(load_data('../data/repositories/mlart/test.txt'))

# test json
#print(store_data({'msg':'Hello World'}, '../data/repositories/mlart/test.json', toJson=True))
#print(load_data('../data/repositories/mlart/test.json', fromJson=True))

#store_data(result[0]['html'], '../data/repositories/kaggle/notebook.html')
#store_data(result[0]['iframe'], '../data/repositories/kaggle/kernel.html')

In [4]:
# helper function: create the parent folder of a file path if it does not exist yet
def create_folder(path):
    """Ensure the parent directory of ``path`` exists.

    Only ``os.path.dirname(path)`` is created, so pass a file path or a
    directory path with a trailing separator. Prints ``path + ' created'``
    when the directory had to be created.

    Args:
        path: File path whose parent directory should exist.

    Raises:
        OSError: If the directory cannot be created for any reason other
            than it already existing.
    """
    folder = os.path.dirname(path)
    # A bare file name has no directory part; os.makedirs('') would fail.
    if not folder or os.path.exists(folder):
        return

    try:
        os.makedirs(folder)
    except FileExistsError:
        # Guard against race condition: someone else created it meanwhile.
        return
    print(path + ' created')

In [37]:
# scan text for predefined terms

# sample sentence used to try out the matching helpers
text = 'We use LSTM for anomaly and object detection. As Convolutional Neural Networks are great for ML.'

# vocabulary: full terms, short slugs and the tag each one maps to;
# empty csv cells are read as NaN and filtered out of slugs and tags
pd_ml_terms = pd.read_csv('../data/patterns/ml_terms.csv')
ml_terms = pd_ml_terms['Term'].tolist()
ml_slugs = [slug for slug in pd_ml_terms['Slug'].tolist() if str(slug) != 'nan']
ml_tags = [tag for tag in pd_ml_terms['Tag'].tolist() if str(tag) != 'nan']

# python package names of known ML libraries
ml_libs = pd.read_csv('../data/patterns/ml_libraries.csv')['Python Package'].tolist()

def match_text(haystack, needles, toLower = False, unique = True):
    """Return the needles that occur as substrings of ``haystack``.

    Args:
        haystack: Text to search in.
        needles: Iterable of candidate strings. Entries that are not
            strings (e.g. NaN values from an empty csv cell) are skipped.
        toLower: Lower-case both haystack and needles before comparing;
            the returned matches are then lower-cased as well.
        unique: Drop duplicate matches.

    Returns:
        List of matching needles in the order they appear in ``needles``.
    """
    # NaN needles would break both .lower() and the `in` test below.
    candidates = [needle for needle in needles if isinstance(needle, str)]

    if toLower:
        haystack = haystack.lower()
        candidates = [needle.lower() for needle in candidates]

    matches = [needle for needle in candidates if needle in haystack]

    if unique:
        # dict.fromkeys removes duplicates but, unlike a set, keeps the
        # order stable between runs.
        matches = list(dict.fromkeys(matches))

    return matches

def match_tags(haystack):
    """Map matched terms / slugs to their tags.

    Reads '../data/patterns/ml_terms.csv' and collects the 'Tag' of every
    row whose 'Term' or 'Slug' equals one of the items in ``haystack``.
    Rows without a tag are ignored.

    Args:
        haystack: Iterable of matched strings (terms and/or slugs).

    Returns:
        List of unique tags as strings, in no particular order.

    Raises:
        KeyError: If the csv lacks a 'Term', 'Slug' or 'Tag' column.
    """
    df = pd.read_csv('../data/patterns/ml_terms.csv')

    # Only strings can be terms or slugs; this also keeps NaN out of isin().
    wanted = [item for item in haystack if isinstance(item, str)]

    # Test for missing tags explicitly: a substring check for 'nan' would
    # also discard real tags such as 'Finance'.
    has_tag = df['Tag'].notna()

    tags = set()
    for key_column in ('Term', 'Slug'):
        hits = df[key_column].isin(wanted) & has_tag
        tags.update(df.loc[hits, 'Tag'].astype(str))

    return list(tags)

#ml_slugs, ml_terms, ml_libs, match_text(haystack, needles, toLower = False, unique = True)
# vocabularies by name, and whether each one is matched case-insensitively
# (both tables are defined here but not read by the code below)
needles = dict(ml_slugs=ml_slugs, ml_terms=ml_terms, ml_libs=ml_libs)
needles_need_str_lower = dict(ml_slugs=False, ml_terms=True, ml_libs=False)

# terms are matched case-insensitively, slugs case-sensitively
matches = match_text(text, ml_terms, True) + match_text(text, ml_slugs, False)
print('matches', matches)

tags = match_tags(matches)
print('tags', tags)

matches ['object detection', 'convolutional neural network', 'lstm', 'anomaly', 'detect', 'neural network', 'ML']
tags ['Object Detection', 'NN', 'LSTM', 'ML', 'CNN']


In [38]:
# get file creation / modification date
# https://stackoverflow.com/questions/237079/how-to-get-file-creation-modification-date-times-in-python

def creation_date(path_to_file, datetime = True):
    """
    Return when a file was created, or when it was last modified if the
    platform does not expose a creation time.
    See http://stackoverflow.com/a/39501288/1709587 for explanation.

    Args:
        path_to_file: Path of the file to inspect.
        datetime: When True, return the local time formatted as
            'YYYY-MM-DD HH:MM:SS'; otherwise return the raw timestamp.
            (The parameter shadows the ``datetime`` module inside this
            function.)

    Returns:
        Formatted string or the timestamp in seconds since the epoch.
    """
    if platform.system() == 'Windows':
        timestamp = os.path.getctime(path_to_file)
    else:
        info = os.stat(path_to_file)
        # st_birthtime is missing on Linux; fall back to the time the
        # content was last modified.
        timestamp = getattr(info, 'st_birthtime', info.st_mtime)

    if datetime != True:
        return timestamp

    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp))
    
# location of one scraped Kaggle notebook, split into its path segments
folder_base = '../data/repositories/kaggle/competitions/c/'
folder = '3d-object-detection-for-autonomous-vehicles/notebooks/asimandia/lyft3d-inference-kernel/'
notebook = 'notebook_02.html'
kernel = 'kernel.html'

# the segments already carry their trailing slashes, so plain joining is enough
print(creation_date(''.join((folder_base, folder, notebook))))

2020-12-12 16:08:03


In [39]:
# clear text formatting
import re
def clear_text(s):
    """Flatten a text snippet to a single, trimmed line.

    Newlines become spaces, carriage returns and pilcrow signs ('¶') are
    removed, surrounding whitespace is stripped, and every run of two or
    more whitespace characters is collapsed into one space.

    Args:
        s: Text to clean.

    Returns:
        The cleaned string.
    """
    s = s.replace('\n', ' ').replace('\r', '').replace('¶', '').strip()
    # Raw string: '\s' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return re.sub(r"\s\s+", " ", s)

print(clear_text("The   fox jumped   over    the log."))

The fox jumped over the log.


In [36]:
# scrape publications list

# saved HTML page that is loaded and passed to scrape_publication below
path_in  = '../data/repositories/manual/zalando.com/research-publications/index.html'
# semicolon-separated csv the scraped publication list is written to
path_out = '../data/datasets/zalando_publications_01.csv'

def scrape_publication(html):
    """Extract the publication entries from a publications page.

    Looks for ``div.portelement`` entries inside ``div.entry-content``.
    For each entry the text of ``div.description-block_2`` is split at the
    literal 'Authors:' marker: the part before it is the venue line, the
    part after it the author list.

    Args:
        html: HTML source of the page.

    Returns:
        List of dicts, one per entry, with the keys 'title', 'authors' and
        'publicate_at'. 'year' is only present when the venue line contains
        a four-digit number (the last one found is used), and 'link' only
        when the entry has an anchor inside ``div.button-block``. The list
        is empty when the page has no ``div.entry-content``.
    """
    soup = BeautifulSoup(html, 'html.parser')

    container = soup.find('div', class_='entry-content')
    if container is None:
        # Not a publications page (e.g. an error page or placeholder text).
        return []

    items = []
    for item in container.find_all('div', class_='portelement'):
        meta = {}

        heading = item.find('h3')
        meta['title'] = heading.text.strip() if heading is not None else ''

        description = item.find('div', class_='description-block_2')
        parts = description.text.split('Authors:') if description is not None else ['']
        # Entries without an 'Authors:' marker yield an empty author string.
        meta['authors'] = parts[1].strip() if len(parts) > 1 else ''

        venue = parts[0].replace('\n', '').strip()
        meta['publicate_at'] = venue
        # Greedy '.*' makes the group capture the last four digits in the line.
        match = re.match(r'.*([0-9]{4})', venue)
        if match is not None:
            meta['year'] = match.group(1)

        button = item.find('div', class_='button-block')
        anchor = button.find('a') if button is not None else None
        if anchor is not None:
            meta['link'] = anchor.get('href')

        items.append(meta)

    return items


# Load the saved page; load_data signals a missing file with a sentinel string.
html = load_data(path_in)
if html == 'file not found':
    # Stop here instead of failing later inside the scraper with an
    # unrelated error. Exact comparison, so a real page that merely
    # contains this phrase is not rejected.
    raise FileNotFoundError(path_in)

meta = scrape_publication(html)
# Show the entry count and the first entry as a sanity check.
print(len(meta), meta[0] if meta else None)

df = pd.DataFrame(meta)
print(df)
df.to_csv(path_out, sep=';', index=False)
print('done')

36 {'title': 'Contextual BERT: Conditioning the Language Model Using a Global State', 'authors': 'Denk, Peleteiro', 'publicate_at': 'Accepted to Coling – TextGraphs-14 workshop (2020)', 'year': '2020', 'link': 'https://arxiv.org/abs/2010.15778'}
                                                title  \
0   Contextual BERT: Conditioning the Language Mod...   
1   Towards User-in-the-Loop Online Fashion Size R...   
2   Attention Gets You the Right Size and Fit in F...   
3   Personalized Size Recommendations with Human i...   
4   Task-Aware Representation of Sentences for Gen...   
5   Outfit Generation and Recommendation – An Expe...   
6           Learning Size and Fit from Fashion Images   
7   Meta-learning for Size and Fit Recommendation ...   
8   SizeNet: Weakly Supervised Learning of Visual ...   
9   Transform the Set: Memory Attentive Generation...   
10  Generating High-Resolution Fashion Model Image...   
11  Eigendecompositions of Transfer Operators in R...   
12  A Deep Le

In [62]:
# throw all json-files into a single csv

# Each scraped source maps to (input folder, output csv).
# Change `source` to export a different one.
sources = {
    'zalando_blog': (
        '../data/repositories/manual/zalando.com/blog/',
        '../data/database/zalando_blog_01.csv',
    ),
    'zalando_jobs': (
        '../data/repositories/manual/zalando.com/jobs/',
        '../data/database/zalando_jobs_01.csv',
    ),
    'zalando_projects': (
        '../data/repositories/manual/zalando.com/research-projects/',
        '../data/database/zalando_projects_01.csv',
    ),
    'medium_fintech': (
        '../data/repositories/manual/medium.com/Applications of Machine Learning in FinTech/',
        '../data/database/medium_fintech_01.csv',
    ),
    'bcgdv_hackaton': (
        '../data/repositories/manual/bcgdv.com/dv_hacks/',
        '../data/database/bcgdv_hackaton_01.csv',
    ),
    'bcgdv_founded': (
        '../data/repositories/manual/bcgdv.com/founded_company/',
        '../data/database/bcgdv_founded_01.csv',
    ),
}
source = 'bcgdv_founded'
path_in, csv_out = sources[source]

# article.json holds the fields shared by every item of the source
article = load_data(path_in+'article.json', fromJson=True)
if article == 'file not found':
    raise FileNotFoundError(path_in+'article.json')
quit = 0 # quit after n files processed / 0 ... no limit

path_in = path_in+'items/'
# os.listdir returns entries in arbitrary order; sort for reproducible output
files = sorted(os.listdir(path_in))
print('files:', len(files))
i = 0

runtime_start = time.time()
# Collect plain dicts and build the frame once: DataFrame.append copied the
# whole frame on every call and no longer exists in pandas 2.x.
records = []

for file in files:
    path = os.path.join(path_in, file)
    if not os.path.isfile(path) or '_preset' in file:
        continue

    print('#', i, file)
    i += 1

    # item fields override the shared article fields
    data = {**article}
    data.update(load_data(path, fromJson=True))

    # normalise whichever free-text fields the item has
    for key in ('text', 'description', 'about'):
        if key in data:
            data[key] = clear_text(data[key])

    # store item
    records.append(data)

    # i counts the files processed so far
    if quit != 0 and i >= quit:
        break

df = pd.DataFrame(records)

runtime_end = time.time()
print('runtime:', round(runtime_end - runtime_start, 3), 'seconds for', i, 'items')
print(df.shape)
print(df.head())

df.to_csv(csv_out, sep=';', index=False)
print('done')

files: 14
# 0 Adversarial-Learning.json
# 1 Determinantal-Point-Processes.json
# 2 Fashion-DNA.json
# 3 Fashion-on-People-Images.json
# 4 Fashion-Renderer.json
# 5 Forecasting-Customers-Preference.json
# 6 Generative-Fashion-Design.json
# 7 Language-Modeling.json
# 8 Personalized-Size-Recommendation.json
# 9 Robust-Reinforcement-Learning.json
# 10 Sample-Efficient-Reinforcement-Learning.json
runtime: 0.09 seconds for 11 items
(11, 15)
  branche            category claps company        date date_scraped  \
0          Wholesale & Retail                             17.01.2021   
1          Wholesale & Retail                             17.01.2021   
2          Wholesale & Retail                             17.01.2021   
3          Wholesale & Retail                             17.01.2021   
4          Wholesale & Retail                01.10.2020   17.01.2021   

                                         description field  \
0  We have an article embedding which encodes, in...         
1  D