# Crawling class tests

## Import files and other settings

In [1]:
import django
import os
# Select the project's settings module unless DJANGO_SETTINGS_MODULE is
# already set in the environment (setdefault never overrides an existing value).
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "byteme.settings.base")
# Initialise Django for standalone (non-manage.py) use; this has to run
# before the model imports in the following cell.
django.setup()

In [2]:
from crawler.models import Crawler
from accounts.models import Speaker, UserProfile, User
from events.tag import Tag
from events.models import Event
from events.views import approveEventChange
from time import sleep
import csv
import json
import pickle
import sys
import random
from datetime import datetime
from django.utils import timezone

# Raise the interpreter's recursion limit to 100000.
# NOTE(review): presumably needed for deeply nested crawl results or for
# pickling them — confirm; a limit this high can crash the interpreter
# instead of raising RecursionError.
sys.setrecursionlimit(100000)
# One crawler instance is shared by all cells below, with verbose output off.
my_crawler = Crawler()
my_crawler.verbose = False

## Crawling

In [None]:
# Search terms passed to the crawler, one per institution. The strings are
# sent as queries, so they have to be spelled correctly
# ('Tsinghua', not 'Tshinghua').
univ_list = [
    'Kaist',
    'Korea Advanced Institute of Science and Technology',
    'Stanford University',
    'Cambridge',
    'MIT',
    'Yale',
    'Georgia Institute of Technology',
    'Harvard University',
    'ETH Zurich',
    'EPFL',
    'Oxford University',
    'Imperial College London',
    'NUS',
    'NTU',
    'Princeton',
    'Cornell',
    'Tsinghua',
]
# One independent result slot per institution, filled in by index later.
univ_scholar_ids = [[] for _ in univ_list]

In [None]:
# NOTE: the author marked the original run of this cell as failed.
# Crawl the scholar ids of every university. `univ_scholar_ids` already holds
# one slot per university, so each result is stored by index only; appending
# as well would grow the list to twice its intended length.
# NOTE(review): the second argument (50) is presumably a per-university
# limit — confirm against Crawler.crawl_univ_scholar_ids.
for count, univ in enumerate(univ_list):
    try:
        ids = my_crawler.crawl_univ_scholar_ids(univ, 50)
    except Exception as exc:
        # Best-effort crawl: keep going with an empty result, but report which
        # university failed and why instead of hiding the error.
        ids = []
        print('Exception Occured', univ, repr(exc))
    else:
        print(count, univ, len(ids))
    univ_scholar_ids[count] = ids
    # Pause between universities (presumably to avoid being rate-limited).
    sleep(5)

In [None]:
# Gather the distinct scholar ids stored in scholars.csv. The first column
# of every row is skipped; only the remaining cells are collected.
id_pool = set()
with open('scholars.csv', newline='\n') as scholars_file:
    for record in csv.reader(scholars_file, delimiter=',', quotechar='|'):
        id_pool.update(record[1:])
all_ids = list(id_pool)
print(len(all_ids))

In [None]:
# Crawl every collected scholar id, printing progress as "<position> :  <id>".
scholar_dic_list = []
for position, scholar_id in enumerate(all_ids):
    print(position, ': ', scholar_id)
    scholar_dic_list.append(my_crawler.crawl_scholar(scholar_id))

In [None]:
# Persist the crawl results: one CSV row per university, holding the
# university name followed by its scholar ids.
with open('scholars.csv', 'w', newline='\n') as out_file:
    writer = csv.writer(out_file, delimiter=',', quotechar='|',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerows(
        [univ, *ids] for ids, univ in zip(univ_scholar_ids, univ_list)
    )

In [None]:
# Cache the crawled scholar dictionaries on disk so later cells can reuse
# them without crawling again.
with open('crawling_output/scholar_crawled.pickle', 'wb') as pickle_file:
    pickler = pickle.Pickler(pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
    pickler.dump(scholar_dic_list)

## Load scholar info and set up users

In [3]:
# Load the cached scholar dictionaries written by the crawling section.
# Unpickling can execute arbitrary code, so only load a file you produced yourself.
with open('crawling_output/scholar_crawled.pickle', 'rb') as pickle_file:
    scholar_dic_pickle = pickle.Unpickler(pickle_file).load()

In [4]:
# Create one Speaker per crawled scholar, keeping scholar_list index-aligned
# with scholar_dic_pickle.
# NOTE(review): an empty dict raises KeyError on 'name' before the emptiness
# check further down is reached — confirm whether empty entries can occur.
scholar_list = []
for scholar_dic in scholar_dic_pickle:
    name = scholar_dic['name']
    # Scholars without a recorded association default to 'Kaist'.
    univ = scholar_dic.get('association', 'Kaist')
    # Placeholder address: name without spaces, a random 0-100 suffix, and the
    # first word of the association as the domain, all lower-cased.
    local_part = name.replace(' ', '') + str(random.randint(0, 100))
    domain = univ.partition(' ')[0]
    email = (local_part + '@' + domain + '.edu').lower()
    scholar = Speaker.objects.create(name=name, univ=univ, speakerEmail=email)
    if scholar_dic:
        if 'field_of_study' in scholar_dic:
            # Replace the raw fields of study with the objects the crawler returns.
            scholar_dic['tags'] = my_crawler.update_tag_info(
                scholar_dic['field_of_study']
            )
        my_crawler.update_scholar_info(scholar, scholar_dic)
    scholar_list.append(scholar)

In [5]:
# Read the pool of image URLs: the first cell of every row in images.csv.
image_urls = []
with open('crawling_output/images.csv', newline='\n') as images_file:
    image_urls.extend(
        record[0]
        for record in csv.reader(images_file, delimiter=',', quotechar='|')
    )

In [6]:
num_random_scholars = 50
# Choose distinct random positions into scholar_dic_pickle. random.sample
# guarantees uniqueness in one call; capping the sample size at the population
# size avoids the endless loop the retry-until-full approach runs into when
# fewer than num_random_scholars scholars were loaded.
scholar_count = len(scholar_dic_pickle)
index_list = random.sample(
    range(scholar_count), min(num_random_scholars, scholar_count)
)

[128, 0, 131, 136, 140, 141, 12, 14, 144, 17, 18, 147, 29, 160, 36, 165, 169, 42, 171, 45, 174, 47, 177, 50, 179, 52, 53, 56, 184, 58, 60, 192, 65, 69, 71, 75, 79, 81, 84, 87, 96, 100, 102, 106, 109, 111, 113, 120, 123, 124]


In [7]:
# Superuser account plus its profile.
admin = User.objects.create_superuser(
    username='admin',
    email='berk17@gmail.com',
    password='password@',
)
UserProfile.objects.create(user=admin)

# Staff account; its profile (`user`) is used as the creator of the generated events.
username, passw, email, staff = 'myaldiz', '1234', 'myaldiz@kaist.ac.kr', True
django_user = User.objects.create_user(
    username,
    email=email,
    password=passw,
    is_staff=staff,
)
user = UserProfile.objects.create(user=django_user)
print(User.objects.all(), UserProfile.objects.all())

<QuerySet [<User: admin>, <User: myaldiz>]> <QuerySet [<UserProfile: admin, KAIST>, <UserProfile: myaldiz, KAIST>]>


In [8]:
# Load the sample events and split them into per-field pools that the event
# generator draws from independently.
with open('crawling_output/Events.json') as events_file:
    vals = json.load(events_file)
sample_events = vals['Events']
abstract_list = [entry['abstract'] for entry in sample_events]
place_list = [entry['place'] for entry in sample_events]
title_list = [entry['title'] for entry in sample_events]
details_list = [entry['details'] for entry in sample_events]

In [9]:
# Generate one pending "add" event per selected scholar, with randomly chosen
# content, a random date in 2019 and up to 10 random tags.
#
# The tag table is read once up front; indexing a fresh Tag.objects.all() and
# taking its len() on every draw evaluates the whole table repeatedly.
all_tags = list(Tag.objects.all())
# Cap at the number of available tags so sampling cannot spin forever (or
# fail) when fewer than 10 tags exist.
tags_per_event = min(10, len(all_tags))

for index in index_list:
    scholar = scholar_list[index]
    image_url = random.choice(image_urls)
    # Noon on a random day in 2019; the day is capped at 25 so it is valid in
    # every month.
    time = timezone.now().replace(
        year=2019,
        month=random.randint(1, 12),
        day=random.randint(1, 25),
        hour=12,
        minute=0,
    )
    abstract = random.choice(abstract_list)
    place = random.choice(place_list)
    title = random.choice(title_list)
    details = random.choice(details_list)
    cur_event = Event.objects.create(
        creater=user, speaker=scholar, time=time, timeReq=time,
        speakerReq=scholar, req="add", placeReq=place, titleReq=title,
        detailsReq=details, abstractReq=abstract, imgurLReq=image_url,
    )

    # Distinct random tags for this event.
    tags = random.sample(all_tags, tags_per_event)
    cur_event.tags.add(*tags)

    # randint(0, 2) is non-zero two times out of three, so roughly two thirds
    # of the events get approved.
    if random.randint(0, 2):
        approveEventChange(cur_event.identifier, req="add")
    # NOTE(review): if approveEventChange saves its own copy of the event, this
    # save() of the in-memory instance may overwrite the approval — confirm.
    cur_event.save()

## Manual Speaker Crawling

In [None]:
# Three hand-written speakers for the manual crawl test (only the third one
# sets a university explicitly).
s1 = Speaker(name='Min H Kim', speakerEmail='1@kaist.ac.kr')
s2 = Speaker(name='Daniel Suk Jeon', speakerEmail='2@kaist.ac.kr')
s3 = Speaker(name='Osman', univ='Stanford', speakerEmail='3@kaist.ac.kr')
for new_speaker in (s1, s2, s3):
    new_speaker.save()
print(Speaker.objects.all())

In [None]:
# Re-fetch the first three speakers from the database, one indexed lookup each.
speaker_rows = Speaker.objects.all()
s1, s2, s3 = speaker_rows[0], speaker_rows[1], speaker_rows[2]

In [None]:
# Ask the crawler to process each of the three speakers, in order.
for target_speaker in (s1, s2, s3):
    my_crawler.scholar_crawl_request(target_speaker)

In [None]:
# Show every tag, then the tags attached to each speaker.
print(Tag.objects.all())
for current_speaker in Speaker.objects.all():
    speaker_tags = current_speaker.tags.all()
    print(speaker_tags)

## Individual score testing

In [None]:
# Dump the current contents of the four tables used by the score test.
for label, model in (
    ('UserProfiles: ', UserProfile),
    ('Speakers: ', Speaker),
    ('Tags:', Tag),
    ('Events:', Event),
):
    print(label, model.objects.all())

In [None]:
# Use the first row of each table as sample input for the score test. This
# rebinds `user`, which an earlier cell set to the staff profile.
# NOTE(review): no explicit ordering is applied here, so which row comes
# first depends on the models' default ordering — confirm.
event = Event.objects.all()[0]
user = UserProfile.objects.all()[0]

In [None]:
# Compute and show the ranking score of the sample event for the sample user.
score = event.generateRankingScore(user)
print(score)

In [None]:
# Bare expression: the notebook displays the citations value of the sample
# event's speaker as the cell result.
event.speaker.citations

## Pytrends Test

In [None]:
import pytrends
from pytrends.request import TrendReq

# Google Trends client with US-English locale.
# NOTE(review): per the pytrends README, tz is a timezone offset in minutes
# (360 = US CST) — confirm this is the intended zone.
pytrend = TrendReq(hl='en-US', tz=360)
# Alternative kept for reference: the same client routed through an HTTPS proxy.
#pytrend = TrendReq(hl='en-US', tz=360, proxies = {'https': 'https://34.203.233.13:80'})

In [None]:
# Search terms to compare on Google Trends. They are sent verbatim, so the
# spelling has to be right: 'adversarial', not 'adverserial', otherwise the
# trend data describes the misspelled query.
kw_list = ['generative adversarial network', 'neural machine translation', 'neural turing machine']
# NOTE(review): per the pytrends README, 'today 5-y' selects the last five
# years, and cat=0 / geo='' / gprop='' leave category, region and property
# unrestricted — confirm.
pytrend.build_payload(kw_list, cat=0, timeframe='today 5-y', geo='', gprop='')

In [None]:
# Fetch the interest-over-time result for the payload built above and print it.
interest_over_time_df = pytrend.interest_over_time()
print(interest_over_time_df)

In [None]:
def parse_scholar_id(in_str):
    """Extract the value of the ``user=`` query parameter from a Scholar URL.

    Args:
        in_str: URL (or URL fragment) such as
            ``https://scholar.google.com/citations?user=ABC&hl=en``.

    Returns:
        The text between ``user=`` and the next ``&``, or up to the end of the
        string when ``user=`` is the last parameter. Returns an empty string
        when the input has no ``user=`` parameter.
    """
    marker = 'user='
    start = in_str.find(marker)
    if start == -1:
        return ''
    start += len(marker)
    # Look for the terminating '&' only after the id starts, so parameters
    # that precede 'user=' (e.g. '?hl=en&user=...') do not cut the id short.
    end = in_str.find('&', start)
    if end == -1:
        end = len(in_str)
    return in_str[start:end]

def create_link(scholar_id):
    """Return the English Google Scholar profile URL for *scholar_id*."""
    pieces = ("http://scholar.google.com/citations?user=", scholar_id, "&hl=en")
    return "".join(pieces)

In [None]:
query = "Dieter Fox google scholar"

# Collect the scholar ids found in the search results: only Scholar links
# shorter than 100 characters are considered.
# NOTE(review): `search` is not imported in any cell shown here — confirm it
# is imported (presumably from a Google-search package) before running.
matches = {
    parse_scholar_id(result_url)
    for result_url in search(query, stop=5)
    if "scholar.google" in result_url and len(result_url) < 100
}

print('Number of links: ', len(matches))

In [None]:
# Take one of the collected ids; set.pop() removes and returns an arbitrary
# element and raises KeyError when the set is empty.
scholar_id = matches.pop()
link = create_link(scholar_id)
print(link)
# Download the profile page and pretty-print its parsed HTML.
# NOTE(review): `requests` and `BeautifulSoup` are not imported in any cell
# shown here — confirm they are imported before running.
page = requests.get(link)
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())