<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Notebook-setup" data-toc-modified-id="Notebook-setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Notebook setup</a></span><ul class="toc-item"><li><span><a href="#Import-required-libraries" data-toc-modified-id="Import-required-libraries-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Import required libraries</a></span></li><li><span><a href="#Define-notebook-wide-variables" data-toc-modified-id="Define-notebook-wide-variables-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Define notebook-wide variables</a></span></li><li><span><a href="#Create-output-directories" data-toc-modified-id="Create-output-directories-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Create output directories</a></span></li><li><span><a href="#Define-helper-functions" data-toc-modified-id="Define-helper-functions-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Define helper functions</a></span></li></ul></li><li><span><a href="#Extract-company-list-data" data-toc-modified-id="Extract-company-list-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Extract company list data</a></span><ul class="toc-item"><li><span><a href="#Afrilabs---map-of-African-accelerators-/-hubs" data-toc-modified-id="Afrilabs---map-of-African-accelerators-/-hubs-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Afrilabs - map of African accelerators / hubs</a></span></li><li><span><a href="#Startups-List----global-country-specific-listings" data-toc-modified-id="Startups-List----global-country-specific-listings-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Startups List -- global country-specific listings</a></span></li><li><span><a href="#Google-Places-API" data-toc-modified-id="Google-Places-API-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Google Places API</a></span><ul class="toc-item"><li><span><a href="#Nearby-places" data-toc-modified-id="Nearby-places-2.3.1"><span class="toc-item-num">2.3.1&nbsp;&nbsp;</span>Nearby places</a></span></li><li><span><a href="#Text-search" data-toc-modified-id="Text-search-2.3.2"><span class="toc-item-num">2.3.2&nbsp;&nbsp;</span>Text search</a></span></li></ul></li></ul></li><li><span><a href="#Extract-Company-Metadata" data-toc-modified-id="Extract-Company-Metadata-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Extract Company Metadata</a></span><ul class="toc-item"><li><span><a href="#Google-Place-API---Grab-Place-details-based-on-Place-IDs" data-toc-modified-id="Google-Place-API---Grab-Place-details-based-on-Place-IDs-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Google Place API - Grab Place details based on Place IDs</a></span></li><li><span><a href="#Google-Search-Query-API" data-toc-modified-id="Google-Search-Query-API-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Google Search Query API</a></span></li><li><span><a href="#Twitter-data" data-toc-modified-id="Twitter-data-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Twitter data</a></span><ul class="toc-item"><li><span><a href="#get-tweets-by-keyword" data-toc-modified-id="get-tweets-by-keyword-3.3.1"><span class="toc-item-num">3.3.1&nbsp;&nbsp;</span>get tweets by keyword</a></span></li><li><span><a href="#get-tweets-from-user-timeline" data-toc-modified-id="get-tweets-from-user-timeline-3.3.2"><span class="toc-item-num">3.3.2&nbsp;&nbsp;</span>get tweets from user timeline</a></span></li></ul></li><li><span><a href="#From-company-website" data-toc-modified-id="From-company-website-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>From company website</a></span></li></ul></li></ul></div>

# Notebook setup

## Import required libraries

In [12]:
# please import these required packages before running the code.
import pandas as pd
import numpy as np

import glob
import os
import datetime
import time

import requests
import re
import json
import bs4

import googlemaps
from googleapiclient.discovery import build
import pprint

import twitter

## Define notebook-wide variables

In [2]:
# generate timestamp for today for output files
date_today = datetime.date.today().isoformat()

In [None]:
# input Places API key
gmaps = googlemaps.Client(key='<please insert your own Gmaps key here>')

In [520]:
# Google Search API keys here
my_api_key = '<please input your own API key here>'
my_cse_id = '<please input your CSE ID here>'

## Create output directories

In [11]:
output_folder = "outputs"

os.makedirs(output_folder, exist_ok=True)
for subfolder in ['1 extract lists', '2 extract metadata']:
    os.makedirs(os.path.join(output_folder, subfolder), exist_ok=True)

## Define helper functions

In [390]:
def get_dummy_type_cols(df_gmaps_all, col="types"):
    
    for genre in set.union(*df_gmaps_all[col].apply(set)):
        df_gmaps_all[genre] = df_gmaps_all.apply(lambda _: int(_[col].count(genre)), axis=1)
    
    return(df_gmaps_all)

In [521]:
def google_search(search_term, api_key, cse_id, **kwargs):
    service = build("customsearch", "v1", developerKey=api_key)
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
    return(res['items'])

In [635]:
def extract_status_data(results):
    status_cols = [
        'created_at', 
    'favorited', 
    'id', 
    'text', 
    'location', 
    'in_reply_to_screen_name', 
    'in_reply_to_user_id', 
    'in_reply_to_status_id', 
    'truncated', 
    'retweeted', 
    'source', 
    'user', 
    'urls', 
    'user_mentions', 
    'hashtags', 
    'geo', 
    'place', 
    'coordinates', 
    'contributors', 
    'retweeted_status'
    ]

    twitter_data_all = []

    for result in results:
        created_at_res = result.created_at
        favorited_res = result.favorited
        id_res = result.id
        text_res = result.text
        location_res = result.location
        in_reply_to_screen_name_res = result.in_reply_to_screen_name
        in_reply_to_user_id_res = result.in_reply_to_user_id
        in_reply_to_status_id_res = result.in_reply_to_status_id
        truncated_res = result.truncated
        retweeted_res = result.retweeted
        source_res = result.source
        user_res = result.user
        urls_res = result.urls
        user_mentions_res = result.user_mentions
        hashtags_res = result.hashtags
        geo_res = result.geo
        place_res = result.place
        coordinates_res = result.coordinates
        contributors_res = result.contributors
        retweeted_status_res = result.retweeted_status

        twitter_data_all.append([created_at_res, 
            favorited_res, 
            id_res, 
            text_res, 
            location_res, 
            in_reply_to_screen_name_res, 
            in_reply_to_user_id_res, 
            in_reply_to_status_id_res, 
            truncated_res, 
            retweeted_res, 
            source_res, 
            user_res, 
            urls_res, 
            user_mentions_res, 
            hashtags_res, 
            geo_res, 
            place_res, 
            coordinates_res, 
            contributors_res, 
            retweeted_status_res, 
            ])
        
    df_twitter_data = pd.DataFrame(twitter_data_all, columns=status_cols)
    return(df_twitter_data)

# Extract company list data

## Afrilabs - map of African accelerators / hubs
Data source: http://www.afrilabs.com/afrilabs-passport/

In [9]:
# pull page source of Afrilabs page
url = 'http://www.afrilabs.com/afrilabs-passport/'
r = requests.get(url)
page_source = r.content
soup = bs4.BeautifulSoup(page_source, "lxml")

# extract maps script
script = soup.find('script', text=re.compile('\.maps'))

# extract the maps data
json_text = re.search(r'.*\.maps\((.*)\)\.data\("wpgmp_maps"\);}\);$', script.string).group(1)
data = json.loads(json_text)
df_afrilabs = pd.DataFrame(data['places'])

In [165]:
# don't mind categories column since contains redundant data
print(df_afrilabs['categories'].astype(str).value_counts())

[{'id': '1', 'name': 'Afri', 'type': 'category', 'extension_fields': False, 'icon': 'http://www.afrilabs.com/wp-content/plugins/wp-google-map-plugin//assets/images/icons/justice.png'}]    57
Name: categories, dtype: int64


In [166]:
# extract nested data on location
df_afrilabs_location = pd.DataFrame()
for ix in df_afrilabs.index:
    df_temp = pd.DataFrame.from_dict(df_afrilabs['location'].ix[ix], orient='index')
    df_temp.columns = [ix]
    df_temp = df_temp.T
    df_afrilabs_location = df_afrilabs_location.append(df_temp)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.


In [169]:
# merge scraped data
df_afrilabs_full = df_afrilabs.join(df_afrilabs_location)[list(df_afrilabs.columns) + list(df_afrilabs_location.columns)]
df_afrilabs_full.to_csv("outputs/1 extract lists/%s - Afrilabs List of Accelerators or Hubs.csv" % date_today)

## Startups List -- global country-specific listings
Data source: http://nigeria.startups-list.com/

In [171]:
# grab page content
url = 'http://nigeria.startups-list.com/'
r = requests.get(url)
page_source = r.content
soup = bs4.BeautifulSoup(page_source, "lxml")

In [186]:
# extract code with startups
list_startups = soup.find_all("div", {'class': 'startup'})

Extract key elements per code chunk
- startup name `<h1 property="name">`
- startup description `<p>`
- startup link outer `<div data-href= attribute>`
- startup keywords `<img alt attribute>`
- startup logo `<img src= attribute />`

In [253]:
list_startups_data = []

for startup in list_startups:
    temp = startup.find("h1")
    startup_name = " ".join(re.split("\s+", bs4.BeautifulSoup(' '.join([str(x) for x in temp.contents]), "lxml").text.strip(), flags=re.UNICODE))

    temp = startup.find("p")
    startup_description = " ".join(re.split("\s+", bs4.BeautifulSoup(' '.join([str(x) for x in temp.contents]), "lxml").text.strip(), flags=re.UNICODE))

    startup_link = startup.get('data-href')

    startup_keywords = startup.find('img').get('alt')

    startup_logo_img_src = startup.find('img').get('src')
    
    if (startup_logo_img_src == "/img/spacer.gif") | (startup_logo_img_src == ""):
        try:
            startup_logo_img_src = startup.find('img').get('data-src')
        except:
            startup_logo_img_src = ""

    list_startups_data.append([startup_name, startup_description, startup_link, startup_keywords, startup_logo_img_src])

In [254]:
df_startupslist = pd.DataFrame(list_startups_data, columns = ['startup_name', 'startup_description','startup_link','startup_keywords','startup_logo_img_src'])

# clean invalid image logos
df_startupslist = df_startupslist.replace({'/img/spacer.gif': np.nan, 'https://angel.co/images/shared/nopic_startup.png': np.nan})

In [256]:
df_startupslist.to_csv("outputs/1 extract lists/%s - Nigeria Startups List.csv" % date_today)

## Google Places API
Documentation here: https://developers.google.com/places/web-service/search

### Nearby places

In [335]:
# get first N batches of 20 results
batch_num = 50

# use kenya capital (Abuja) geocoordinates. Approximate radius with 480km
results = gmaps.places_nearby(radius=520000, location=(9.072264, 7.491302))
df_gmaps_all = pd.DataFrame(results['results'])

for batch in range(batch_num):
    # get next 20 results
    try:
        page_token_next = results['next_page_token']
    except:
        break

    # wait for 2 seconds before getting new data
    time.sleep(2)
    results = gmaps.places_nearby(page_token=page_token_next)
    
    df_gmaps_all = df_gmaps_all.append(pd.DataFrame(results['results']))
    
    print("batch %s done" % str(batch + 1))
    
df_gmaps_all.reset_index(drop=True, inplace=True)
print(df_gmaps_all.shape)

batch 1 done
batch 2 done
(60, 13)


In [338]:
for genre in set.union(*df_gmaps_all['types'].apply(set)):
    df_gmaps_all[genre] = df_gmaps_all.apply(lambda _: int(_['types'].count(genre)), axis=1)

In [345]:
df_gmaps_all.to_csv("outputs/1 extract lists/%s - Nigeria Google Places API - Nearby Places endpoint.csv" % date_today)

### Text search

In [384]:
keywords = ["accelerator", "hub", "startup", "business", "company", "incubator"]
country = "Kenya"
cou_id = "ke"

df_gmaps_textsearch = pd.DataFrame()

print(country)
for keyword in keywords:
    
    results_textsearch = gmaps.places(keyword, region=cou_id)
    df_temp = pd.DataFrame(results_textsearch['results'])

    df_temp['keyword_for_text_search'] = keyword
    df_temp['country_for_text_search'] = country
    print(keyword, df_temp.shape)
    df_gmaps_textsearch = df_gmaps_textsearch.append(df_temp)
    
df_gmaps_textsearch = df_gmaps_textsearch.reset_index(drop=True)
df_gmaps_textsearch = get_dummy_type_cols(df_gmaps_textsearch)

print(df_gmaps_textsearch.shape)

df_gmaps_textsearch.to_csv("outputs/1 extract lists/%s - %s Google Places API - Text Search endpoint.csv" % (date_today, country))

Kenya
accelerator (1, 14)
hub (9, 14)
startup (0, 2)
business (20, 14)
company (20, 14)
incubator (3, 14)
(53, 14)


In [400]:
df_gmaps_textsearch_all = pd.concat([df_gmaps_textsearch, df_gmaps_textsearch_id])[df_gmaps_textsearch.columns]
df_gmaps_textsearch_all.to_csv("outputs/1 extract lists/%s - Consolidated Google Places API - Text Search endpoint.csv" % (date_today))

In [660]:
df_gmaps_textsearch.shape, df_gmaps_textsearch_id.shape

((53, 28), (24, 21))

# Extract Company Metadata
We focus extracting data on these 3 Indonesian companies searched via Google Places API

In [499]:
df_extract = df_gmaps_textsearch_all[df_gmaps_textsearch_all['keyword_for_text_search'].isin(["incubator", "accelerator"]) & (df_gmaps_textsearch_all['country_for_text_search']== 'Indonesia')].reset_index(drop=True)

## Google Place API - Grab Place details based on Place IDs
https://developers.google.com/places/web-service/details

In [496]:
place_id_list = df_extract['place_id'].values

In [489]:
df_place_details = pd.DataFrame()
for place_id in place_id_list:
    test_place_indonesia = gmaps.place(place_id)

    consolidate = []
    for key in test_place_indonesia['result'].keys():
        consolidate.append(test_place_indonesia['result'][key])

    df_temp = pd.DataFrame(consolidate[1:]).T
    df_temp.columns = list(test_place_indonesia['result'].keys())[1:]
    
    df_place_details = pd.concat([df_place_details, df_temp])

In [490]:
df_place_details = df_place_details.reset_index(drop=True)

In [501]:
df_place_details.to_csv("outputs/2 extract metadata/%s - Indonesia Google Places API - Place Details endpoint.csv" % (date_today))

## Google Search Query API
Top Google search results for the company such as relevant articles, etc.

Parameter info here: https://developers.google.com/custom-search/json-api/v1/reference/cse/list

In [642]:
cou_id = 'id'
results = google_search('GnB Accelerator', my_api_key, my_cse_id, num=10, gl=cou_id, cr = cou_id)

In [643]:
df_google_search = pd.DataFrame(results)

In [645]:
df_google_search.to_csv("outputs/2 extract metadata/%s - Indonesia GnB Accelerator Google Search API.csv" % (date_today))

## Twitter data
Top tweets resuls for the company such as relevant articles, etc.

Use Twitter API to get public tweets or search relevant tweets 

Python client library: https://github.com/bear/python-twitter

Twitter Standard (FREE) tier is LIMITED -- This search API searches against a sampling of recent Tweets published in the past 7 days. Part of the 'public' set of APIs.

Other endpoints have max requests per 15-min window

More info here:
- https://apps.twitter.com
- https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets.html
- https://developer.twitter.com/en/docs/tweets/timelines/api-reference/get-statuses-user_timeline
- https://developer.twitter.com/en/docs/basics/authentication/overview/application-only

In [4]:
api = twitter.Api(consumer_key='<add consumer key here>',
                      consumer_secret='<add consumer secret here>',
                      access_token_key= '<add access token key here>',
                      access_token_secret='<add access token secret here>')

In [1]:
print(api.VerifyCredentials())

In [3]:
api.GetUserTimeline(screen_name='<add screenname here>')

In [5]:
api.GetFollowersPaged(screen_name='<add screenname here>')

In [4]:
api.ShowFriendship(source_screen_name='<add screenname here>', target_screen_name='<add another screenname here>')

### get tweets by keyword

In [626]:
results = api.GetSearch(raw_query="q=gnb%20accelerator&count=100", include_entities=True)
# results = api.GetSearch(raw_query='q=%22mad%20incubator%22')

In [637]:
df_twitter_data = extract_status_data(results)
df_twitter_data['TwitterAPIsource'] = 'keyword_search'

### get tweets from user timeline

In [633]:
statuses = api.GetUserTimeline(screen_name='GnBAccelerator')

In [663]:
df_twitter_usertimeline = extract_status_data(statuses)
df_twitter_usertimeline['TwitterAPIsource'] = 'user_timeline'

In [665]:
df_twitter_data_all = pd.concat([df_twitter_data,df_twitter_usertimeline])

In [666]:
df_twitter_data_all['TwitterAPIsource'].value_counts()

keyword_search    10
user_timeline      5
Name: TwitterAPIsource, dtype: int64

In [667]:
df_twitter_data_all.to_csv("outputs/2 Extract Company Metadata/%s - Indonesia GnB Accelerator Twitter API.csv" % (date_today))

## From company website

In [648]:
website_list = df_place_details[['name', 'website']].set_index('name').to_dict()['website']
website_list

{'GnB Accelerator': 'https://gnb.ac/',
 'Mad Incubator': 'http://www.incubator.com.my/',
 'The Accelerator': 'http://www.accelerator.co.id/'}

In [654]:
company = 'GnB Accelerator'
url = website_list[company]

In [655]:
# grab page content
r = requests.get(url, verify=False)
page_source = r.content
soup = bs4.BeautifulSoup(page_source)

  


 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [657]:
html = soup.prettify("utf-8")

In [659]:
html = soup.prettify("utf-8")
with open("outputs/2 extract metadata/%s - Indonesia GnB Accelerator Website data.html" % (date_today), "wb") as file:
    file.write(html)