# Extraction of Wiki data

## Importing the necessary packages

In [35]:
import pandas as pd
import json
import requests
from SPARQLWrapper import SPARQLWrapper, JSON

## Defining queries for fetching wiki data

The **get_wikidata_id()** function takes a name as input and returns the Wikidata ID for that name. If the entity is not found, the function returns None.

The function first sets the base URL of the Wikidata API and the query parameters. The `wbsearchentities` API action takes a number of parameters, including the text to search for, the language, and the format of the response. The function then uses the requests library to make a GET request to the API. The response is a JSON object, which the function parses.

The function then checks if the JSON object contains any entities. If it does, the function returns the ID of the first entity. Otherwise, the function returns None.

In [4]:
def get_wikidata_id(name):
    """
    Get the Wikidata ID for a given name.

    Calls the Wikidata ``wbsearchentities`` API action with English as the
    search language and returns the ID of the first search hit.

    Args:
        name (str): The name of the entity.

    Returns:
        str: The Wikidata ID, or None if the entity is not found.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        requests.Timeout: If the API does not answer within the timeout.
    """

    url = 'https://www.wikidata.org/w/api.php'
    params = {
        'action': 'wbsearchentities',
        'format': 'json',
        'language': 'en',
        'search': name
    }
    # requests waits forever by default; bound the wait so a stalled
    # connection cannot hang the caller.
    response = requests.get(url, params=params, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    data = response.json()

    # A payload without a 'search' key (e.g. an API-level error object) is
    # treated the same as an empty result list: nothing was found.
    matches = data.get('search') or []
    if not matches:
        return None
    return matches[0]['id']


In [45]:
def extract_value(json_string):
    """Extract the ``"value"`` entry from one SPARQL result cell.

    Despite the parameter name, the cells this is applied to are not JSON
    strings: each is either a dict describing one binding (which carries a
    ``"value"`` key) or a placeholder such as NaN for a variable that was
    not bound in that row.

    Args:
        json_string: A binding dict, or any non-dict placeholder
            (NaN, None, an already-extracted scalar).

    Returns:
        ``json_string["value"]`` when given a dict; otherwise the input
        unchanged.

    Raises:
        KeyError: If a dict is given that has no ``"value"`` key.
    """
    # Only dict cells carry a value to unwrap. Everything else (NaN for
    # missing bindings, None, plain scalars) is passed through so the function
    # can be mapped over a whole DataFrame without raising.
    if isinstance(json_string, dict):
        return json_string["value"]
    return json_string


In [60]:
def get_knowledge_graph(wikidata_id):
    """
    Get the knowledge graph for a given Wikidata ID.

    Runs a SPARQL query against the Wikidata query service for every entity
    that is "part of" (P361) the given entity and collects a fixed set of
    optional properties for each of them.

    Args:
        wikidata_id (str): The Wikidata ID of the entity (e.g. "Q42").

    Returns:
        pandas.DataFrame: One row per SPARQL result binding. Columns are the
        selected ``?...Label`` / ``?ticker`` variables that were bound in at
        least one row; cells hold the plain value, or NaN where a variable
        was not bound for that row.

    Raises:
        ValueError: If ``wikidata_id`` is not a well-formed Wikidata ID.
    """
    import re

    # The ID is spliced into the query text below, so reject anything that is
    # not a plain entity ID. This catches a None coming from a failed lookup
    # (which would otherwise silently query "wd:None") and prevents arbitrary
    # text from changing the query.
    if not isinstance(wikidata_id, str) or not re.fullmatch(r"[QPL]\d+", wikidata_id):
        raise ValueError("Invalid Wikidata ID: %r" % (wikidata_id,))

    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    # Note: the line starting with '#' inside the query is a SPARQL comment,
    # so ?competitorLabel is selected but never bound.
    # NOTE(review): wd:Q13677 restricts tickers to one exchange, presumably
    # the NYSE; confirm.
    query = """
SELECT    ?companyLabel ?ticker ?countryLabel ?headquartersLabel ?ceoLabel ?industryLabel ?businessLabel ?productLabel ?competitorLabel ?nicknameLabel ?subsidiaryLabel WHERE {
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
      ?company wdt:P361 wd:%s.
      OPTIONAL {
        ?company p:P414 ?exchange .
        ?exchange ps:P414 wd:Q13677 .
        ?exchange pq:P249 ?ticker .
      }
      OPTIONAL { ?company wdt:P17 ?country. }
      OPTIONAL { ?company wdt:P159 ?headquarters. }
      OPTIONAL { ?company wdt:P169 ?ceo. }
      OPTIONAL { ?company wdt:P452 ?industry. }
      OPTIONAL { ?company wdt:P199 ?business. }
      OPTIONAL { ?company wdt:P1056 ?product. }
#       OPTIONAL { ?company wdt:P169 ?competitor. }
      OPTIONAL { ?company wdt:P1449 ?nickname. }
      OPTIONAL { ?company wdt:P355 ?subsidiary. }
    }
    """ % (wikidata_id)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    df = pd.DataFrame(results["results"]["bindings"])

    # Each cell is a binding dict (or NaN); reduce it to its plain value.
    # DataFrame.applymap is deprecated in favour of DataFrame.map, so use the
    # new name when this pandas version has it and fall back otherwise.
    if hasattr(pd.DataFrame, "map"):
        return df.map(extract_value)
    return df.applymap(extract_value)


## Extracting the Wikidata ID for the Dow Jones Industrial Average

In [61]:
# Look up the index by name; the result is None if the search finds no match.
wikidata_id = get_wikidata_id('Dow Jones Industrial Average')

## Extracting data for the companies listed in the Dow Jones index

In [62]:
# Query every entity that is "part of" (P361) the index; the result is a
# DataFrame with one row per SPARQL result binding.
knowledge_graph = get_knowledge_graph(wikidata_id)

In [68]:
# Disabled de-duplication: group the rows by companyLabel and keep only the
# first row of each group.
#companyLabel_tuples = knowledge_graph["companyLabel"].apply(tuple)
#grouped_df = knowledge_graph.groupby(companyLabel_tuples)
#knowledge_graph = grouped_df.head(1)
# NOTE(review): the saved output below shows one row per company with gaps in
# the index, which looks like it was produced with the lines above enabled;
# confirm before relying on it.
knowledge_graph

Unnamed: 0,companyLabel,countryLabel,headquartersLabel,ceoLabel,industryLabel,businessLabel,productLabel,subsidiaryLabel,nicknameLabel,ticker
0,Apple,United States of America,Cupertino,Tim Cook,electronics,Apple Store,iPod,Apple Store,,
100,Intel,United States of America,Santa Clara,Bob Swan,electrical industry,,computer hardware,"McAfee, LLC",Chipzilla,
300,Cisco Systems,United States of America,San Jose,John Morgridge,telecommunications,,networking hardware,Webex by Cisco,,
4464,Microsoft,United States of America,Redmond,Satya Nadella,software industry,Microsoft Research,Microsoft Visual Studio,ZeniMax Media,,
7164,Boeing,United States of America,Chicago,Dave Calhoun,arms industry,Boeing Commercial Airplanes,Boeing 747,Boeing Commercial Airplanes,,BA
7564,IBM,United States of America,New York City,Arvind Krishna,information technology consulting,IBM Global Services,cloud computing,Red Hat,,IBM
7906,Merck KGaA,Germany,Q21037322,Belén Garijo,chemical industry,,liquid crystal,Seven Seas Limited,,
11864,McDonald’s,United States of America,Chicago,Chris Kempczinski,accommodation and food service activities,McCafé,McChicken,McDonald's Canada,Mekáč,MCD
14207,General Electric,United States of America,Boston,H. Lawrence Culp Jr.,mechanical engineering,GE Transportation,aircraft engine,NBC,,GE
18045,The Walt Disney Company,United States of America,Burbank,Bob Iger,film industry,"Disney Parks, Experiences and Products",software,Pixar,,DIS
