In [6]:
import os
from elasticsearch import Elasticsearch, helpers, NotFoundError
import json
from datetime import datetime

In [3]:
# Connect to Elasticsearch. Connection settings are read from the environment so
# that no secret is stored in the notebook:
#   ES_URL       - node URL            (default: http://localhost:9200)
#   ES_USER      - basic-auth user     (default: elastic)
#   ES_PASSWORD  - basic-auth password (required, no default)
# NOTE(review): the password that used to be hardcoded here should be rotated,
# since it remains visible in earlier revisions of this notebook.
es_password = os.environ.get("ES_PASSWORD")
if not es_password:
    raise RuntimeError(
        "Set the ES_PASSWORD environment variable before running this notebook."
    )

client = Elasticsearch(
    os.environ.get("ES_URL", "http://localhost:9200"),
    basic_auth=(os.environ.get("ES_USER", "elastic"), es_password),
)

In [109]:
# Name of the Elasticsearch index that the cells below pass as `index=`.
madmap = "madmap"
# Last expression of the cell: display the response of client.info().
client.info()

ObjectApiResponse({'name': '4b1fa41a41cc', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'cHkDnSXHQ4eRsYCNEolotA', 'version': {'number': '8.17.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'a091390de485bd4b127884f7e565c0cad59b10d2', 'build_date': '2025-02-28T10:07:26.089129809Z', 'build_snapshot': False, 'lucene_version': '9.12.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

#  Section 1: Loading Different Types of Data #

#### Q1

In [5]:
# Q1: save the headline cluster-health fields to answers/q1.json.
health_info = client.cluster.health()

# Select the fields by name rather than by position, so the answer does not
# depend on the key order of the server response.
health_fields = (
    "cluster_name",
    "status",
    "timed_out",
    "number_of_nodes",
    "number_of_data_nodes",
)
response = {field: health_info[field] for field in health_fields}

os.makedirs("answers", exist_ok=True)  # no-op when the directory already exists
with open('answers/q1.json', 'w') as f:
    json.dump(response, f, indent=4)

# Last expression of the cell: display the saved answer.
response
    

{'cluster_name': 'docker-cluster',
 'status': 'yellow',
 'timed_out': False,
 'number_of_nodes': 1,
 'number_of_data_nodes': 1}

#### Q2

In [24]:
# Q2: (re)create the index that all later cells load into and query.
madmap = "madmap"

# Drop any index left over from a previous run first. Without this, re-running
# the cell fails because the index already exists, and re-running the bulk-load
# cells below would add every document a second time.
# WARNING: this deletes all documents currently stored in the index; the load
# cells that follow must be re-run afterwards.
client.indices.delete(index=madmap, ignore_unavailable=True)

# Last expression of the cell: display the create-index response.
client.indices.create(index=madmap)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'madmap'})

In [21]:
# Directory holding the JSON data sets, and the JSON files found in it.
json_data_dir = "data/jsons"

# Match on the file extension: a substring test such as `".json" in name`
# would also pick up names like "locs.json.bak".
json_files = [f for f in os.listdir(json_data_dir) if f.endswith(".json")]

# Last expression of the cell: display the file names.
json_files

['locs.json', 'halloween.json', 'places.json', 'news_madison.json']

In [None]:
# Bulk-load locs.json: every entry of its "places" list becomes one document
# in the index.
# The file is decoded explicitly as UTF-8 (the standard encoding for JSON)
# instead of relying on the platform default encoding.
with open(os.path.join(json_data_dir, "locs.json"), "r", encoding="utf-8") as file:
    data = json.load(file)

places = data["places"]
operations = [{"_index": madmap, "_source": place} for place in places]

# Last expression of the cell: display the value returned by helpers.bulk.
helpers.bulk(client, operations)

(307, [])

In [27]:
# Bulk-load halloween.json: every entry of its "arrests" list becomes one
# document in the index.
# The file is decoded explicitly as UTF-8 (the standard encoding for JSON)
# instead of relying on the platform default encoding.
with open(os.path.join(json_data_dir, "halloween.json"), "r", encoding="utf-8") as file:
    data = json.load(file)

arrests = data["arrests"]
operations = [{"_index": madmap, "_source": arrest} for arrest in arrests]

# Last expression of the cell: display the value returned by helpers.bulk.
helpers.bulk(client, operations)


(19, [])

In [28]:
# Bulk-load news_madison.json: every entry of its "articles" list becomes one
# document in the index.
# The file is decoded explicitly as UTF-8 (the standard encoding for JSON)
# instead of relying on the platform default encoding.
with open(os.path.join(json_data_dir, "news_madison.json"), "r", encoding="utf-8") as file:
    data = json.load(file)

articles = data["articles"]
operations = [{"_index": madmap, "_source": article} for article in articles]

# Last expression of the cell: display the value returned by helpers.bulk.
helpers.bulk(client, operations)

(201, [])

In [34]:
# Q2 answer: fetch the index mapping (as inferred from the JSON documents
# loaded so far) and save it to answers/q2.json.
mapping = client.indices.get_mapping(index=madmap)

mapping_json = json.dumps(dict(mapping), indent=4)
with open('answers/q2.json', 'w') as answer_file:
    answer_file.write(mapping_json)

# Last expression of the cell: display the mapping response.
mapping

ObjectApiResponse({'madmap': {'mappings': {'properties': {'WhenIsItHappening': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'arrests': {'type': 'long'}, 'attended': {'type': 'long'}, 'author': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'content': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'coordinates': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'description': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'formattedAddress': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'geoLocation': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'name': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'place_id': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'place_ty

#### Q3

In [37]:
# Q3: index every plain-text file under data/text as one document whose only
# field, "wiki", holds the full file contents.
txt_dir = "data/text"

# Match on the file extension: a substring test such as `".txt" in name`
# would also pick up names like "notes.txt.bak".
txt_files = [f for f in os.listdir(txt_dir) if f.endswith(".txt")]

documents = []
for txt in txt_files:
    # NOTE(review): assumes the text files are UTF-8 encoded - confirm.
    with open(os.path.join(txt_dir, txt), "r", encoding="utf-8") as f:
        data = f.read()  # already a str, no conversion needed
    documents.append({"wiki": data})

# Last expression of the cell: display the value returned by helpers.bulk.
helpers.bulk(client, documents, index=madmap)

(10, [])

In [39]:
# Q3 answer: fetch the index mapping again (now that the "wiki" documents are
# loaded) and save it to answers/q3.json.
mapping = client.indices.get_mapping(index=madmap)

mapping_json = json.dumps(dict(mapping), indent=4)
with open('answers/q3.json', 'w') as answer_file:
    answer_file.write(mapping_json)

# Section 2: Madison and UW-M Trivia!

#### Q4

In [25]:
# Q4: up to 20 documents whose formattedAddress matches "University".
address_query = {"match": {"formattedAddress": "University"}}
response = client.search(
    index=madmap,
    body={"query": address_query, "size": 20},
)

# Save the full search response as the answer.
with open('answers/q4.json', 'w') as answer_file:
    answer_file.write(json.dumps(dict(response), indent=4))

# Print the address of every returned hit, one per line.
addresses = [hit['_source']['formattedAddress'] for hit in response['hits']['hits']]
for address in addresses:
    print(address)

University of Wisconsin–Madison, 2000 University Bay Dr, Madison, WI 53705, USA
University of Wisconsin–Madison, 2000 University Bay Dr, Madison, WI 53705, USA
800 University Ave, Madison, WI 53706, USA
975 University Ave, Madison, WI 53706, USA
624 University Ave, Madison, WI 53715, USA
800 University Ave, Madison, WI 53706, USA
1150 University Ave, Madison, WI 53706, USA
602 University Ave, Madison, WI 53715, USA
2840 University Ave, Madison, WI 53705, USA
610 University Ave, Madison, WI 53715, USA
3650 University Ave, Madison, WI 53705, USA
2862 University Ave, Madison, WI 53705, USA
800 University Ave, Madison, WI 53706, USA
1150 University Ave, Madison, WI 53706, USA
703 University Ave, Madison, WI 53715, USA
6825 University Ave, Middleton, WI 53562, USA
3401 University Ave, Madison, WI 53705, USA
2000 University Bay Dr, Madison, WI 53705, USA
2000 University Bay Dr, Madison, WI 53705, USA
2107, Mechanical Engineering Building, 1513 University Ave, Madison, WI 53706, USA


#### Q5

In [111]:
# Q5: fuzzy match of "Madison" against the title field; only the title is
# returned for each hit, up to 200 hits.
title_query = {
    "match": {
        "title": {"query": "Madison", "fuzziness": "AUTO"},
    },
}
response = client.search(
    index=madmap,
    body={"query": title_query, "_source": ["title"], "size": 200},
)

# Save the full search response as the answer.
with open("answers/q5.json", "w") as answer_file:
    answer_file.write(json.dumps(dict(response), indent=4))

#### Q6

In [11]:
# Q6: documents containing the exact phrase "Wisconsin Badgers" in the title,
# description or content field (bool query with `should` clauses).
phrase = "Wisconsin Badgers"
phrase_clauses = [
    {"match_phrase": {field: phrase}}
    for field in ("title", "description", "content")
]
response = client.search(
    index=madmap,
    body={"query": {"bool": {"should": phrase_clauses}}, "size": 210},
)

# Save the full search response as the answer.
with open("answers/q6.json", "w") as answer_file:
    answer_file.write(json.dumps(dict(response), indent=4))

# Print the source document of every returned hit.
for hit in response['hits']['hits']:
    print(hit['_source'])

{'source': {'id': 'usa-today', 'name': 'USA Today'}, 'author': 'Roll Tide Wire', 'title': 'Alabama football rolls past the Wisconsin Badgers 42-10', 'description': "After a sloppy Week 2 win against the USF Bulls, everyone was curious how the Alabama Crimson Tide would respond against the Wisconsin Badgers in Kalen DeBoer's first road test as the head coach in Tuscaloosa. The result was more than positive. Wisco", 'url': 'https://rolltidewire.usatoday.com/2024/09/14/alabama-football-wisconsin-badgers-victory/', 'urlToImage': 'https://s.yimg.com/ny/api/res/1.2/RwRc7z.ZmgkcXFw88x1cbg--/YXBwaWQ9aGlnaGxhbmRlcjt3PTEyMDA7aD04MDA-/https://media.zenfs.com/en/roll_tide_wire_usa_today_articles_214/f3637affb3dc62d4738204dc186c71e5', 'publishedAt': '2024-09-14T19:55:55Z', 'content': 'Sep 14, 2024; Madison, Wisconsin, USA; Alabama Crimson Tide quarterback Jalen Milroe (4) throws a pass during the first quarter against the Wisconsin Badgers at Camp Randall Stadium. Mandatory Credit… [+1213 chars]'}


#### Q7

In [112]:

# Q7: documents that have a formattedAddress field whose value does not match
# "Madison"; only name and formattedAddress are returned, up to 200 hits.
has_address = {"exists": {"field": "formattedAddress"}}
address_mentions_madison = {"match": {"formattedAddress": "Madison"}}

response = client.search(
    index=madmap,
    body={
        "query": {
            "bool": {
                "must": [has_address],
                "must_not": [address_mentions_madison],
            }
        },
        "_source": ["name", "formattedAddress"],
        "size": 200,
    },
)

# Save the full search response as the answer.
with open("answers/q7.json", "w") as answer_file:
    answer_file.write(json.dumps(dict(response), indent=4))

#### Q8

In [110]:
# Q8: simple_query_string search over the wiki field, with "rivalry" boosted
# by 10, "football" boosted by 5 and "badgers" left unboosted.
boosted_terms = "\"rivalry\"^10.0 \"football\"^5.0 \"badgers\""
wiki_query = {
    "simple_query_string": {"query": boosted_terms, "fields": ["wiki"]},
}
response = client.search(
    index=madmap,
    body={"query": wiki_query, "_source": ["wiki"]},
)

# Save the full search response as the answer.
with open("answers/q8.json", "w") as answer_file:
    answer_file.write(json.dumps(dict(response), indent=4))

#### Q9

In [35]:
# Q9: highlight the occurrences of "rivalry" in the wiki field of the single
# best-matching document.
rivalry_query = {"match_phrase": {"wiki": "rivalry"}}
wiki_highlight = {"fields": {"wiki": {}}}
response = client.search(
    index=madmap,
    body={
        "query": rivalry_query,
        "_source": ["wiki"],
        "highlight": wiki_highlight,
        "size": 1,
    },
)

# Only one hit is requested; take its highlight section.
top_hit = response["hits"]["hits"][0]
highlight_section = top_hit.get("highlight")

# Save the highlight section as the answer.
with open("answers/q9.json", "w") as answer_file:
    answer_file.write(json.dumps(dict(highlight_section), indent=4))

# Last expression of the cell: display the highlighted fragments.
highlight_section


{'wiki': ["Rivalries\nThe Wisconsin Badgers' most notable <em>rivalry</em> within the Big Ten is with the Minnesota Golden",
  'Gophers, which is the most-played <em>rivalry</em> in Division I-A football.',
  "The I-94 <em>rivalry</em> between Wisconsin men's basketball and the in-state Marquette Golden Eagles has been"]}

#### Q10

In [78]:
# Q10: documents whose source.name matches the phrase "NAsa"; only source,
# title and publishedAt are returned, up to 20 hits.
# The index is referenced through the `madmap` variable, like every other
# query in this notebook, instead of repeating the literal index name.
response = client.search(
    index=madmap,
    body={
        "query": {
            "match_phrase": {
                "source.name": "NAsa"
            }
        },
        "_source": ["source", "title", "publishedAt"],
        "size": 20
    }
)

# Save the full search response as the answer.
with open("answers/q10.json", "w") as f:
    json.dump(dict(response), f, indent=4)

# Print the returned fields of every hit.
for hit in response["hits"]["hits"]:
    print(hit["_source"])

{'source': {'id': None, 'name': 'NASA'}, 'title': 'NASA Mission Gets Its First Snapshot of Polar Heat Emissions', 'publishedAt': '2024-09-15T15:12:17Z'}
{'source': {'id': None, 'name': 'NASA'}, 'title': 'SARP West 2024 Oceans Group', 'publishedAt': '2024-09-25T21:08:57Z'}
{'source': {'id': None, 'name': 'NASA'}, 'title': 'NASA Mission Gets Its First Snapshot of Polar Heat Emissions', 'publishedAt': '2024-09-15T15:12:17Z'}
{'source': {'id': None, 'name': 'NASA'}, 'title': 'SARP West 2024 Oceans Group', 'publishedAt': '2024-09-25T21:08:57Z'}


#### Q11

In [90]:
# Q11: sum of the "arrests" field over the index. size=0 requests the
# aggregation result only, without any hits.
arrests_sum_agg = {"total_arrests_sum": {"sum": {"field": "arrests"}}}
response = client.search(
    index=madmap,
    body={"size": 0, "aggs": arrests_sum_agg},
)
number = response['aggregations']['total_arrests_sum']['value']

# Save the aggregated value as the answer.
with open("answers/q11.json", "w") as answer_file:
    answer_file.write(json.dumps(number, indent=4))

print(number)

1671.0


#### Q12

In [95]:
# Q12: terms aggregation on source.name.keyword, limited to 10 buckets.
# size=0 requests the aggregation result only, without any hits.
source_terms = {"terms": {"field": "source.name.keyword", "size": 10}}
response = client.search(
    index=madmap,
    body={"size": 0, "aggs": {"source_count": source_terms}},
)
top10 = response['aggregations']['source_count']['buckets']

# Save the buckets as the answer.
with open("answers/q12.json", "w") as answer_file:
    answer_file.write(json.dumps(top10, indent=4))

# Last expression of the cell: display the buckets.
top10



[{'key': 'Milwaukee Journal Sentinel', 'doc_count': 22},
 {'key': 'Yahoo Entertainment', 'doc_count': 19},
 {'key': 'Fox Sports', 'doc_count': 18},
 {'key': 'USA Today', 'doc_count': 18},
 {'key': 'Forbes', 'doc_count': 15},
 {'key': 'CBS Sports', 'doc_count': 9},
 {'key': 'Newsweek', 'doc_count': 8},
 {'key': 'ESPN', 'doc_count': 6},
 {'key': 'Scientific American', 'doc_count': 6},
 {'key': 'Tuscaloosa News', 'doc_count': 6}]

#### Q13

In [100]:
# Q13: value_count aggregation on name.keyword. size=0 requests the
# aggregation result only, without any hits.
name_count_agg = {"location_name_count": {"value_count": {"field": "name.keyword"}}}
response = client.search(
    index=madmap,
    body={"size": 0, "aggs": name_count_agg},
)
valuecount = response["aggregations"]["location_name_count"]['value']

# Save the count as the answer.
with open("answers/q13.json", "w") as answer_file:
    answer_file.write(json.dumps(valuecount, indent=4))

print(valuecount)

307


#### Q14

In [105]:
# Q14: cardinality aggregation on author.keyword (number of distinct values).
# size=0 requests the aggregation result only, without any hits.
unique_authors_agg = {"unique_authors": {"cardinality": {"field": "author.keyword"}}}
response = client.search(
    index=madmap,
    body={"size": 0, "aggs": unique_authors_agg},
)
authors = response['aggregations']['unique_authors']['value']

# Save the count as the answer.
with open("answers/q14.json", "w") as answer_file:
    answer_file.write(json.dumps(authors, indent=4))

print(authors)

64


#### Q15

In [107]:
# Q15: average of the "attended" field over the index. size=0 requests the
# aggregation result only, without any hits.
avg_attended_agg = {"avg_attended": {"avg": {"field": "attended"}}}
response = client.search(
    index=madmap,
    body={"size": 0, "aggs": avg_attended_agg},
)
party_goers = response['aggregations']['avg_attended']['value']

# Save the average as the answer.
with open("answers/q15.json", "w") as answer_file:
    answer_file.write(json.dumps(party_goers, indent=4))

print(party_goers)

47736.84210526316


# Section 3: Interactive Visualization: Making the 639 Madison maps application

#### Done in Kibana 

In [4]:
# Run pytest on autograder.py through an IPython shell escape.
! pytest autograder.py

platform linux -- Python 3.10.12, pytest-8.3.4, pluggy-1.5.0
rootdir: /home/alexvlasik/project-3-p3_vlasik_mmgupta2
plugins: anyio-4.8.0
collected 22 items                                                             [0m

autograder.py [32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                                     [100%][0m

