## StardewValley Wiki Scrapper



### Setup

In [21]:
%load_ext autoreload
%autoreload 2

# Importing project-specific modules
import stars

# Web Scraping
from bs4 import BeautifulSoup
import requests as re

# Data
from pymongo.server_api import ServerApi
import pandas as pd
import pymongo
import json

# Load keys
with open('key.json') as f:
    keys = json.loads(f.read())

# Setting up cache
cache = {}

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Infox to DF

In [91]:
# soup = BeautifulSoup(re.get('https://stardewvalleywiki.com/Bamboo_Pole').text)
# soup = BeautifulSoup(re.get('https://stardewvalleywiki.com/Fish_Taco').text)
# table = soup.find('table', {'id':"infoboxtable"})
# table_lines = table.find_all('tr')

In [22]:
def extract_page(url):

    soup = BeautifulSoup(stars.utils.request_and_cache(url, cache).text)
    table = soup.find('table', {'id':"infoboxtable"})
    table_lines = table.find_all('tr')

    data = {'lines': {}, 'hyper_class': stars.utils.extract_hyper_class(soup)}

    detail_flag = True
    image_flag = True

    for line in table_lines:

        # Finding the category the line is in
        if line.find('td', {"style":"text-align:center; padding-left: 0; padding-right: 0;"}) != None:
            cat_holder = line.find('td', {"style":"text-align:center; padding-left: 0; padding-right: 0;"}).text.replace('\n', '')

        # Finding the name of the item that the page is about
        elif line.find('td', {'id':"infoboxheader"}) != None:
            data['name'] = line.find('td', {'id':"infoboxheader"}).text.replace('\n', '')

        # Finding the image of the item that the page is about
        elif (line.find('img') != None) and (image_flag):
            data['image'] = 'https://stardewvalleywiki.com/' + line.find('img')['src']
            image_flag = False # This is to make sure that the image is only found once

        # Finding the description of the item that the page is about
        elif (line.find('td', {'id':"infoboxdetail"}) != None) and detail_flag:
            data['description'] = line.find('td', {'id':"infoboxdetail"}).text.replace('\n', '')
            detail_flag = False # This is to make sure that the description is only found once

        # Finding the contents of the infobox lines  
        elif line.find('td', {'id': 'infoboxsection'}) != None:
            section = line.find('td', {'id': 'infoboxsection'}).text.replace('\n', '')
            detail = stars.items.extract_dynamic(data, line)
            data['lines'][section] = detail
    
    return data
            

In [30]:
_temp = {'lines': {'Source': ['Cooking'], 'Energy / Health': {'Energy': '100', 'Health': '45'}, 'Sell Price': '220g', 'Qi Seasoning': {'Energy': '180', 'Health': '81', 'Price': '330g'}, 'Recipe Source(s)': [{'Font': 'The Queen of Sauce', 'Day': '21', 'Season': 'Summer', 'Year': '1'}, {'Font': 'Stardrop Saloon', 'Price': '300g'}]}, 'hyper_class': 'Recipes', 'name': 'Maki Roll', 'image': 'https://stardewvalleywiki.com//mediawiki/images/b/b6/Maki_Roll.png', 'description': 'Fish and rice wrapped in seaweed.'}
'Cooking' in _temp['lines']['Source']

True

In [55]:
extract_page('https://stardewvalleywiki.com/Oil_of_Garlic')

{'lines': {'Source': ['Crafting'],
  'Buff(s)': {},
  'Buff Duration': {'m': '10'},
  'Energy / Health': {'Energy': '200', 'Health': '89'},
  'Sell Price': '1000g',
  'Recipe Source(s)': [],
  'Ingredients': {'Garlic': '10', 'Oil': '1'}},
 'hyper_class': 'Recipes',
 'name': 'Oil of Garlic',
 'image': 'https://stardewvalleywiki.com//mediawiki/images/4/4b/Oil_of_Garlic.png',
 'description': 'Drink this and weaker monsters will avoid you.'}

In [56]:
soup = BeautifulSoup(re.get('https://stardewvalleywiki.com/Fish_Taco').text)
all_recipes = ['https://stardewvalleywiki.com' + a['href'] for a in soup.find('table', {'class': 'wikitable', 'id': 'navbox'}).find_all('a', href=True) if a['href'] not in ('/Cooking', '/Crafting')]
all_recipes_dct = {}

for recipe in all_recipes:
    try:
        all_recipes_dct[recipe] = extract_page(recipe)
    except Exception as e:
        print(recipe)
        raise e
        break

In [57]:
with open('output.json', 'w') as f:
    json.dump(all_recipes_dct, f, indent=4)

In [1]:
!docker run --name StarAGE -p 5455:5432 -p 3000:3000 -e POSTGRES_USER=postgresUser -e POSTGRES_PASSWORD=postgresPW -e POSTGRES_DB=postgresDB -d age_stardew_graphs

11894abe3f7bbbdd72a881443e4da3115429ae33f5fbf69e07c8e9a36f50d77b


In [30]:
import psycopg2

#establishing the connection
conn = psycopg2.connect(
   database="postgresDB", user='postgresUser', password='postgresPW', host='localhost', port= '5455'
)
#Creating a cursor object using the cursor() method
cursor = conn.cursor()

#Executing an MYSQL function using the execute() method
cursor.execute('SELECT * FROM "demo_graph"."Person" LIMIT 5;')

# Fetch a single row using fetchone() method.
data = cursor.fetchall()
print(data)

#Closing the connection
conn.close()

[('844424930131969', '{"name": "james", "bornIn": "US"}')]


In [None]:
cursor.execute("SELECT * FROM demo_graph.Person;")


In [None]:
from sqlalchemy import create_engine, Table, MetaData

engine = create_engine('postgresql://username:password@localhost:5432/postgresDB')

metadata = MetaData()
person_table = Table('Person', metadata, autoload=True, autoload_with=engine)

# Select all rows from the Person table
select_query = person_table.select()
results = engine.execute(select_query).fetchall()

# Print the results
for row in results:
    print(row)

In [70]:

client = pymongo.MongoClient(f"mongodb+srv://{keys['MongoDB']['user']}:{keys['MongoDB']['passowrd']}@{keys['MongoDB']['host']}/?retryWrites=true&w=majority", server_api=ServerApi('1'))
db = client.test


In [71]:
db['teste'].find_one()

In [72]:
db['teste']

Collection(Database(MongoClient(host=['ac-ut0hidr-shard-00-02.zdhwxnz.mongodb.net:27017', 'ac-ut0hidr-shard-00-01.zdhwxnz.mongodb.net:27017', 'ac-ut0hidr-shard-00-00.zdhwxnz.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-g84kyu-shard-0', tls=True, server_api=<pymongo.server_api.ServerApi object at 0x0000018FAF9AAB20>), 'test'), 'teste')