In [1]:
import datetime
import getpass
from urllib.parse import quote, urljoin

import lxml
import pandas as pd
import pymysql
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine, text

In [2]:
class Data_gathering:
    '''Scrapes Polish lottery and demographic data and stores it in a MySQL database.

    Each public method fills one table:
    scrape_and_add_lottery_data -> lottery_data (source: megalotto.pl),
    scrape_and_add_polish_cities_data -> polish_cities (source: polskawliczbach.pl),
    scrape_and_add_polish_provinces_data -> polish_provinces (source: polskawliczbach.pl).
    '''

    #pages the data is scraped from
    LOTTO_URL = 'http://megalotto.pl/najwyzsze-wygrane/lotto'
    POLISH_CITIES_URL = 'https://www.polskawliczbach.pl/Miasta'
    POLISH_PROVINCES_URL = 'https://www.polskawliczbach.pl/Wojewodztwa'
    #seconds to wait for a lottery page, so that a stalled request cannot hang the notebook forever
    REQUEST_TIMEOUT = 30
    #accepted values of the if_exists argument (same meaning as in DataFrame.to_sql)
    IF_EXISTS_MODES = ('fail', 'replace', 'append')

    def __init__(self, user=None, password=None, database=None, host='localhost'):
        '''Creates the engine used for connecting to the database and tests the connection.

        Args:
            user: SQL user name; asked for interactively when None.
            password: SQL password; asked for (without echo) when None.
            database: name of the SQL database; asked for interactively when None.
            host: host of the MySQL server.

        A failed connection test does not raise: a warning with the reason is printed,
        so that the credentials can be checked and the object created again.
        '''
        self.user = input('Enter SQL user name: ') if user is None else str(user)
        self.password = getpass.getpass('Enter password: ') if password is None else str(password)
        self.database = input('Enter SQL database name: ') if database is None else str(database)
        self.host = host
        #user name and password are percent-encoded, otherwise characters such as '@', '/' or ':'
        #would be read as parts of the URL and the connection would fail
        self.engine = create_engine("mysql+pymysql://{user}:{pw}@{host}/{db}"
                                    .format(user=quote(self.user, safe=''),
                                            pw=quote(self.password, safe=''),
                                            host=self.host,
                                            db=self.database))
        self.conn = None
        try:
            self._check_connection('database. Test of the connection was successful')
        except Exception as error: #not a bare except, so Ctrl+C still interrupts the cell
            print('Warning! Test of the connection failed!!! Check the credentials.')
            print('Reason:', error)

    def _check_connection(self, confirmation='database'):
        '''Opens a connection, prints the confirmation and always closes the connection again.'''
        self.conn = self.engine.connect()
        try:
            print(self.user, 'is connected to', self.database, confirmation)
        finally:
            self.conn.close()

    @staticmethod
    def _validate_if_exists(if_exists):
        '''Raises ValueError when if_exists is not one of 'fail', 'replace' or 'append'.'''
        if if_exists not in Data_gathering.IF_EXISTS_MODES:
            raise ValueError("if_exists must be one of {}, got {!r}"
                             .format(Data_gathering.IF_EXISTS_MODES, if_exists))

    @staticmethod
    def _parse_prize(raw_prize):
        '''Converts a prize text such as '36 726 210,20' into whole units as an integer.

        The part after the decimal comma is dropped and every character that is not a digit
        (thousands separators, currency name) is ignored.
        '''
        whole_part = str(raw_prize).split(',')[0]
        digits = ''.join(character for character in whole_part if character.isdigit())
        if not digits:
            raise ValueError('no prize amount found in {!r}'.format(raw_prize))
        return int(digits)

    @staticmethod
    def _next_page_url(current_url, paging_elements):
        '''Returns the url of the next page with prizes or None when there is no next page.

        The second 'prev_next' element is expected to hold the link to the next page.
        '''
        if len(paging_elements) < 2:
            return None
        candidate = paging_elements[1]
        if "Następny" not in str(candidate):
            return None
        #the link is either the element itself or the first tag with href inside of it
        anchor = candidate if candidate.has_attr('href') else candidate.find(href=True)
        if anchor is None:
            return None
        return urljoin(current_url, anchor['href'])

    def _scrape_lottery_rows(self):
        '''Walks through all pages with the highest prizes and returns the scraped rows.

        Returns:
            list of dicts with the keys 'prize' (int), 'location' (str) and 'date' (str).
        '''
        rows = []
        visited_urls = set()
        self.url_lotto = self.LOTTO_URL

        #the loop continues scraping data till the final page with lottery data is reached
        while True:
            visited_urls.add(self.url_lotto)
            page = requests.get(self.url_lotto, timeout=self.REQUEST_TIMEOUT)
            page.raise_for_status() #an error page must not be parsed as if it were data
            html_content = BeautifulSoup(page.content, 'html.parser')
            prizes = html_content.find_all(class_='numbers_in_list numbers_in_list_najwyzsze_wygrane')
            cities = html_content.find_all(class_='date_in_list date_in_list_najwyzsze_wygrane_miasto')
            dates = html_content.find_all(class_='date_in_list date_in_list_najwyzsze_wygrane_date')
            paging_elements = html_content.find_all(class_='prev_next')

            if len(cities) < len(prizes) or len(dates) < len(prizes):
                raise ValueError('page {} has {} prizes but only {} locations and {} dates'
                                 .format(self.url_lotto, len(prizes), len(cities), len(dates)))

            #the first element of every list is the header of the table, so it is skipped
            for prize, city, date in zip(prizes[1:], cities[1:], dates[1:]):
                rows.append({'prize': self._parse_prize(prize.get_text()),
                             'location': city.get_text(strip=True),
                             'date': date.get_text(strip=True)})

            next_url = self._next_page_url(self.url_lotto, paging_elements)
            #a link back to an already scraped page would make the loop endless
            if next_url is None or next_url in visited_urls:
                break
            self.url_lotto = next_url
        return rows

    def scrape_and_add_lottery_data(self, if_exists='fail'):
        '''Scrapes lottery data (money prize, winner location, date) from http://megalotto.pl/najwyzsze-wygrane/lotto,
        creates lottery_data table in database and adds scraped data into lottery_data table.

        Args:
            if_exists: what to do when lottery_data already exists: 'fail' raises the database
                error, 'replace' drops the table first, 'append' adds the rows to it.

        Raises:
            ValueError: if_exists is invalid or a page has fewer locations/dates than prizes.
            requests.RequestException: a page could not be downloaded.
        '''
        self._validate_if_exists(if_exists)
        #failing early, before the time is spent on scraping, when the database is unreachable
        self._check_connection()

        #scraping is finished before the table is touched, so a failed scrape leaves no empty table
        lottery_rows = self._scrape_lottery_rows()

        #engine.begin() commits on success, rolls back on error and always closes the connection
        with self.engine.begin() as connection:
            self.conn = connection
            if if_exists == 'replace':
                connection.execute(text("drop table if exists lottery_data"))
            create_clause = "create table if not exists" if if_exists == 'append' else "create table"
            connection.execute(text(create_clause + " lottery_data ("
                                    "prize_id int primary key auto_increment,"
                                    "lottery_prize int,"
                                    "lottery_winner_location varchar(255),"
                                    "lottery_date date)"))
            print("lottery_data table was created in", self.database, "database")

            #all rows are sent in one call; an empty list is skipped as it is not a valid parameter set
            if lottery_rows:
                connection.execute(
                    text("insert into lottery_data "
                         "(lottery_prize, lottery_winner_location, lottery_date) "
                         "values (:prize, :location, :date)"),
                    lottery_rows)
            print("lottery data (money prize, winner location, date) were added to lottery_data table")

        print('connection with the database was closed')

    @staticmethod
    def _clean_polish_cities(raw_table):
        '''Returns the scraped cities table reduced to City, Province and Population columns.

        The three columns left after dropping are renamed by position. Population is converted
        from text with thousands separators into int. raw_table itself is not modified.
        '''
        cities = raw_table.drop(['Unnamed: 0', 'Powiat', 'Obszar'], axis=1)
        #changing columns' names to english
        cities.columns = ['City', 'Province', 'Population']
        #removing thousands separators (plain and non-breaking spaces) before the conversion to int
        population = (cities['Population'].astype(str)
                      .str.replace(" ", "", regex=False)
                      .str.replace("\xa0", "", regex=False))
        return (cities.assign(Population=population)
                .astype({'City' : 'string', 'Province' : 'string', 'Population' : 'int'}))

    def scrape_and_add_polish_cities_data(self, if_exists='fail'):
        '''Scrapes the table of polish cities from https://www.polskawliczbach.pl/Miasta page
        and adds it as polish_cities table in database.

        Args:
            if_exists: passed to DataFrame.to_sql: 'fail' raises ValueError when the table
                already exists, 'replace' recreates it, 'append' adds the rows to it.
        '''
        self._validate_if_exists(if_exists)
        #failing early, before scraping, when the database is unreachable
        self._check_connection()

        #defining url with data source
        self.url_polish_cities = self.POLISH_CITIES_URL

        #scraping whole table using pd.read_html and cleaning it
        polish_cities = self._clean_polish_cities(pd.read_html(self.url_polish_cities)[0])

        #adding polish_cities as polish_cities table in database.
        #pandas opens and closes its own connection when it is given the engine
        polish_cities.to_sql('polish_cities', con=self.engine, if_exists=if_exists)
        print('polish_cities table was added to', self.database, 'database')
        print('connection with the database was closed')

    @staticmethod
    def _clean_polish_provinces(raw_table):
        '''Returns the scraped provinces table reduced to Province, Population and Urbanisation [%].

        The three columns left after dropping are renamed by position. Population is converted
        into int and the urbanisation (text such as '64,5%') into float.
        raw_table itself is not modified.
        '''
        provinces = raw_table.drop(['Unnamed: 0', 'Obszar'], axis=1)
        #changing columns' names to english
        provinces.columns = ['Province', 'Population', 'Urbanisation [%]']
        #removing thousands separators (plain and non-breaking spaces) before the conversion to int
        provinces['Population'] = (provinces['Population'].astype(str)
                                   .str.replace(" ", "", regex=False)
                                   .str.replace("\xa0", "", regex=False))
        #decimal comma is changed into a dot and the percent sign is removed before the conversion to float
        provinces['Urbanisation [%]'] = (provinces['Urbanisation [%]'].astype(str)
                                         .str.replace(",", ".", regex=False)
                                         .str.replace("%", "", regex=False)
                                         .str.strip())
        return provinces.astype({'Province' : 'string', 'Population' : 'int', 'Urbanisation [%]' : 'float'})

    def scrape_and_add_polish_provinces_data(self, if_exists='fail'):
        '''Scrapes the table of polish provinces from https://www.polskawliczbach.pl/Wojewodztwa page
        and adds it as polish_provinces table in database.

        Args:
            if_exists: passed to DataFrame.to_sql: 'fail' raises ValueError when the table
                already exists, 'replace' recreates it, 'append' adds the rows to it.
        '''
        self._validate_if_exists(if_exists)
        #failing early, before scraping, when the database is unreachable
        self._check_connection()

        #defining url with data source
        self.url_polish_provinces = self.POLISH_PROVINCES_URL

        #scraping whole table using pd.read_html and cleaning it
        polish_provinces = self._clean_polish_provinces(pd.read_html(self.url_polish_provinces)[0])

        #adding polish_provinces as polish_provinces table in SQL database.
        #pandas opens and closes its own connection when it is given the engine
        polish_provinces.to_sql('polish_provinces', con=self.engine, if_exists=if_exists)
        print('polish_provinces table was added to SQL database')
        print('connection with the database was closed')

In [3]:
#creating the gatherer: asks for the SQL user name, password and database name and tests the connection
lotto = Data_gathering()

Enter SQL user name:  piotrsoczewka
Enter password:  ······
Enter SQL database name:  lotto


piotrsoczewka is connected to lotto database. Test of the connection was successful


In [4]:
#scraping prizes, winner locations and dates from megalotto.pl and storing them in the lottery_data table
lotto.scrape_and_add_lottery_data()

piotrsoczewka is connected to lotto database
lottery_data table was created in lotto database
lottery data (money prize, winner location, date) were added to lottery_data table
connection with the database was closed


In [5]:
#scraping the table of polish cities from polskawliczbach.pl and storing it as the polish_cities table
lotto.scrape_and_add_polish_cities_data()

piotrsoczewka is connected to lotto database
polish_cities table was added to lotto database
connection with the database was closed


In [6]:
#scraping the table of polish provinces from polskawliczbach.pl and storing it as the polish_provinces table
lotto.scrape_and_add_polish_provinces_data()

piotrsoczewka is connected to lotto database
polish_provinces table was added to SQL database
connection with the database was closed
