# In this notebook, wikipedia has been scrapped using beautiful soup in order to gather information about different cities

In [2]:
#Importing the necessary libraries

import pandas as pd
from bs4 import BeautifulSoup
import requests
import re

In [2]:
!pip install --upgrade beautifulsoup4



In [458]:
# testing to check if everyhting is working
web_site = 'https://en.wikipedia.org/wiki/Malaga'

In [460]:
response = requests.get(web_site)
soup = BeautifulSoup(response.content, 'html.parser')

In [461]:
soup

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Málaga - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"9acb8cd5-be8e-4500-9604-b55597628460","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Málaga","wgTitle":"Málaga","wgCurRevisionId":1115466856,"wgRevisionId":1115466856,"wgArticleId":184763,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Webarchive template wayback links","CS1 Spanish-language sources (es)","CS1 European Spanish-language sources (es-es)","All articles with dead external links","Articles with 

In [10]:
# Response 200, so everything seems to be working

response

<Response [200]>

In [15]:
# Scrapping to get the name of the city

soup.select('#firstHeading')[0].get_text()

'Berlin'

In [90]:
# Country code

soup.select('td.nickname')[1].get_text()

'DE-BE'

In [881]:
# Population timestamp

soup.select('.infobox-header>.ib-settlement-fn')[2].get_text()

'\xa0(2018)[3]'

In [3]:
# Creating a function to gather all the necessary info from Wikipedia such as lat and lon, population and timestamp pop.

def city_data(cities):
  info_cities = []
    
  for city in cities:
    url = f'https://en.wikipedia.org/wiki/{city}'

    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')

    response_dict = {}
    response_dict['city'] = soup.select('#firstHeading')[0].get_text()
    response_dict['lat'] = soup.select('.geo-dms .latitude')[0].get_text()
    response_dict['lon'] = soup.select('.geo-dms .longitude')[0].get_text()
    response_dict['population'] =  soup.select_one('th.infobox-header:-soup-contains("Population")')
    .parent.find_next_sibling().find(text=re.compile(r'\d+'))
    response_dict['timestamp_population'] = soup.select('.infobox-header>.ib-settlement-fn')[2].get_text()
    info_cities.append(response_dict)
    
  cities_df = pd.DataFrame(info_cities)
  cities_df['lat'] = cities_df['lat'].str.split('″').str[0].str.replace('°', '.', regex=False)
    .str.replace('′', '.', regex=False)
  cities_df['lon'] = cities_df['lon'].str.split('″').str[0].str.replace('°', '.', regex=False)
    .str.replace('′', '.', regex=False)

    
  return cities_df

In [4]:
# Creating a list with the cities that I want to include in the dataframe

list_of_cities = ['Madrid','Barcelona','Malaga','Berlin','Hamburg','London']

In [5]:
city_data_frame = city_data(list_of_cities)

In [6]:
# Since the latitude and longitude of each city are in GPS coordinates, they need to be converted to lat and lon
# so, first thing is to stripped the north and east coordinates in letter as well as the period.

city_data_frame['lat']= city_data_frame['lat'].str.replace('.N','', regex=True)

In [7]:
city_data_frame['lon'] = city_data_frame['lon'].str.replace('.E','', regex=True)

In [13]:
# GPS coordinates are expressed differently from latitude and longitude, so the first thing is to divide the minutes
# by 60, then the seconds by 3600 and add everything to the degrees to get lat and lon

city_data_frame['lat'][0] = round(int(city_data_frame['lat'][0][0:2]) + 
                                  int(city_data_frame['lat'][0][3:5])/60 + int(city_data_frame['lat'][0][6:8])/3600, 4)
city_data_frame['lat'][1] = round(int(city_data_frame['lat'][1][0:2]) + 
                                  int(city_data_frame['lat'][1][3:5])/60, 4)
city_data_frame['lat'][2] = round(int(city_data_frame['lat'][2][0:2]) + 
                                  int(city_data_frame['lat'][2][3:5])/60 + int(city_data_frame['lat'][2][6:8])/3600, 4)
city_data_frame['lat'][3] = round(int(city_data_frame['lat'][3][0:2]) + 
                                  int(city_data_frame['lat'][3][3:5])/60 + int(city_data_frame['lat'][3][6:8])/3600, 4)
city_data_frame['lat'][4] = round(int(city_data_frame['lat'][4][0:2]) + 
                                  int(city_data_frame['lat'][4][3:5])/60 + int(city_data_frame['lat'][4][6:8])/3600, 4)
city_data_frame['lat'][5] = round(int(city_data_frame['lat'][5][0:2]) + 
                                  int(city_data_frame['lat'][5][3:5])/60 + int(city_data_frame['lat'][5][6:8])/3600, 4)

In [19]:
city_data_frame['lon'][0] = round(int(city_data_frame['lon'][0][0:2]) + 
                                  int(city_data_frame['lon'][0][3:5])/60 + int(city_data_frame['lon'][0][6:8])/3600, 4)
city_data_frame['lon'][1] = int(city_data_frame['lon'][1][0:1]) + 
                                  int(city_data_frame['lon'][1][2:4])/60
city_data_frame['lon'][2] = round(int(city_data_frame['lon'][2][0:1]) + 
                                  int(city_data_frame['lon'][2][2:4])/60 + int(city_data_frame['lon'][2][5:7])/3600, 4)
city_data_frame['lon'][3] = round(int(city_data_frame['lon'][3][0:2]) + 
                                  int(city_data_frame['lon'][3][3:5])/60 + int(city_data_frame['lon'][3][6:8])/3600, 4)
city_data_frame['lon'][4] = round(int(city_data_frame['lon'][4][0:2]) + 
                                  int(city_data_frame['lon'][4][3:5])/60 + int(city_data_frame['lon'][4][6:8])/3600, 4)
city_data_frame['lon'][5] = round(int(city_data_frame['lon'][5][0:1]) + 
                                  int(city_data_frame['lon'][5][2:3])/60 + int(city_data_frame['lon'][5][4:6])/3600, 4)


In [20]:
city_data_frame

Unnamed: 0,city,lat,lon,population,timestamp_population
0,Madrid,40.4167,3.7025,3223334,(2018)[3]
1,Barcelona,41.3833,2.183333,1620343,(2018)[5]
2,Málaga,36.7194,4.42,571026,(2018)[3]
3,Berlin,52.52,13.405,3769495,(31 December 2020)[2]
4,Hamburg,53.55,10.0,1845229,(30 June 2020)[2]
5,London,51.5072,0.1275,9002488,(2021)[5]


In [21]:
# Now, timestamp population need to be cleanned, first by removing the parentesis, and then by removing the
# reference number.the 

city_data_frame['timestamp_population'] = city_data_frame['timestamp_population'].str.replace('(','',regex=True).str.replace(')','',regex=True)

In [22]:
city_data_frame['timestamp_population'] = city_data_frame['timestamp_population'].str.replace('31 December','',regex=True).str.replace('30 June','',regex=True)

In [23]:
# Here, the longitude for Malaga is a negative coordinate, hence:

city_data_frame['lon'][2] = city_data_frame['lon'][2]*(-1)

In [24]:
city_data_frame['timestamp_population'] = city_data_frame['timestamp_population'].str.replace('3]','',regex=True).str.replace('5]','',regex=True).str.replace('2]','',regex=True).str.replace('[','',regex=True)

In [25]:
city_data_frame

Unnamed: 0,city,lat,lon,population,timestamp_population
0,Madrid,40.4167,3.7025,3223334,2018
1,Barcelona,41.3833,2.183333,1620343,2018
2,Málaga,36.7194,-4.42,571026,2018
3,Berlin,52.52,13.405,3769495,2020
4,Hamburg,53.55,10.0,1845229,2020
5,London,51.5072,0.1275,9002488,2021


Unnamed: 0,city,lat,lon,population,timestamp_population
0,Madrid,40.4167,3.7025,3223334,2018
1,Barcelona,41.3833,2.183333,1620343,2018
2,Málaga,36.7194,-4.42,571026,2018
3,Berlin,52.52,13.405,3769495,2020
4,Hamburg,53.55,10.0,1845229,2020
5,London,51.5072,0.1275,9002488,2021


In [26]:
# The city_data_frame into two dataframes, one for cities and the other for population, which will be uploaded to MySQL.

cities = city_data_frame[['city','lat','lon']]

In [27]:
population = city_data_frame[['city', 'population','timestamp_population']]

In [28]:
value = [1,2,3,4,5,6]
population.insert(0,'city_id',value)

In [29]:
value = [1,2,3,4,5,6]
cities.insert(0,'city_id',value)

In [30]:
cities

Unnamed: 0,city_id,city,lat,lon
0,1,Madrid,40.4167,3.7025
1,2,Barcelona,41.3833,2.183333
2,3,Málaga,36.7194,-4.42
3,4,Berlin,52.52,13.405
4,5,Hamburg,53.55,10.0
5,6,London,51.5072,0.1275


In [31]:
population

Unnamed: 0,city_id,city,population,timestamp_population
0,1,Madrid,3223334,2018
1,2,Barcelona,1620343,2018
2,3,Málaga,571026,2018
3,4,Berlin,3769495,2020
4,5,Hamburg,1845229,2020
5,6,London,9002488,2021


In [36]:
# Assigning the correct datatypes to each dataframe

cities['lat'].astype('float64') 

0    40.4167
1    41.3833
2    36.7194
3    52.5200
4    53.5500
5    51.5072
Name: lat, dtype: float64

In [38]:
cities['lon'].astype('float64') 

0     3.702500
1     2.183333
2    -4.420000
3    13.405000
4    10.000000
5     0.127500
Name: lon, dtype: float64

In [39]:
cities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   city_id  6 non-null      int64  
 1   city     6 non-null      object 
 2   lat      6 non-null      float64
 3   lon      6 non-null      float64
dtypes: float64(2), int64(1), object(1)
memory usage: 320.0+ bytes


In [40]:
# Here, sqlalchemy and pymysql have been imported to upload both dataframes to MySQL tables that were created beforehand.

import sqlalchemy
import pymysql

In [41]:
schema="GansWeatherDB"
host="127.0.0.1"
user="My_root"
password="My_pass"
port=3306
con = f'mysql+pymysql://{user}:{password}@{host}:{port}/{schema}'

In [42]:
cities.dropna().to_sql('cities', if_exists='append', con=con, index=False)

In [45]:
cities.to_csv('cities.csv')

In [47]:
cities = pd.read_csv('cities.csv', index_col=[0])

In [49]:
cities

Unnamed: 0,city_id,city,lat,lon
0,1,Madrid,40.4167,3.7025
1,2,Barcelona,41.3833,2.183333
2,3,Málaga,36.7194,-4.42
3,4,Berlin,52.52,13.405
4,5,Hamburg,53.55,10.0
5,6,London,51.5072,0.1275


In [56]:
import sqlalchemy
import pymysql

In [57]:
schema="GansWeatherDB"
host="127.0.0.1"
user="My_User"
password="My_pass"
port=3306
con = f'mysql+pymysql://{user}:{password}@{host}:{port}/{schema}'

In [58]:
cities.dropna().to_sql('population', if_exists='append', con=con, index=False)