# Project web scraping


In [2]:
import requests
from bs4 import BeautifulSoup as Bsoup

# Object construction

In [625]:
class ITA_weather_downloader:
    '''  This downloader scrapes ilmeteo.it website. It downloads monthly tables with daily weather data in
         Italian cities for which data for each month in the chosen interval are available at the website.   '''
    def ___init___(self, start_year=1985, end_year=2019, partitions=2):
        
        ''' At the initialization of the object we can choose the start and end years. Data from January to December 
        of each year between the start and end year (included) will be then downloaded. The default values which we
        use in our project are 1985 and 2019, respectively.
        
        In addition option "partitions" gives into how many parts are the final links partitioned (with respect
        to the chosen cities). This will enable user to download data by parts in order to be more time efficient 
        or if they are satisfied with only incomplete data.
        
        Calling ITA_weather_downloader class automatically imports requests and BeautifulSoup.'''
        
        self.start_year=start_year
        self.end_year=end_year
        self.partitions=partitions
        self.basic_link="https://www.ilmeteo.it/portale/"
        import requests
        from bs4 import BeautifulSoup as Bsoup
    def __repr__(self):
        return f"Italian weather data for interval {self.start_year} - {self.end_year}."
       
    
    def region_pages(self):
        '''   This method finds links to the pages of the regions and their names.    '''
        self.link2="https://www.ilmeteo.it/portale/archivio-meteo/"
        beginning = requests.get(self.link2) 
        soup_beginning=Bsoup(beginning.text)
        horalka=soup_beginning.findAll("td")
        rumba=horalka[1].findAll("a")
        reg_partial_links=[x.get("href") for x in rumba]
        self.reg_links=[self.basic_link+region for region in reg_partial_links]   # complete links
        region_names=[x.text for x in rumba]                       # list of all regions
        self.region_names=region_names
        return self.reg_links
    
    def city_pages(self, regions=region_names):
        '''  This method finds links to the data pages for the cities and their names.
        List of names of the regions to which search should be limited can be provided. The default option is all regions. '''
        cities=[]
        cities_partial_links=[]
        indeces=[self.region_names.index(x) for x in regions]
        chosen_reg_links=[self.reg_link[x] for x in indeces]
        for reg_link in chosen_reg_links:
            reg_page = requests.get(reg_link)
            soup_reg_page=Bsoup(reg_page.text)
            tea=soup_reg_page.findAll("div",{"class":"block noborder"})
            cofee=tea[0].findAll("a",{"target":""})
            for x in cofee:
                cities_partial_links.append(x.get("href"))   # links
                cities.append(x.text)                  # city names
        self.cities=sorted(list(set(cities)))    # to get rid of the reccurent cities, setting alphabetical order
        cities_partial_links=sorted(list(set(cities_partial_links)))   # to get rid of the reccurent cities, setting alphabetical order
        self.cities_links=[self.basic_link+city_link for city_link in cities_partial_links]
        return self.cities_links
        
        


## PART 1 
## Getting links to the webpages for the chosen cities' data

In [3]:
'''  basic links   '''
link1="https://www.ilmeteo.it/portale/"
link2="https://www.ilmeteo.it/portale/archivio-meteo/"

In [4]:
'''   Finding links to the pages of the regions    '''
beginning = requests.get(link2) 
soup_beginning=Bsoup(beginning.text)
horalka=soup_beginning.findAll("td")
rumba=horalka[1].findAll("a")
reg_partial_links=[]
for x in rumba:
    reg_partial_links.append(x.get("href"))

reg_links=[]
for region in reg_partial_links:
    reg_links.append(link1+region)
    
'''  Region names  '''
region_names=[]
for x in rumba:
    region_names.append(x.text)

In [7]:
'''   Gerting links to the data pages for the cities  '''
cities=[]
cities_partial_links=[]
for reg_link in reg_links:
    reg_page = requests.get(reg_link) 
    soup_reg_page=Bsoup(reg_page.text)
    tea=soup_reg_page.findAll("div",{"class":"block noborder"})
    cofee=tea[0].findAll("a",{"target":""})
    for x in cofee:
        cities_partial_links.append(x.get("href"))   # links
        cities.append(x.text)                  # city names
        
cities=sorted(list(set(cities)))    # to get rid of the reccurent cities, setting alphabetical order
cities_partial_links=sorted(list(set(cities_partial_links)))   # to get rid of the reccurent cities, setting alphabetical order

cities_links=[]
for city_link in cities_partial_links:
    cities_links.append(link1+city_link)
    

### Note: for some reason some of the links lead to a no-data page (links are correct, we get the same result if we move directly on the Italian webpage)

## Filtering of the cities which have data available for the required time period.

In [149]:
'''  Getting links to the monthly data from the pages for the cities   '''
city_years=[]
city_months=[]
MONTH_links=[]
for city in cities_links:
    YEARS=[]
    MONTHS=[]
    page = requests.get(city)
    soup1= Bsoup(page.text)
    temps_dark = soup1.findAll("tr",{'class':'dark'})
    temps_light = soup1.findAll("tr",{'class':'light'})
    for x in temps_dark:
        tds=x.findAll("td")
        YEARS.append(tds[0].text)                              
        links=[y.get("href") for y in tds[1].findAll("a")]
        months=[y.text for y in tds[1].findAll("a")]
        MONTH_links.append(links)
        MONTHS.append({tds[0].text:months})
    for x in temps_light:
        tds=x.findAll("td")
        YEARS.append(tds[0].text)
        links=[y.get("href") for y in tds[1].findAll("a")]
        months=[y.text for y in tds[1].findAll("a")]
        MONTH_links.append(links)
        MONTHS.append({tds[0].text:months})
    city_years.append(sorted(YEARS))    # list of lists of years for each city during which at least some data were recorded
    city_months.append([y for x,y in sorted([(list(x.keys()),x) for x in MONTHS])])  # list of months from which data are available

## Getting links to the webpages with the data tables that we want  



In [203]:
'''     Making working links to the available monthly data-tables     ''' 
final_links=[]
for city in MONTH_links:
    for monthly_link in city:
        final_links.append(link1+monthly_link)
        

In [201]:
'''    Making sorted lists of cities and to them corresponding lists of years and months 
                                              for which at least some data are available     '''

good_cities=[]
good_months=[]
good_years=[]
for i in range(len(cities)):
    if len(city_years[i])!=0:
        good_cities.append(cities[i])
        good_months.append(city_months[i])
        good_years.append(city_years[i])


## Altogether we have 40,730 available links. We will subset links for the cities which have available data for each month during the 1985-2019 period.

In [403]:
''' Subsetting only cities which have data available for every month during the chosen period  '''
#month_list=[y for z in good_months for y in z]

start_year=1985
a=start_year
end_year=2019
chosen_years=[a]
while a<end_year:                # getting the list of chosen years
    chosen_years.append(a+1)
    a+=1                       

chosen_cities=[]
for city in good_cities:          # getting the list of cities for which all months in the chosen years are provided
    adidas=[]
    for year in chosen_years:
        string=city+"/"+str(year)+"/"+"Gennaio"
        indeces=[MONTH_links.index(y) for y in MONTH_links for x in y if string in x] #is link for the January of the chosen
        if len(indeces)!=0:                                                              # year in the list?
            index=indeces[0]
            if len(MONTH_links[index])==12:
                adidas.append(1)
    if sum(adidas)==len(chosen_years):
        chosen_cities.append(city)


## We get 52 cities. For 35 years we would have 21,840 monthly data tables. It seems that half of it would be quite enough, therefore we will take only every second city in the list.

In [406]:
chosen_cities1=chosen_cities[::2]

In [572]:
'''  Choosing final links for the chosen cities and years    ''' 
chosen_links=[]
for link in final_links:
    for city in chosen_cities1:
        for year in chosen_years:
            if city in link and str(year) in link:
                chosen_links.append(link)
    

In [417]:
''' Making lists of cities, years and months corresponding to the list of the chosen links  '''   
final_cities=[x.split("/")[5] for x in chosen_links]
final_years=[x.split("/")[6] for x in chosen_links]
final_months=[x.split("/")[7] for x in chosen_links]

## So in the end we get 10,920 links to each month in the 1985-2019 period for 26 Italian cities.

## PART 2
## Scraping of the data

In [10]:
import timeit
 

In [473]:
from datetime import datetime

In [484]:
'''       Here we have the algorithm for getting all of the data and variable names from the linked data-pages    '''
table=[]
for link in chosen_links:
    try:
        index=chosen_links.index(link)
        month=final_months[index]    # what month do we scrape?
        year=str(final_years[index])      # what year do we scrape?
        city=final_cities[index]    # what city do we scrape?
        Dark_values=[]
        Light_values=[]
        values=[]
        page = requests.get(link)             # getting linked webpage
        soup1= Bsoup(page.text)
        var=soup1.findAll("table")[3].findAll("th")    # find the list of variable names on the webpage
        variables=["city","year","month"]                    # making the list of variable names...
        [variables.append(x.text) for x in var]     #  ... (for each link separately to check if they are always the same)
        temps_dark = soup1.findAll("tr",{'class':'dark'})    # data are in a table with alternating dark 
        temps_light = soup1.findAll("tr",{'class':'light'})   # and light rows
        for x in temps_dark:
            tds=x.findAll("td")
            dark_value=[city,year,month]
            [dark_value.append(y.text) for y in tds]            # getting list of values from each dark row
            Dark_values.append(dark_value)
        for x in temps_light:
            tds=x.findAll("td")
            light_value=[city,year,month]
            [light_value.append(y.text) for y in tds]          # getting list of values from each light row
            Light_values.append(light_value)
        if len(temps_dark)==(len(temps_light)+1) or len(temps_dark)==len(temps_light):
            for i in range(len(temps_light)):                              
                values.append(Dark_values[i])                   # list of values for months with even number of days
                values.append(Light_values[i])                         
        else:
            for i in range(len(temps_light)):
                values.append(Dark_values[i])                   # list of values for months with odd number of days
                values.append(Light_values[i])
            values.append(Dark_values[len(temps_light)])
        tab={variables[i]:[x[i] for x in values] for i in range(len(variables))} # creating a monthly dictionary - variable:list_of_values
        table.append(tab)               # in the end, we get a list of monthly dictionaries
        if index%200==0:     # checking the progress
            print(index)
            now = datetime.now()
            current_time = now.strftime("%H:%M:%S")
            print(current_time)        
    except:
        print(chosen_links.index(link))     # if errors, where?

## There were some errors for 988th, 10,751th  and 10,775th links, so let's add them now.

In [493]:
add_links=[chosen_links[988],chosen_links[10751],chosen_links[10775]]
xtable=[]
for link in add_links:
    try:
        index=chosen_links.index(link)
        month=final_months[index]    # what month do we scrape?
        year=str(final_years[index])      # what year do we scrape?
        city=final_cities[index]    # what city do we scrape?
        Dark_values=[]
        Light_values=[]
        values=[]
        page = requests.get(link)             # getting linked webpage
        soup1= Bsoup(page.text)
        var=soup1.findAll("table")[3].findAll("th")    # find the list of variable names on the webpage
        variables=["city","year","month"]                    # making the list of variable names...
        [variables.append(x.text) for x in var]     #  ... (for each link separately to check if they are always the same)
        temps_dark = soup1.findAll("tr",{'class':'dark'})    # data are in a table with alternating dark 
        temps_light = soup1.findAll("tr",{'class':'light'})   # and light rows
        for x in temps_dark:
            tds=x.findAll("td")
            dark_value=[city,year,month]
            [dark_value.append(y.text) for y in tds]            # getting list of values from each dark row
            Dark_values.append(dark_value)
        for x in temps_light:
            tds=x.findAll("td")
            light_value=[city,year,month]
            [light_value.append(y.text) for y in tds]          # getting list of values from each light row
            Light_values.append(light_value)
        if len(temps_dark)==(len(temps_light)+1) or len(temps_dark)==len(temps_light):
            for i in range(len(temps_light)):                              
                values.append(Dark_values[i])                   # list of values for months with even number of days
                values.append(Light_values[i])                         
        else:
            for i in range(len(temps_light)):
                values.append(Dark_values[i])                   # list of values for months with odd number of days
                values.append(Light_values[i])
            values.append(Dark_values[len(temps_light)])
        tab={variables[i]:[x[i] for x in values] for i in range(len(variables))} # creating a monthly dictionary - variable:list_of_values
        xtable.append(tab)               # in the end, we get a list of monthly dictionaries
    except:
        print(chosen_links.index(link))

In [503]:
'''   Adding the missing table, not to be rerun    '''
#table.insert(988,xtable[0])     
#table.insert(10751,xtable[1])
#table.insert(10775,xtable[2])

In [540]:
'''  Are the name of the variables always the same (including their order)?    '''
sum([list(x.keys())==list(table[0].keys()) for x in table])==len(table)

True

In [193]:
import pandas as pd

In [506]:
'''    Creating list of dataframes - each month is a single dataframe  '''
list_of_DFs=[pd.DataFrame(x) for x in table]

In [566]:
'''     Exporting dataframes to the csv files        '''
df_paths=['d:\moje_dokumenty\Desktop\IES\semester 11\Python\project\data\ '+str(chosen_links.index(x))+".csv" for x in chosen_links]
[list_of_DFs[df_paths.index(x)].to_csv(x, index = None, header=True) for x in df_paths]

# Adding data from the second half of the cities which have well-reported data.
### For the time consumption reasons, we took data only from half of the cities. We can add the second half separately.

In [574]:
'''  Repeat the same for the other half of the cities '''
chosen_cities2=chosen_cities[1::2]
'''  Choosing final links for the chosen cities and years    ''' 
chosen_links2=[]
for link in final_links:
    for city in chosen_cities2:
        for year in chosen_years:
            if city in link and str(year) in link:
                chosen_links2.append(link)
''' Making lists of cities, years and months corresponding to the list of the second set of chosen links  '''   
final_cities2=[x.split("/")[5] for x in chosen_links2]
final_years2=[x.split("/")[6] for x in chosen_links2]
final_months2=[x.split("/")[7] for x in chosen_links2]

In [580]:
'''       Here we have the algorithm for getting all of the data and variable names from the linked data-pages    '''
table2=[]
for link in chosen_links2:
    try:
        index=chosen_links2.index(link)
        month=final_months2[index]    # what month do we scrape?
        year=str(final_years2[index])      # what year do we scrape?
        city=final_cities2[index]    # what city do we scrape?
        Dark_values=[]
        Light_values=[]
        values=[]
        page = requests.get(link)             # getting linked webpage
        soup1= Bsoup(page.text)
        var=soup1.findAll("table")[3].findAll("th")    # find the list of variable names on the webpage
        variables=["city","year","month"]                    # making the list of variable names...
        [variables.append(x.text) for x in var]     #  ... (for each link separately to check if they are always the same)
        temps_dark = soup1.findAll("tr",{'class':'dark'})    # data are in a table with alternating dark 
        temps_light = soup1.findAll("tr",{'class':'light'})   # and light rows
        for x in temps_dark:
            tds=x.findAll("td")
            dark_value=[city,year,month]
            [dark_value.append(y.text) for y in tds]            # getting list of values from each dark row
            Dark_values.append(dark_value)
        for x in temps_light:
            tds=x.findAll("td")
            light_value=[city,year,month]
            [light_value.append(y.text) for y in tds]          # getting list of values from each light row
            Light_values.append(light_value)
        if len(temps_dark)==(len(temps_light)+1) or len(temps_dark)==len(temps_light):
            for i in range(len(temps_light)):                              
                values.append(Dark_values[i])                   # list of values for months with even number of days
                values.append(Light_values[i])                         
        else:
            for i in range(len(temps_light)):
                values.append(Dark_values[i])                   # list of values for months with odd number of days
                values.append(Light_values[i])
            values.append(Dark_values[len(temps_light)])
        tab={variables[i]:[x[i] for x in values] for i in range(len(variables))} # creating a monthly dictionary - variable:list_of_values
        table2.append(tab)               # in the end, we get a list of monthly dictionaries
        if index%200==0:     # checking the progress
            print(index)
            now = datetime.now()
            current_time = now.strftime("%H:%M:%S")
            print(current_time)
    except:
        print(chosen_links2.index(link))

0
14:32:59
200
14:35:43
400
14:38:18
600
14:40:53
800
14:43:31
1000
14:46:01
1200
14:48:29
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449

 ## There were some errors for links 1268-1789. We will try to add them separately.

In [590]:
ytable2=[]
for link in chosen_links2[1268:1790]:
    try:
        index=chosen_links2.index(link)
        month=final_months2[index]    # what month do we scrape?
        year=str(final_years2[index])      # what year do we scrape?
        city=final_cities2[index]    # what city do we scrape?
        Dark_values=[]
        Light_values=[]
        values=[]
        page = requests.get(link)             # getting linked webpage
        soup1= Bsoup(page.text)
        var=soup1.findAll("table")[3].findAll("th")    # find the list of variable names on the webpage
        variables=["city","year","month"]                    # making the list of variable names...
        [variables.append(x.text) for x in var]     #  ... (for each link separately to check if they are always the same)
        temps_dark = soup1.findAll("tr",{'class':'dark'})    # data are in a table with alternating dark 
        temps_light = soup1.findAll("tr",{'class':'light'})   # and light rows
        for x in temps_dark:
            tds=x.findAll("td")
            dark_value=[city,year,month]
            [dark_value.append(y.text) for y in tds]            # getting list of values from each dark row
            Dark_values.append(dark_value)
        for x in temps_light:
            tds=x.findAll("td")
            light_value=[city,year,month]
            [light_value.append(y.text) for y in tds]          # getting list of values from each light row
            Light_values.append(light_value)
        if len(temps_dark)==(len(temps_light)+1) or len(temps_dark)==len(temps_light):
            for i in range(len(temps_light)):                              
                values.append(Dark_values[i])                   # list of values for months with even number of days
                values.append(Light_values[i])                         
        else:
            for i in range(len(temps_light)):
                values.append(Dark_values[i])                   # list of values for months with odd number of days
                values.append(Light_values[i])
            values.append(Dark_values[len(temps_light)])
        tab={variables[i]:[x[i] for x in values] for i in range(len(variables))} # creating a monthly dictionary - variable:list_of_values
        ytable2.append(tab)               # in the end, we get a list of monthly dictionaries
        if index%200==0:     # checking the progress
            print(index)
            now = datetime.now()
            current_time = now.strftime("%H:%M:%S")
            print(current_time)
    except:
        print(chosen_links2.index(link))

1400
17:10:18
1600
17:11:46


In [605]:
#[table2.insert(1268+i,ytable2[i]) for i in range(len(ytable2))]
#[table2.remove(x) for x in table2[988:988+len(ytable2)]]

In [606]:
'''  Are the name of the variables always the same (including their order)?    '''
sum([list(x.keys())==list(table2[0].keys()) for x in table2])==len(table2)

True

In [607]:
'''    Creating list of dataframes - each month is a single dataframe  '''
list_of_DFs2=[pd.DataFrame(x) for x in table2]

In [608]:
'''     Exporting dataframes to the csv files        '''
df_paths2=['d:\moje_dokumenty\Desktop\IES\semester 11\Python\project\data2\ '+str(chosen_links2.index(x))+".csv" for x in chosen_links2]
[list_of_DFs2[df_paths2.index(x)].to_csv(x, index = None, header=True) for x in df_paths2]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

# Things to do: 
## analysis of the data (in a new Jupyter notebook file) - first of all to process them: 
*  ##  delete %, km/h etc. from the values in dataframes +  to transform strings into numerical values (floats) for appropriate variables, eg "6.6 °C" (string) into 6.6 (float)
* ## connected to the previous point, to replace missing values with monthly averages of the corresponding variable (or to drop the observation?)
* ## adding new data column to dataframes: daily range of temperature as Tmax-Tmin
* ## deleting unneccesary data columns from dataframes - Umidita, Raffica, Fenomeni,  Info (perhaps not really needed) 
* ## transliting Italian names of the variables (columns) into English, perhaps also name of the months inside of the dataframes
* ## then we can compute quarterly(?) (ie January to March, April to June...) means and variances for each city

## Regarding work done here, it will be needed to convert the code into objects and functions so it is object-oriented

# Previous versions - garbage bin

In [77]:
'''  basic links   '''
link1="https://www.ilmeteo.it/portale/"
link2="https://www.ilmeteo.it/portale/archivio-meteo/"

''' webpage with the links to the cities' data'''  # we chooses cities based on what is shown on this page
start = requests.get("https://www.ilmeteo.it/portale/archivio-meteo/Piemonte") 

In [269]:
beginning = requests.get(link2) 
soup_beginning=Bsoup(beginning.text)
horalka=soup_beginning.findAll("td")
rumba=horalka[1].findAll("a")
regions=[]
for x in rumba:
    regions.append(x.get("href"))

reg_links=[]
for region in regions:
    reg_links.append(link1+region)

In [311]:
'''   Gerting links to the data pages for the cities  '''
cities1=[]
for region in reg_links:
    reg_page = requests.get(region) 
    soup_reg_page=Bsoup(reg_page.text)
    tea=soup_reg_page.findAll("div",{"class":"block noborder"})
    cofee=tea[0].findAll("a",{"target":""})
    for x in cofee:
        cities1.append(x.text)
        
cities1=list(set(cities1))    # to get rid of the reccurent cities

In [80]:
'''   Gerting links to the data pages for the cities  '''
soup_start=Bsoup(start.text)

cola=soup_start.findAll("tr")

pepsi=cola[0].findAll("a")

#links=[]
#for city in pepsi:
#    links.append(city.get("href"))

#links
'''   Names of the cities '''
cities=[]
for city in pepsi:
    cities.append(city.text)

#print(links)
print(cities)   # do we get what we want?

['Torino', 'Genova', 'Milano', 'Trento', 'Venezia', 'Trieste', 'Bologna', 'Firenze', 'Ancona', 'Perugia', "L'Aquila", 'Roma', 'Campobasso', 'Prato', 'Bari', 'Napoli', 'Potenza', 'Catanzaro', 'Palermo', 'Cagliari', 'Catania']


In [81]:
'''   Links to the cities' webpages  '''
cities_data_pages=[]
for city in cities:       # we actually do not need actual links from the previous step as the links are just combinations of 
    cities_data_pages.append(link2+city) # "link2" and the name of the city (+ links from previous step give grammatical errors)
cities_data_pages   # do we get what we want?

['https://www.ilmeteo.it/portale/archivio-meteo/Torino',
 'https://www.ilmeteo.it/portale/archivio-meteo/Genova',
 'https://www.ilmeteo.it/portale/archivio-meteo/Milano',
 'https://www.ilmeteo.it/portale/archivio-meteo/Trento',
 'https://www.ilmeteo.it/portale/archivio-meteo/Venezia',
 'https://www.ilmeteo.it/portale/archivio-meteo/Trieste',
 'https://www.ilmeteo.it/portale/archivio-meteo/Bologna',
 'https://www.ilmeteo.it/portale/archivio-meteo/Firenze',
 'https://www.ilmeteo.it/portale/archivio-meteo/Ancona',
 'https://www.ilmeteo.it/portale/archivio-meteo/Perugia',
 "https://www.ilmeteo.it/portale/archivio-meteo/L'Aquila",
 'https://www.ilmeteo.it/portale/archivio-meteo/Roma',
 'https://www.ilmeteo.it/portale/archivio-meteo/Campobasso',
 'https://www.ilmeteo.it/portale/archivio-meteo/Prato',
 'https://www.ilmeteo.it/portale/archivio-meteo/Bari',
 'https://www.ilmeteo.it/portale/archivio-meteo/Napoli',
 'https://www.ilmeteo.it/portale/archivio-meteo/Potenza',
 'https://www.ilmeteo.it

In [None]:
#for city_link in cities1_links:
#    city_page = requests.get(city_link)
#    page_text=Bsoup(city_page.text)  ...


## Actaully, we can get the whole link by just combining the name of the city with the year, month and day:

In [9]:
''' List of years that we want to see  ''' 
years=[]
for year in range(1985,2020):
    years.append(str(year))

    ''' Constructing the "calendar" as a dictionary  ''' 
month_names=["Gennaio","Febbraio","Marzo","Aprile","Maggio","Giugno","Luglio","Agosto","Settembre","Ottobre","Novembre","Dicembre"]
zero=list(range(1,31))
one=list(range(1,32))
feb1=list(range(1,29))
feb2=list(range(1,30))
days=[one,zero,one,zero,one,zero,one,one,zero,one,zero,one]  
##  allocating the correct number of days to each month (except February) 
calendar={month_names[i]:days[i] for i in range(12)}   

'''  Construction of the final links to the wanted data tables  ''' 
table_links=[]
for city_link in cities1_links:
    for year in years:
        if int(year)%4==0:
            calendar["Febbraio"]=feb2                # adjusting February days in a leap year
            for month in list(calendar.keys()):
                for day in calendar[month]:
                    table_links.append(city_link+"/"+year+"/"+month+"/"+str(day))
        else:
            calendar["Febbraio"]=feb1                # adjusting February days in a non-leap year
            for month in list(calendar.keys()):
                for day in calendar[month]:
                    table_links.append(city_link+"/"+year+"/"+month+"/"+str(day))

In [12]:

%%timeit
final_values=[]
final_variables=[]
for table in table_links[15900:15900+200]:
    page = requests.get(table)
    page_soup= Bsoup(page.text) 
    temps_dark = page_soup.findAll("tr",{'class':'dark'})     # data are inside of a dark-light table
    temps_light = page_soup.findAll("tr",{'class':'light'})
    variables=[]
    values=[]
    for x in temps_dark:
        tds=x.findAll("td")                        
        variables.append(tds[0].text)           # first column of the table gives variable name
        values.append(tds[1].text)               # second column of the table gives the value
    for y in temps_light:
        tds=y.findAll("td")
        variables.append(tds[0].text)
        values.append(tds[1].text)
    final_values.append(values)
    final_variables.append(variables)



29.3 s ± 17.8 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [608]:
len(table_links)


357924

In [69]:
''' getting html (or what it is) of our webpage - Rome, June 2011 '''

r = requests.get("https://www.ilmeteo.it/portale/archivio-meteo/Roma/2001/Maggio") 

In [227]:

soup1= Bsoup(r.text) 
temps_dark = soup1.findAll("tr",class_="dark")
temps_light = soup1.findAll("tr",{'class':'light'})
table1=[(x.text,y.text) for x,y in list(zip(temps_dark,temps_light))]
#table2=[x.text for x in temps_light]
#table=list(zip(table1,table2))
table=[]
for i in range(len(temps_light)): 
    table.append(temps_dark[i].text)
    table.append(temps_light[i].text)
    

In [621]:
''' getting rid of non-numerical text '''
import re
for i in range(len(table)):
    table[i]=re.sub("[^0-9]"," ",table[i])
table

['121 3   10   29    43  9 4              ',
 '222 6   13   30    45  16 5              ',
 '323 7   14   29    48  14 8              ',
 '423 6   12   30    42  11 1              ',
 '522 8   12   30    49  11 1              ',
 '623 3   11   31    50  14 8              ',
 '723 3   15   30    60  13              ',
 '820 3   13   24    57  22 2              ',
 '922 6   14   29    33  11 1              ',
 '1021 6   10   29    44  7 6              ',
 '1119 9   10   27      74  9 4       ',
 '1220 6   16   28      81  35 2     63      ',
 '1321 8   14   26    62  14 8              ',
 '1418 8   17   23      83  5 4       ',
 '1520   16   24      76  7 6       ',
 '1621 3   13   27    61  14 8              ',
 '1722 3   13   29    54  5 4              ',
 '1822 7   13   29    61  16 5              ',
 '1921 7   12   29    65  5 4              ',
 '2022 3   13   28    64  7 6       ',
 '2122 7   17   27    69  7 6              ',
 '2223 4   18   29    69  13              ',
 '2320 5   

In [622]:
''' getting rid of the day of the month number so it does not mix up with median temperature '''

# !! important!! -> do not repeat this code twice - if yes then we need to redefine table from the beginning (cell 3)

for i in range(9):
    table[i]=table[i][1:]
for i in range(9,len(table)):
    table[i]=table[i][2:]
table    

['21 3   10   29    43  9 4              ',
 '22 6   13   30    45  16 5              ',
 '23 7   14   29    48  14 8              ',
 '23 6   12   30    42  11 1              ',
 '22 8   12   30    49  11 1              ',
 '23 3   11   31    50  14 8              ',
 '23 3   15   30    60  13              ',
 '20 3   13   24    57  22 2              ',
 '22 6   14   29    33  11 1              ',
 '21 6   10   29    44  7 6              ',
 '19 9   10   27      74  9 4       ',
 '20 6   16   28      81  35 2     63      ',
 '21 8   14   26    62  14 8              ',
 '18 8   17   23      83  5 4       ',
 '20   16   24      76  7 6       ',
 '21 3   13   27    61  14 8              ',
 '22 3   13   29    54  5 4              ',
 '22 7   13   29    61  16 5              ',
 '21 7   12   29    65  5 4              ',
 '22 3   13   28    64  7 6       ',
 '22 7   17   27    69  7 6              ',
 '23 4   18   29    69  13              ',
 '20 5   19   26      86  11 1       ',
 '22 2

In [623]:
''' transforming table (which is truly a list of numbers and spaces) into a list of lists of individual values '''

list_all=[]
for row in table:
    list_all.append(row.split())

In [624]:
''' our values are still string characters, we want numbers '''
for x in list_all:
    for j in range(len(x)):
        x[j]=float(x[j])


In [625]:
list_all

[[21.0, 3.0, 10.0, 29.0, 43.0, 9.0, 4.0],
 [22.0, 6.0, 13.0, 30.0, 45.0, 16.0, 5.0],
 [23.0, 7.0, 14.0, 29.0, 48.0, 14.0, 8.0],
 [23.0, 6.0, 12.0, 30.0, 42.0, 11.0, 1.0],
 [22.0, 8.0, 12.0, 30.0, 49.0, 11.0, 1.0],
 [23.0, 3.0, 11.0, 31.0, 50.0, 14.0, 8.0],
 [23.0, 3.0, 15.0, 30.0, 60.0, 13.0],
 [20.0, 3.0, 13.0, 24.0, 57.0, 22.0, 2.0],
 [22.0, 6.0, 14.0, 29.0, 33.0, 11.0, 1.0],
 [21.0, 6.0, 10.0, 29.0, 44.0, 7.0, 6.0],
 [19.0, 9.0, 10.0, 27.0, 74.0, 9.0, 4.0],
 [20.0, 6.0, 16.0, 28.0, 81.0, 35.0, 2.0, 63.0],
 [21.0, 8.0, 14.0, 26.0, 62.0, 14.0, 8.0],
 [18.0, 8.0, 17.0, 23.0, 83.0, 5.0, 4.0],
 [20.0, 16.0, 24.0, 76.0, 7.0, 6.0],
 [21.0, 3.0, 13.0, 27.0, 61.0, 14.0, 8.0],
 [22.0, 3.0, 13.0, 29.0, 54.0, 5.0, 4.0],
 [22.0, 7.0, 13.0, 29.0, 61.0, 16.0, 5.0],
 [21.0, 7.0, 12.0, 29.0, 65.0, 5.0, 4.0],
 [22.0, 3.0, 13.0, 28.0, 64.0, 7.0, 6.0],
 [22.0, 7.0, 17.0, 27.0, 69.0, 7.0, 6.0],
 [23.0, 4.0, 18.0, 29.0, 69.0, 13.0],
 [20.0, 5.0, 19.0, 26.0, 86.0, 11.0, 1.0],
 [22.0, 2.0, 13.0, 27.0, 53.0