In [1]:
import datetime as dt
import os

import numpy as np
import pandas as pd
import requests

# PyMySQL 
import pymysql
pymysql.install_as_MySQLdb()

# SQL Alchemy
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect

In [2]:
# English Premier League: parse the HTML tables of the Wikipedia club list
UK_league_url = "https://en.wikipedia.org/wiki/List_of_Premier_League_clubs"
UK_all_tables = pd.read_html(UK_league_url, header=0)
# The club list is taken from the first table returned for the page
UK_table = UK_all_tables[0]
# Independent copy holding only the club name and its location
UK_club_city_df = UK_table.loc[:, ['Club', 'Location']].copy()
UK_club_city_df.head()

Unnamed: 0,Club,Location
0,Arsenal,London (Highbury)
1,Aston Villa,Birmingham (Aston)
2,Barnsley,Barnsley
3,Birmingham City,Birmingham (Bordesley)
4,Blackburn Rovers,Blackburn


In [3]:
# German football leagues: parse the HTML tables of the Wikipedia club list
GER_league_url = "https://en.wikipedia.org/wiki/List_of_football_clubs_in_Germany"
GER_all_tables = pd.read_html(GER_league_url, header=0)
# The club list is taken from the first table returned for the page
GER_table = GER_all_tables[0]
# Keep name and city, then align the column names with the other leagues
# (Club / Location); index labels are converted to strings by the rename
GER_club_city_df = (
    GER_table.loc[:, ['Name', 'City']]
    .copy()
    .rename(index=str, columns={"Name": "Club", "City": "Location"})
)
GER_club_city_df.head()

Unnamed: 0,Club,Location
0,VfV 06 Hildesheim,Hildesheim
1,VfL 07 Bremen,Bremen
2,FSV 08 Bissingen,Bietigheim-Bissingen
3,TSV 1860 Munich,Munich
4,TSV 1860 Rosenheim,Rosenheim


In [4]:
# Get teams and cities of Spanish football leagues.
# Per the original note: table 1 is the tier 1 league, table 2 the tier 2 league.
ESP_league_url = "https://en.wikipedia.org/wiki/List_of_football_clubs_in_Spain"
ESP_all_tables = pd.read_html(ESP_league_url, header=0)

# Tier 1: keep club and home city, renaming the city column to 'Location'
ESP_table = ESP_all_tables[0]
ESP_club_city_df = ESP_table[['Club', 'Home city']].copy()
ESP_club_city_df = ESP_club_city_df.rename(index=str, columns={"Home city": "Location"})

# Tier 2: same selection and renaming on the second table
ESP_table_2 = ESP_all_tables[1]
ESP_club_city_df_2 = ESP_table_2[['Club', 'Home city']].copy()
ESP_club_city_df_2 = ESP_club_city_df_2.rename(index=str, columns={"Home city": "Location"})

# Only the last expression of a cell is rendered, so a single preview is kept
ESP_club_city_df_2.head()

Unnamed: 0,Club,Location
0,Albacete,Albacete
1,Alcorcón,Alcorcón
2,Almería,Almería
3,Cádiz,Cádiz
4,Córdoba,Córdoba


In [5]:
# Get teams and cities of Italian football leagues.
# Per the original note: table 1 is the tier 1 league, table 2 the tier 2 league.
ITA_league_url = "https://en.wikipedia.org/wiki/List_of_football_clubs_in_Italy"
ITA_all_tables = pd.read_html(ITA_league_url, header=0)

# Tier 1: keep team and home city, renamed to the shared Club / Location columns
ITA_table = ITA_all_tables[0]
ITA_club_city_df = ITA_table[['Team', 'Home city']].copy()
ITA_club_city_df = ITA_club_city_df.rename(index=str, columns={"Team": "Club", "Home city": "Location"})

# Tier 2: same selection and renaming on the second table
ITA_table_2 = ITA_all_tables[1]
ITA_club_city_df_2 = ITA_table_2[['Team', 'Home city']].copy()
ITA_club_city_df_2 = ITA_club_city_df_2.rename(index=str, columns={"Team": "Club", "Home city": "Location"})

# Only the last expression of a cell is rendered, so a single preview is kept
ITA_club_city_df_2.head()

Unnamed: 0,Club,Location
0,Ascoli,Ascoli Piceno
1,Benevento,Benevento
2,Brescia,Brescia
3,Carpi,Carpi
4,Cittadella,Cittadella


In [6]:
# Stack every league's Club/Location frame into one dataframe
merged_df = pd.concat(
    [
        UK_club_city_df,
        GER_club_city_df,
        ESP_club_city_df,
        ESP_club_city_df_2,
        ITA_club_city_df,
        ITA_club_city_df_2,
    ]
)
# Replace the per-league index labels with a fresh running index;
# the old labels end up in an extra 'index' column
merged_index_reset_df = merged_df.reset_index()
# Keep only the two data columns, which drops that 'index' column
consolidated_df = merged_index_reset_df.loc[:, ['Club', 'Location']].copy()
consolidated_df.head()

Unnamed: 0,Club,Location
0,Arsenal,London (Highbury)
1,Aston Villa,Birmingham (Aston)
2,Barnsley,Barnsley
3,Birmingham City,Birmingham (Bordesley)
4,Blackburn Rovers,Blackburn


In [7]:
# Clean up Club and Location text.
# Vectorised .str operations are used so that missing values (NaN) pass through
# untouched instead of raising, as row-wise str.rstrip / slicing would.

# Remove trailing asterisks in club name
consolidated_df['Club'] = consolidated_df['Club'].str.rstrip('*')

# Remove bracketed suffix in club name: keep only the text before the first '['
sub = '['
# Position of the first '[' in Club (-1 when absent); kept as a helper column
consolidated_df["location_left_bracket"] = consolidated_df["Club"].str.find(sub)
consolidated_df['Club'] = consolidated_df['Club'].str.split(sub, n=1).str[0]

# Remove parenthesised suffix in location name, e.g. "London (Highbury)" -> "London"
sub = '('
# Position just before the first '(' (-2 when absent); kept as a helper column
consolidated_df["location_left_parentheses"] = consolidated_df["Location"].str.find(sub) - 1
# Drop everything from the first '(' onwards, plus any whitespace in front of it.
# Slicing at find('(') - 1 would assume exactly one space before the parenthesis:
# "City(x)" would lose its last letter, and a value starting with '(' would be
# sliced with index -1. Values without '(' are left unchanged.
consolidated_df['Location'] = consolidated_df['Location'].str.replace(r'(?s)\s*\(.*', '', regex=True)

consolidated_df.head()

Unnamed: 0,Club,Location,location_left_bracket,location_left_parentheses
0,Arsenal,London,-1,6
1,Aston Villa,Birmingham,-1,10
2,Barnsley,Barnsley,-1,-2
3,Birmingham City,Birmingham,-1,10
4,Blackburn Rovers,Blackburn,-1,-2


In [8]:
# Tweak club names to match with soccer stats data and GDP data.
# Maps the scraped club name (key) to the name to use instead (value).
# NOTE(review): a few keys/values contain unusual character sequences; they are
# kept exactly as written because they presumably mirror the spelling in the
# datasets being matched -- confirm against those files.
Value_to_tweak = {
    'Hertha BSC': 'Hertha BSC Berlin',
    'Sevilla': 'Sevilla FC',
    'Las Palmas': 'UD Las Palmas',
    'Athletic Bilbao': 'Athletic Club de Bilbao',
    'Chievo': 'Chievo Verona',
    'Levante': 'Levante UD',
    'Real Betis': 'Real Betis Balompi√©',
    'Real Madrid': 'Real Madrid CF',
    'Barcelona': 'FC Barcelona',
    'Internazionale': 'Inter',
    'Atl√É¬©tico Madrid': 'Atl√©tico Madrid',
    'Queens ParkRangers': 'Queens Park Rangers',
    'Valencia': 'Valencia CF',
}
# Restrict the replacement to the Club column. A flat dict passed to
# DataFrame.replace is applied to every column, which would also rewrite
# Location values equal to a key (e.g. the city "Barcelona" -> "FC Barcelona").
consolidated_df = consolidated_df.replace({'Club': Value_to_tweak})

In [9]:
# Final table: an independent copy with just the Club and Location columns
# (any other columns of consolidated_df are left out)
final_df = consolidated_df.loc[:, ['Club', 'Location']].copy()
final_df.head()

Unnamed: 0,Club,Location
0,Arsenal,London
1,Aston Villa,Birmingham
2,Barnsley,Barnsley
3,Birmingham City,Birmingham
4,Blackburn Rovers,Blackburn


In [10]:
# Export to csv
# Create the target folder first so the export also works when "output/" does
# not exist yet (to_csv does not create missing directories)
os.makedirs("output", exist_ok=True)
# Written with a header row and without the index column
final_df.to_csv("output/club_city.csv", index=False, header=True)