In [1]:
import psycopg2 #import the PostgreSQL adapter
import pandas as pd #Used for data analysis and manipilation
from sqlalchemy import create_engine #function to vreat a database engine
import numpy as np #For array manipulation
from psycopg2 import OperationalError
import re #python string library

In [2]:
def create_connection(db_name, db_user, db_password, db_host, db_port): #Python function to create a connection to PostgreSQL server. (Reused from Data Warehousing Lab3)
    connection = None
    try:
        connection = psycopg2.connect(
            database=db_name,
            user=db_user,
            password=db_password,
            host=db_host,
            port=db_port,
        )
        print("Connection to PostgreSQL DB successful")
    except OperationalError as e:
        print(f"The error '{e}' occurred")
    return connection


In [3]:
db_name = "Olympic"  #database parameters
db_user = "postgres"
db_password = "postgres"  
db_host = "localhost"  
db_port = "5432"
db_server = "dw_2024"

In [4]:
connection = create_connection(db_name, db_user, db_password, db_host, db_port)  #Create connection to the PostgreSQL server function

Connection to PostgreSQL DB successful


In [5]:
cursor = connection.cursor() #Cursor allow python code to execute PostgreSQL command in a database session

In [6]:
connection_string = f"postgresql+psycopg2://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}" #connection string to the database
engine = create_engine(connection_string) #Create engine for the database

In [7]:
countries_df = pd.read_csv("./list-of-countries_areas-by-continent-2024.csv", header=None)

In [8]:
countries_df.columns = ["country", "continent"]

In [9]:
hosts_df = pd.read_csv("./olympic_hosts.csv", header=0)

In [10]:
hosts_df.columns = ["hostid", "enddate", "startdate","location","name","season","year"]

In [11]:
lifeexp_df = pd.read_csv("./life-expectancy.csv", header=0)

In [12]:
lifeexp_df.columns= ["country","code","year","lifeexpectancy"]

In [98]:
medals_df = pd.read_csv("./olympic_medals.csv", header=0)

In [99]:
medals_df.columns = ["discipline","year","event","gender","medal","participanttype","participanttitle","url","name","country","code","code3"]

In [803]:
mental_df = pd.read_csv("./mental-illness.csv",header=0)

In [804]:
mental_df.columns = ["country","code","year","depression","schizophrenia","bipolar","eatingdisorder","anxiety"]

In [17]:
economic_df = pd.read_csv("./Economic data.csv",header=0)

In [18]:
economic_df.columns = ["year", "yearcode", "country", "code", "poverty", "gdpcap", "gdpgrowth", "intsrv","mort", "hlthexp", "govhlthcap", "prvhlthcap", "exthlthcap"]

In [19]:
population_df = pd.read_csv("./Global Population.csv",header=0,encoding='ISO-8859-1')

In [20]:
#Data Cleaning

In [21]:
hosts_df['enddate'] = pd.to_datetime(hosts_df['enddate'], format='%Y-%m-%dT%H:%M:%SZ') #Convert the start times to datetime format
hosts_df['startdate'] = pd.to_datetime(hosts_df['startdate'], format='%Y-%m-%dT%H:%M:%SZ') #Convert the end times to datetime format

In [22]:
lifeexp_df = lifeexp_df.dropna() #lifeexpectancy has continents with na country codes which will not be required so they are dropped from the dataframe

In [23]:
medals_df=medals_df.dropna() #mental illness dataframe has rows with no counntry codes that are regions and continents which will be dropped from the dataframe

In [24]:
economic_df['poverty'] = economic_df['poverty'].replace('..', np.nan) #In each column in the economy data frame there are strings ".." for values with no data which will be replace with NaN 
economic_df['gdpcap'] = economic_df['gdpcap'].replace('..', np.nan)
economic_df['gdpgrowth'] = economic_df['gdpgrowth'].replace('..', np.nan)
economic_df['intsrv'] = economic_df['intsrv'].replace('..', np.nan)
economic_df['mort'] = economic_df['mort'].replace('..', np.nan)
economic_df['hlthexp'] = economic_df['hlthexp'].replace('..', np.nan)
economic_df['govhlthcap'] = economic_df['govhlthcap'].replace('..', np.nan)
economic_df['prvhlthcap'] = economic_df['prvhlthcap'].replace('..', np.nan)
economic_df['exthlthcap'] = economic_df['exthlthcap'].replace('..', np.nan)

In [25]:
economic_df=economic_df[0:-5] # Last five rows are invalid rows containing info about the data and spaces in the csv file

In [26]:
population_df = pd.melt(population_df, id_vars=["Population (Millions of people)"], var_name="Year", value_name="Population")

In [27]:
population_df['Population'] = population_df['Population'].replace('no data', np.nan)

In [28]:
population_df.columns = ["country","year","population"]

In [100]:
def extract_year(value):
    year_pattern = r'\b(18|19|20)\d{2}\b'
    match = re.search(year_pattern, value)
    if match:
        return int(match.group(0))  # Convert the matched year to an integer
    else:
        return None 
        
medals_df['year'] = medals_df['year'].apply(extract_year)  #To obtain the year from the title event.

In [101]:
medals_df = medals_df.drop(columns=["code"]) #To drop the coutnry code with only two characters as standard and other tables follow the three letter structure

In [102]:
medals_df = medals_df.rename(columns={"code3":"code"}) #Rename column to code

In [32]:
#Create SQL tables in the PostgreSQL database and upload the data from the dataframes to the tables

In [None]:
cursor.execute("CREATE TABLE countries (country VARCHAR(255) NOT NULL PRIMARY KEY, continent VARCHAR(255) NOT NULL);")

In [34]:
connection.commit()

In [None]:
countries_df.to_sql("countries", con=engine, if_exists="append", index=False)

In [42]:
cursor.execute("CREATE TABLE economic (year INT NOT NULL, yearcode VARCHAR(255) NOT NULL, country VARCHAR(255) NOT NULL, code VARCHAR(3) NOT NULL,poverty FLOAT,gdpcap FLOAT,gdpgrowth FLOAT,intsrv FLOAT, mort FLOAT, hlthexp FLOAT, govhlthcap FLOAT, prvhlthcap FLOAT, exthlthcap FLOAT);")

In [43]:
connection.commit()

In [44]:
economic_df.to_sql("economic", con=engine, if_exists="append", index=False)

200

In [54]:
cursor.execute("CREATE TABLE population (country VARCHAR(255), year INT NOT NULL, population FLOAT);")

In [55]:
connection.commit()

In [56]:
population_df.to_sql("population", con=engine, if_exists="append", index=False)

318

In [89]:
cursor.execute("ROLLBACK")

In [66]:
cursor.execute("CREATE TABLE lifeexp  (country VARCHAR(255) NOT NULL, code VARCHAR(255) NOT NULL, year INT NOT NULL, lifeexpectancy FLOAT);")

In [67]:
connection.commit()

In [68]:
lifeexp_df.to_sql("lifeexp", con=engine, if_exists="append", index=False)

61

In [76]:
cursor.execute("CREATE TABLE millness  (country VARCHAR(255) NOT NULL, code VARCHAR(255), year INT NOT NULL, depression FLOAT, schizophrenia FLOAT, bipolar FLOAT, eatingdisorder FLOAT, anxiety FLOAT);")

In [77]:
connection.commit()

In [78]:
mental_df.to_sql("millness", con=engine, if_exists="append", index=False)

840

In [81]:
cursor.execute("CREATE TABLE hosts  (hostid VARCHAR(255) NOT NULL PRIMARY KEY, enddate TIMESTAMP WITH TIME ZONE, startdate TIMESTAMP WITH TIME ZONE, location VARCHAR(255), name	 VARCHAR(255), season VARCHAR(255), year INT);")

In [82]:
connection.commit()

In [83]:
hosts_df.to_sql("hosts", con=engine, if_exists="append", index=False)

53

In [94]:
cursor.execute("CREATE TABLE medals (discipline VARCHAR(255), year INT NOT NULL, event VARCHAR(255), gender VARCHAR(255), medal VARCHAR(255), participanttype VARCHAR(255),participanttitle VARCHAR(255), url VARCHAR(255),name VARCHAR(255), country VARCHAR(255), code VARCHAR(255));")

In [96]:
connection.commit()

In [97]:
medals_df.to_sql("medals",con=engine,if_exists="append", index=False)

667

In [33]:
#Creating the facttable and dimension tables

In [34]:
#For the facttable my columns are factid,country,year,sum of bronze for that year, sum of silver for that year, sum of gold for that year, mental health statistics

In [805]:
medal_sums = medals_df.groupby(['year', 'country', 'medal']).size().unstack(fill_value=0).reset_index()

In [806]:
medal_sums

medal,year,country,BRONZE,GOLD,SILVER
0,1896,Australia,0,2,0
1,1896,Austria,2,2,1
2,1896,Denmark,3,1,2
3,1896,France,2,5,4
4,1896,Germany,2,6,5
...,...,...,...,...,...
1561,2022,Spain,0,0,1
1562,2022,Sweden,6,8,6
1563,2022,Switzerland,5,7,2
1564,2022,Ukraine,0,0,1


In [807]:
medal_sums["total"] = medal_sums["BRONZE"] +  medal_sums["GOLD"] +  medal_sums["SILVER"]

In [809]:
medal_sums.columns = ["year","country","bronze","gold","silver","total"]

In [810]:
years = hosts_df.drop_duplicates()

In [811]:
years = hosts_df["year"]

In [812]:
year = pd.DataFrame(data = hosts_df["year"],columns=["year"])

In [813]:
year = year.drop_duplicates()

In [814]:
year

Unnamed: 0,year
0,2022
1,2020
2,2018
3,2016
4,2014
5,2012
6,2010
7,2008
8,2006
9,2004


In [815]:
all_years_countries = pd.MultiIndex.from_product([year["year"], countries_df['country'].drop_duplicates()], names=['year', 'country']).to_frame(index=False)

In [816]:
all_years_countries

Unnamed: 0,year,country
0,2022,country
1,2022,India
2,2022,China
3,2022,United States
4,2022,Indonesia
...,...,...
8690,1896,Montserrat
8691,1896,Falkland Islands
8692,1896,Niue
8693,1896,Tokelau


In [817]:
medal_sums.dtypes

year        int64
country    object
bronze      int64
gold        int64
silver      int64
total       int64
dtype: object

In [829]:
medal_sums = medal_sums.fillna(0)

In [830]:
medal_sums["bronze"] = medal_sums["bronze"].astype(int)

In [831]:
medal_sums["gold"] = medal_sums["gold"].astype(int)

In [832]:
medal_sums["silver"] = medal_sums["silver"].astype(int)

In [833]:
medal_sums["total"] = medal_sums["total"].astype(int)

In [834]:
medal_sums

Unnamed: 0,year,country,bronze,gold,silver,total
0,2022,country,0,0,0,0
1,2022,India,0,0,0,0
2,2022,China,2,10,4,16
3,2022,United States,9,9,10,28
4,2022,Indonesia,0,0,0,0
...,...,...,...,...,...,...
8690,1896,Montserrat,0,0,0,0
8691,1896,Falkland Islands,0,0,0,0
8692,1896,Niue,0,0,0,0
8693,1896,Tokelau,0,0,0,0


In [818]:
australia_data = medal_sums[medal_sums['country'] == 'China'].to_dict('records')


In [819]:
olympic_countries = set(medals_df['country'].unique())
list_countries = set(countries_df['country'].unique())

In [820]:
olympic_countries - list_countries #Check countries that have never won 

{'Australasia',
 'Bohemia',
 'Chinese Taipei',
 'Czechoslovakia',
 "Côte d'Ivoire",
 "Democratic People's Republic of Korea",
 'Federal Republic of Germany',
 'German Democratic Republic (Germany)',
 'Great Britain',
 'Hong Kong, China',
 'Independent Olympic Athletes',
 'Islamic Republic of Iran',
 'Kosovo',
 'MIX',
 'Netherlands Antilles',
 'Olympic Athletes from Russia',
 "People's Republic of China",
 'ROC',
 'Republic of Korea',
 'Republic of Moldova',
 'Russian Federation',
 'Serbia and Montenegro',
 'Soviet Union',
 'Syrian Arab Republic',
 'Unified Team',
 'United Arab Republic',
 'United Republic of Tanzania',
 'United States of America',
 'Virgin Islands, US',
 'West Indies Federation',
 'Yugoslavia'}

In [821]:
name_mapping = {
    "Great Britain": "United Kingdom",
    "United States of America": "United States",
    "People's Republic of China": "China",
    "Republic of Korea": "South Korea",
    "Democratic People's Republic of Korea": "North Korea",
    "Russian Federation": "Russia",
    "Republic of Moldova": "Moldova",
    "Islamic Republic of Iran": "Iran",
    "Syrian Arab Republic": "Syria",
    "United Republic of Tanzania": "Tanzania",
    "Hong Kong, China": "Hong Kong",
    "Virgin Islands, US": "United States Virgin Islands",
    "Côte d'Ivoire": "Ivory Coast",
    # Special cases handled individually
    "Chinese Taipei": "Taiwan",  # Taiwan is often referred to as Chinese Taipei in international sports
    "Kosovo": "Kosovo",  # May not be in some lists due to political recognition issues
}

In [822]:
medal_sums['country'] = medal_sums['country'].map(name_mapping).fillna(medal_sums['country'])

In [823]:
entities_to_remove = [
    'ROC',
    'Olympic Athletes from Russia',
    'Independent Olympic Athletes',
    'Serbia and Montenegro',
    'Unified Team',
    'Czechoslovakia',
    'Federal Republic of Germany',
    'Soviet Union',
    'German Democratic Republic (Germany)',
    'Yugoslavia',
    'Netherlands Antilles',
    'West Indies Federation',
    'United Arab Republic',
    'Australasia',
    'Bohemia',
    'MIX'
]

In [824]:
medal_sums_filtered = medal_sums[~medal_sums['country'].isin(entities_to_remove)]

In [825]:
medal_sums_filtered

Unnamed: 0,year,country,bronze,gold,silver,total
0,1896,Australia,0,2,0,2
1,1896,Austria,2,2,1,5
2,1896,Denmark,3,1,2,6
3,1896,France,2,5,4,11
4,1896,Germany,2,6,5,13
...,...,...,...,...,...,...
1561,2022,Spain,0,0,1,1
1562,2022,Sweden,6,8,6,20
1563,2022,Switzerland,5,7,2,14
1564,2022,Ukraine,0,0,1,1


In [826]:
olympic_countries = set(medal_sums_filtered['country'].unique())
list_countries = set(countries_df['country'].unique())

In [827]:
medal_sums_filtered = pd.merge(all_years_countries, medal_sums_filtered, on=['year', 'country'], how='inner')

In [828]:
medal_sums = pd.merge(all_years_countries, medal_sums_filtered, on=['year', 'country'], how='left')

In [793]:
medal_sums[medal_sums['country'] == 'Congo'].to_dict('records')

[]

In [835]:
olympic_countries = set(medal_sums['country'].unique())
list_countries = set(mental_df['country'].unique())

In [836]:
list_countries - olympic_countries

{'African Region (WHO)',
 'Congo',
 "Cote d'Ivoire",
 'Czechia',
 'Democratic Republic of Congo',
 'East Asia & Pacific (WB)',
 'East Timor',
 'Eastern Mediterranean Region (WHO)',
 'England',
 'Europe & Central Asia (WB)',
 'European Region (WHO)',
 'G20',
 'High Income (WB)',
 'Latin America & Caribbean (WB)',
 'Low Income (WB)',
 'Lower Middle Income (WB)',
 'Micronesia (country)',
 'Middle East & North Africa (WB)',
 'Middle Income (WB)',
 'North America (WB)',
 'Northern Ireland',
 'OECD Countries',
 'Region of the Americas (WHO)',
 'Scotland',
 'South Asia (WB)',
 'South-East Asia Region (WHO)',
 'Sub-Saharan Africa (WB)',
 'Wales',
 'Western Pacific Region (WHO)',
 'World'}

In [837]:
facttable = pd.merge(medal_sums, mental_df.drop('code',axis=1 ), on=['year', 'country'], how='inner')

In [838]:
medal_sums

Unnamed: 0,year,country,bronze,gold,silver,total
0,2022,country,0,0,0,0
1,2022,India,0,0,0,0
2,2022,China,2,10,4,16
3,2022,United States,9,9,10,28
4,2022,Indonesia,0,0,0,0
...,...,...,...,...,...,...
8690,1896,Montserrat,0,0,0,0
8691,1896,Falkland Islands,0,0,0,0
8692,1896,Niue,0,0,0,0
8693,1896,Tokelau,0,0,0,0


In [839]:
facttable

Unnamed: 0,year,country,bronze,gold,silver,total,depression,schizophrenia,bipolar,eatingdisorder,anxiety
0,2018,India,0,0,0,0,620.72300,181.51364,75.227180,27.108828,284.22504
1,2018,China,2,1,7,10,419.28560,202.93486,39.638176,24.338549,310.26324
2,2018,United States,7,10,9,26,774.79320,270.07968,127.606200,89.955910,501.62817
3,2018,Indonesia,0,0,0,0,343.55807,188.28181,71.286290,23.671894,338.28217
4,2018,Pakistan,0,0,0,0,662.26276,177.39119,83.039320,25.683395,327.31042
...,...,...,...,...,...,...,...,...,...,...,...
2767,1992,Cook Islands,0,0,0,0,499.22140,199.85428,60.700190,27.229467,374.42496
2768,1992,Nauru,0,0,0,0,495.24045,191.54384,56.292202,26.558008,375.66248
2769,1992,Tuvalu,0,0,0,0,505.07240,173.32458,57.777637,15.737826,384.30576
2770,1992,Niue,0,0,0,0,500.77084,189.70824,59.198162,22.216448,378.40616


In [840]:
facttablex[facttablex['country'] == 'United Kingdom'].to_dict('records')

[{'year': 2018,
  'country': 'United Kingdom',
  'bronze': 4,
  'silver': 0,
  'gold': 1,
  'total': 5,
  'depression': 721.4107,
  'schizophrenia': 169.23366,
  'bipolar': 239.643,
  'eatingdisorder': 90.89389,
  'anxiety': 419.7872},
 {'year': 2018,
  'country': 'United Kingdom',
  'bronze': 4,
  'silver': 0,
  'gold': 1,
  'total': 5,
  'depression': 720.931,
  'schizophrenia': 167.65701,
  'bipolar': 233.98294,
  'eatingdisorder': 90.3386,
  'anxiety': 423.41278},
 {'year': 2016,
  'country': 'United Kingdom',
  'bronze': 19,
  'silver': 26,
  'gold': 30,
  'total': 75,
  'depression': 729.2112,
  'schizophrenia': 176.43358,
  'bipolar': 239.79697,
  'eatingdisorder': 89.719185,
  'anxiety': 420.7919},
 {'year': 2016,
  'country': 'United Kingdom',
  'bronze': 19,
  'silver': 26,
  'gold': 30,
  'total': 75,
  'depression': 728.00446,
  'schizophrenia': 173.417,
  'bipolar': 234.06209,
  'eatingdisorder': 89.164986,
  'anxiety': 424.54456},
 {'year': 2014,
  'country': 'United King

In [800]:
facttable

Unnamed: 0,year,country,bronze,gold,silver,total,depression,schizophrenia,bipolar,eatingdisorder,anxiety
0,2018,India,0,0,0,0,620.72300,181.51364,75.227180,27.108828,284.22504
1,2018,China,2,1,7,10,419.28560,202.93486,39.638176,24.338549,310.26324
2,2018,United States,7,10,9,26,774.79320,270.07968,127.606200,89.955910,501.62817
3,2018,Indonesia,0,0,0,0,343.55807,188.28181,71.286290,23.671894,338.28217
4,2018,Pakistan,0,0,0,0,662.26276,177.39119,83.039320,25.683395,327.31042
...,...,...,...,...,...,...,...,...,...,...,...
2767,1992,Cook Islands,0,0,0,0,499.22140,199.85428,60.700190,27.229467,374.42496
2768,1992,Nauru,0,0,0,0,495.24045,191.54384,56.292202,26.558008,375.66248
2769,1992,Tuvalu,0,0,0,0,505.07240,173.32458,57.777637,15.737826,384.30576
2770,1992,Niue,0,0,0,0,500.77084,189.70824,59.198162,22.216448,378.40616


In [801]:
cursor.execute("CREATE TABLE factolympic ( year INT, country VARCHAR(255), bronze INT, 

SyntaxError: unterminated string literal (detected at line 1) (1141776092.py, line 1)

In [841]:
facttablex = facttable.drop_duplicates()

In [842]:
facttablex

Unnamed: 0,year,country,bronze,gold,silver,total,depression,schizophrenia,bipolar,eatingdisorder,anxiety
0,2018,India,0,0,0,0,620.72300,181.51364,75.227180,27.108828,284.22504
1,2018,China,2,1,7,10,419.28560,202.93486,39.638176,24.338549,310.26324
2,2018,United States,7,10,9,26,774.79320,270.07968,127.606200,89.955910,501.62817
3,2018,Indonesia,0,0,0,0,343.55807,188.28181,71.286290,23.671894,338.28217
4,2018,Pakistan,0,0,0,0,662.26276,177.39119,83.039320,25.683395,327.31042
...,...,...,...,...,...,...,...,...,...,...,...
2767,1992,Cook Islands,0,0,0,0,499.22140,199.85428,60.700190,27.229467,374.42496
2768,1992,Nauru,0,0,0,0,495.24045,191.54384,56.292202,26.558008,375.66248
2769,1992,Tuvalu,0,0,0,0,505.07240,173.32458,57.777637,15.737826,384.30576
2770,1992,Niue,0,0,0,0,500.77084,189.70824,59.198162,22.216448,378.40616


In [522]:
cursor.execute("CREATE TABLE factolympic ( year INT, country VARCHAR(255), bronze INT,silver INT, gold INT,total INT,depression FLOAT, schizophrenia FLOAT, bipolar FLOAT, eatingdisorder FLOAT,anxiety FLOAT, PRIMARY KEY(year,country) );")

In [518]:
cursor.execute("ROLLBACK")

In [845]:
connection.commit()

In [843]:
facttablex.to_sql("factolympic", con=engine, if_exists="append", index=False)

772

In [844]:
cursor.execute("CREATE TABLE time (year INT PRIMARY KEY);")

In [847]:
year.to_sql("time",con=engine,if_exists="append",index=False)

37