In [2]:
import psycopg2

In [3]:
def create_connection(db_name, db_user, db_password, db_host, db_port):
    connection = None
    try:
        connection = psycopg2.connect(
            database=db_name,
            user=db_user,
            password=db_password,
            host=db_host,
            port=db_port,
        )
        print("Connection to PostgreSQL DB successful")
    except OperationalError as e:
        print(f"The error '{e}' occurred")
    return connection

In [4]:
db_name = "Project1"
db_user = "postgres"
db_password = "postgres"  
db_host = "localhost"  
db_port = "5432"

In [5]:
connection = create_connection(db_name, db_user, db_password, db_host, db_port)

Connection to PostgreSQL DB successful


In [6]:
cursor = connection.cursor()

In [7]:
import pandas as pd

In [8]:
from sqlalchemy import create_engine

In [9]:
db_server = "dw_2024"

In [10]:
connection_string = f"postgresql+psycopg2://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
engine = create_engine(connection_string)

In [11]:
print(engine)

Engine(postgresql+psycopg2://postgres:***@localhost:5432/Project1)


In [11]:
from pathlib import Path
script_path = Path.cwd().parent

In [12]:
data_path = script_path / "data" / "Project1"
countrycsv = data_path / "list-of-countries_areas-by-continent-2024.csv"

In [57]:
countrydf = pd.read_csv("./list-of-countries_areas-by-continent-2024.csv", header=None)

In [58]:
countrydf.columns = ["country", "continent"]

In [59]:
countrydf

Unnamed: 0,country,continent
0,country,region
1,India,Asia
2,China,Asia
3,United States,North America
4,Indonesia,Asia
...,...,...
230,Montserrat,North America
231,Falkland Islands,South America
232,Niue,Oceania
233,Tokelau,Oceania


In [16]:
cursor.execute("CREATE TABLE olympicHosts (hostID VARCHAR(255) PRIMARY KEY, endDate TIMESTAMP WITH TIME ZONE, startDate TIMESTAMP WITH TIME ZONE, location VARCHAR(255), name VARCHAR(255), season VARCHAR(255), year INT);")

In [17]:
connection.commit()

In [25]:
olympic_hosts = pd.read_csv("./olympic_hosts.csv", header=0)

In [30]:
olympic_hosts['game_end_date'] = pd.to_datetime(olympic_hosts['game_end_date'], format='%Y-%m-%dT%H:%M:%SZ')

In [32]:
olympic_hosts['game_start_date'] = pd.to_datetime(olympic_hosts['game_start_date'], format='%Y-%m-%dT%H:%M:%SZ')

In [38]:
olympic_hosts['game_slug'] = olympic_hosts['game_slug'].astype(str)

In [42]:
olympic_hosts['game_year'] = pd.to_numeric(olympic_hosts['game_year'])

In [43]:
print(olympic_hosts.dtypes)

game_slug                  object
game_end_date      datetime64[ns]
game_start_date    datetime64[ns]
game_location              object
game_name                  object
game_season                object
game_year                   int64
dtype: object


In [46]:
olympic_hosts

Unnamed: 0,hostID,endDate,startDate,location,name,season,year
0,beijing-2022,2022-02-20 12:00:00,2022-02-04 15:00:00,China,Beijing 2022,Winter,2022
1,tokyo-2020,2021-08-08 14:00:00,2021-07-23 11:00:00,Japan,Tokyo 2020,Summer,2020
2,pyeongchang-2018,2018-02-25 08:00:00,2018-02-08 23:00:00,Republic of Korea,PyeongChang 2018,Winter,2018
3,rio-2016,2016-08-21 21:00:00,2016-08-05 12:00:00,Brazil,Rio 2016,Summer,2016
4,sochi-2014,2014-02-23 16:00:00,2014-02-07 04:00:00,Russian Federation,Sochi 2014,Winter,2014
5,london-2012,2012-08-12 19:00:00,2012-07-27 07:00:00,Great Britain,London 2012,Summer,2012
6,vancouver-2010,2010-02-28 04:00:00,2010-02-12 16:00:00,Canada,Vancouver 2010,Winter,2010
7,beijing-2008,2008-08-24 12:00:00,2008-08-08 00:00:00,China,Beijing 2008,Summer,2008
8,turin-2006,2006-02-26 19:00:00,2006-02-10 07:00:00,Italy,Turin 2006,Winter,2006
9,athens-2004,2004-08-29 18:00:00,2004-08-13 06:00:00,Greece,Athens 2004,Summer,2004


In [48]:
olympic_hosts.columns = ["hostid", "enddate", "startdate","location","name","season","year"]

In [49]:
olympic_hosts.to_sql("olympichosts", con=engine, if_exists="append", index=False)

53

In [51]:
life_expectancy = pd.read_csv("./life-expectancy.csv", header=0)

In [59]:
life_expectancy

Unnamed: 0,Entity,Code,Year,Period life expectancy at birth - Sex: all - Age: 0
0,Afghanistan,AFG,1950,27.7275
1,Afghanistan,AFG,1951,27.9634
2,Afghanistan,AFG,1952,28.4456
3,Afghanistan,AFG,1953,28.9304
4,Afghanistan,AFG,1954,29.2258
...,...,...,...,...
20750,Zimbabwe,ZWE,2017,60.7095
20751,Zimbabwe,ZWE,2018,61.4141
20752,Zimbabwe,ZWE,2019,61.2925
20753,Zimbabwe,ZWE,2020,61.1242


In [57]:
life_expectancy_cleaned = life_expectancy.dropna()

In [58]:
life_expectancy_cleaned

Unnamed: 0,Entity,Code,Year,Period life expectancy at birth - Sex: all - Age: 0
0,Afghanistan,AFG,1950,27.7275
1,Afghanistan,AFG,1951,27.9634
2,Afghanistan,AFG,1952,28.4456
3,Afghanistan,AFG,1953,28.9304
4,Afghanistan,AFG,1954,29.2258
...,...,...,...,...
20750,Zimbabwe,ZWE,2017,60.7095
20751,Zimbabwe,ZWE,2018,61.4141
20752,Zimbabwe,ZWE,2019,61.2925
20753,Zimbabwe,ZWE,2020,61.1242


In [94]:
cursor.execute("CREATE TABLE lifeexpectancy (country VARCHAR(255), code VARCHAR(255), year INT, lifeexpectancy FLOAT,PRIMARY KEY (code, year));")

In [95]:
connection.commit()

In [96]:
life_expectancy_cleaned.columns= ["country","code","year","lifeexpectancy"]

In [91]:
life_expectancy_cleaned["lifeexpectancy"] = pd.to_numeric(life_expectancy_cleaned["lifeexpectancy"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  life_expectancy_cleaned["lifeexpectancy"] = pd.to_numeric(life_expectancy_cleaned["lifeexpectancy"])


In [97]:
life_expectancy_cleaned.dtypes

country            object
code               object
year                int64
lifeexpectancy    float64
dtype: object

In [98]:
life_expectancy_cleaned.to_sql("lifeexpectancy", con=engine, if_exists="append", index=False)

61

In [162]:
olympic_medals = pd.read_csv("./olympic_medals.csv", header=0)

In [157]:
olympic_medals

Unnamed: 0,discipline_title,slug_game,event_title,event_gender,medal_type,participant_type,participant_title,athlete_url,athlete_full_name,country_name,country_code,country_3_letter_code
0,Curling,beijing-2022,Mixed Doubles,Mixed,GOLD,GameTeam,Italy,https://olympics.com/en/athletes/stefania-cons...,Stefania CONSTANTINI,Italy,IT,ITA
1,Curling,beijing-2022,Mixed Doubles,Mixed,GOLD,GameTeam,Italy,https://olympics.com/en/athletes/amos-mosaner,Amos MOSANER,Italy,IT,ITA
2,Curling,beijing-2022,Mixed Doubles,Mixed,SILVER,GameTeam,Norway,https://olympics.com/en/athletes/kristin-skaslien,Kristin SKASLIEN,Norway,NO,NOR
3,Curling,beijing-2022,Mixed Doubles,Mixed,SILVER,GameTeam,Norway,https://olympics.com/en/athletes/magnus-nedreg...,Magnus NEDREGOTTEN,Norway,NO,NOR
4,Curling,beijing-2022,Mixed Doubles,Mixed,BRONZE,GameTeam,Sweden,https://olympics.com/en/athletes/almida-de-val,Almida DE VAL,Sweden,SE,SWE
...,...,...,...,...,...,...,...,...,...,...,...,...
21692,Weightlifting,athens-1896,heavyweight - one hand lift men,Men,SILVER,Athlete,,https://olympics.com/en/athletes/viggo-jensen,Viggo JENSEN,Denmark,DK,DEN
21693,Weightlifting,athens-1896,heavyweight - one hand lift men,Men,BRONZE,Athlete,,,Alexandros Nikolopoulos,Greece,GR,GRE
21694,Weightlifting,athens-1896,heavyweight - two hand lift men,Men,GOLD,Athlete,,https://olympics.com/en/athletes/viggo-jensen,Viggo JENSEN,Denmark,DK,DEN
21695,Weightlifting,athens-1896,heavyweight - two hand lift men,Men,SILVER,Athlete,,https://olympics.com/en/athletes/launceston-el...,Launceston ELLIOT,Great Britain,GB,GBR


In [106]:
olympic_medals.dtypes

discipline          object
hostid              object
event               object
gender              object
medal               object
participanttype     object
participanttitle    object
url                 object
name                object
country             object
code                object
code3               object
dtype: object

In [163]:
olympic_medals.columns = ["discipline","hostid","event","gender","medal","participanttype","participanttitle","url","name","country","code","code3"]

In [244]:
cursor.execute("CREATE TABLE olympicmedals (discipline VARCHAR(255),hostid VARCHAR(255),event VARCHAR(255),gender VARCHAR(255),medal VARCHAR(255),participanttype VARCHAR(255),participanttitle VARCHAR(255),url VARCHAR(255),name VARCHAR(255),country VARCHAR(255),code VARCHAR(255),code3 VARCHAR(255));")

In [245]:
connection.commit()

In [248]:
olympic_medals.to_sql("olympicmedals", con=engine, if_exists="append", index=False)

697

In [247]:
connection.rollback()

In [77]:
mental_illness = pd.read_csv("./mental-illness.csv",header=0)

In [119]:
mental_illness

Unnamed: 0,country,code,year,depression,schizophrenia,bipolar,eatingdisorder,anxiety
0,Afghanistan,AFG,1990,895.22565,138.248250,147.64412,26.471115,440.33000
1,Afghanistan,AFG,1991,893.88434,137.761220,147.56696,25.548681,439.47202
2,Afghanistan,AFG,1992,892.34973,137.080300,147.13086,24.637949,437.60718
3,Afghanistan,AFG,1993,891.51587,136.486020,146.78812,23.863169,436.69104
4,Afghanistan,AFG,1994,891.39160,136.183230,146.58481,23.189074,436.76800
...,...,...,...,...,...,...,...,...
6835,Zimbabwe,ZWE,2015,546.46204,127.108720,115.32073,20.423056,302.20868
6836,Zimbabwe,ZWE,2016,547.27765,127.142105,114.98700,20.647228,302.68216
6837,Zimbabwe,ZWE,2017,547.62270,127.465050,115.32798,20.791725,302.88626
6838,Zimbabwe,ZWE,2018,546.57184,127.681210,115.42796,20.916480,301.58250


In [94]:
mental_illness.columns = ["country","code","year","depression","schizophrenia","bipolar","eatingdisorder","anxiety"]

In [62]:
mental_illness.dtypes

country            object
code               object
year                int64
depression        float64
schizophrenia     float64
bipolar           float64
eatingdisorder    float64
anxiety           float64
dtype: object

In [123]:
cursor.execute("CREATE TABLE mentalillness (country VARCHAR(255),code VARCHAR(255),year INT,depression FLOAT,schizophrenia FLOAT,bipolar FLOAT,eatingdisorder FLOAT,anxiety FLOAT,PRIMARY KEY (code, year));")

In [127]:
mental_illness=mental_illness.dropna()

In [124]:
connection.commit()

In [128]:
mental_illness.to_sql("mentalillness", con=engine, if_exists="append", index=False)

150

In [130]:
economic_data = pd.read_csv("./Economic data.csv",header=0)

In [161]:
economic_data

Unnamed: 0,year,yearCode,country,code,poverty,gdpCap,gdpGrowth,intSrv,mort,hlthExp,govHlthCap,prvHlthCap,extHlthCap
0,2020,YR2020,Afghanistan,AFG,,516.8667974,-5.364665931,34.94796166,44.8,15.53361392,6.1311352,61.20481468,12.95210116
1,2020,YR2020,Albania,ALB,0,5343.037704,-2.745238678,884.8250911,8.4,..,154.8976524,..,..
2,2020,YR2020,Algeria,DZA,,3354.157303,-6.729941651,48.46764679,19.6,6.32117987,134.4669044,80.29914715,0.08572689
3,2020,YR2020,Andorra,AND,,37207.222,-12.73507756,9665.379665,2.7,9.05175877,2441.683051,895.234785,..
4,2020,YR2020,Angola,AGO,,1502.950754,-8.672432129,19.7436402,48.7,2.91183472,21.34275969,27.28717169,2.11269093
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,2020,YR2020,Viet Nam,VNM,0.7,3586.347297,1.937355342,3127.957716,16.6,4.68066788,74.99471991,89.8672161,1.36990087
196,2020,YR2020,Virgin Islands (U.S.),VIR,,39552.1686,-1.798090794,649.1673723,..,..,..,..,..
197,2020,YR2020,"Yemen, Rep.",YEM,,583.8756629,..,5.234783769,45.8,..,..,..,..
198,2020,YR2020,Zambia,ZMB,,956.8317475,-5.59573306,39.36027143,41.1,5.61788416,23.36391858,7.74727054,22.64247137


In [176]:
economic_data.columns = ["year", "yearcode", "country", "code", "poverty", "gdpcap", "gdpgrowth", "intsrv","mort", "hlthexp", "govhlthcap", "prvhlthcap", "exthlthcap"]

In [153]:
null_values_summary = economic_data.isnull().sum()
null_values_summary

year          0
yearCode      0
country       0
code          0
poverty       0
gdpCap        0
gdpGrowth     0
intSrv        0
mort          0
hlthExp       0
govHlthCap    0
prvHlthCap    0
extHlthCap    0
dtype: int64

In [157]:
economic_data["poverty"] = pd.to_numeric(economic_data["poverty"])

ValueError: Unable to parse string ".." at position 0

In [44]:
import numpy as np

In [171]:
economic_data['extHlthCap'] = economic_data['extHlthCap'].replace('..', np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  economic_data['extHlthCap'] = economic_data['extHlthCap'].replace('..', np.nan)


In [174]:
economic_data["poverty"]= pd.to_numeric(economic_data["poverty"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  economic_data["poverty"]= pd.to_numeric(economic_data["poverty"])


In [177]:
economic_data

Unnamed: 0,year,yearcode,country,code,poverty,gdpcap,gdpgrowth,intsrv,mort,hlthexp,govhlthcap,prvhlthcap,exthlthcap
0,2020,YR2020,Afghanistan,AFG,,516.8667974,-5.364665931,34.94796166,44.8,15.53361392,6.1311352,61.20481468,12.95210116
1,2020,YR2020,Albania,ALB,0.0,5343.037704,-2.745238678,884.8250911,8.4,,154.8976524,,
2,2020,YR2020,Algeria,DZA,,3354.157303,-6.729941651,48.46764679,19.6,6.32117987,134.4669044,80.29914715,0.08572689
3,2020,YR2020,Andorra,AND,,37207.222,-12.73507756,9665.379665,2.7,9.05175877,2441.683051,895.234785,
4,2020,YR2020,Angola,AGO,,1502.950754,-8.672432129,19.7436402,48.7,2.91183472,21.34275969,27.28717169,2.11269093
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,2020,YR2020,Viet Nam,VNM,0.7,3586.347297,1.937355342,3127.957716,16.6,4.68066788,74.99471991,89.8672161,1.36990087
196,2020,YR2020,Virgin Islands (U.S.),VIR,,39552.1686,-1.798090794,649.1673723,,,,,
197,2020,YR2020,"Yemen, Rep.",YEM,,583.8756629,,5.234783769,45.8,,,,
198,2020,YR2020,Zambia,ZMB,,956.8317475,-5.59573306,39.36027143,41.1,5.61788416,23.36391858,7.74727054,22.64247137


In [182]:
cursor.execute("CREATE TABLE economicdata (year INT,yearcode VARCHAR(255),country VARCHAR(255),code VARCHAR(255),poverty FLOAT,gdpcap FLOAT,gdpgrowth FLOAT,intsrv FLOAT,mort FLOAT,hlthexp FLOAT,govhlthcap FLOAT,prvhlthcap FLOAT,exthlthCap FLOAT, PRIMARY KEY(code,year));")

In [185]:
connection.rollback()

In [183]:
connection.commit()

In [186]:
economic_data.to_sql("economicdata", con=engine, if_exists="append", index=False)

200

In [191]:
global_population = pd.read_csv("./Global Population.csv",header=0,encoding='ISO-8859-1')

In [190]:
import chardet

# Detect encoding of the CSV file
with open('./Global Population.csv', 'rb') as f:
    result = chardet.detect(f.read())
    encoding = result['encoding']

print(f"Detected encoding: {encoding}")

Detected encoding: ISO-8859-1


In [197]:
global_population = global_population[:-1]

Unnamed: 0,Population (Millions of people),1980,1981,1982,1983,1984,1985,1986,1987,1988,...,2019,2020,2021,2022,2023,2024,2025,2026,2027,2028
1,Afghanistan,no data,no data,no data,no data,no data,no data,no data,no data,no data,...,32.2,32.941,33.698,34.263,no data,no data,no data,no data,no data,no data
2,Albania,2.672,2.726,2.784,2.844,2.904,2.965,3.023,3.084,3.142,...,2.881,2.878,2.873,2.866,2.858,2.85,2.84,2.831,2.821,2.81
3,Algeria,18.666,19.246,19.864,20.516,21.175,22.2,22.8,23.4,24.1,...,43.424,43.851,44.577,45.291,45.973,46.626,47.251,47.851,48.428,48.983
4,Andorra,no data,no data,no data,no data,no data,no data,no data,no data,no data,...,0.078,0.078,0.08,0.082,0.084,0.086,0.088,0.09,0.093,0.095
5,Angola,8.272,8.495,8.72,8.948,9.185,10.35,10.646,10.918,11.214,...,32.354,33.428,34.504,35.711,36.783,37.886,39.023,40.194,41.399,42.641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,Major advanced economies (G7),612.155,616.177,619.745,623.047,626.158,629.495,633.018,636.492,640.455,...,767.111,770.236,771.188,773.813,776.204,778.064,779.836,781.511,783.115,784.666
225,Middle East and Central Asia,254.673,262.85,271.095,279.549,288.557,297.65,306.682,314.912,323.543,...,822.958,838.452,852.037,867.872,848.6,863.738,879.116,894.74,910.628,926.964
226,Other advanced economies,112.56,114.089,115.636,117.012,118.267,119.403,120.51,121.721,123.038,...,173.316,173.729,173.571,174.874,175.992,176.883,177.651,178.412,179.175,179.94
227,Sub-Saharan Africa,342.745,352.398,362.565,373.099,384.021,395.902,407.207,418.858,430.534,...,1026.814,1054.129,1081.266,1108.902,1137.025,1165.892,1195.365,1225.766,1257.372,1289.432


In [205]:
global_population_melted = pd.melt(global_population, id_vars=["Population (Millions of people)"], var_name="Year", value_name="Population")

In [206]:
global_population_melted

Unnamed: 0,Population (Millions of people),Year,Population
0,Afghanistan,1980,no data
1,Albania,1980,2.672
2,Algeria,1980,18.666
3,Andorra,1980,no data
4,Angola,1980,8.272
...,...,...,...
11167,Major advanced economies (G7),2028,784.666
11168,Middle East and Central Asia,2028,926.964
11169,Other advanced economies,2028,179.94
11170,Sub-Saharan Africa,2028,1289.432


In [212]:
global_population_melted.dtypes

Population (Millions of people)    object
Year                               object
Population                         object
dtype: object

In [215]:
global_population_melted['Population'] = global_population_melted['Population'].replace('no data', np.nan)

In [216]:
global_population_melted

Unnamed: 0,Population (Millions of people),Year,Population
0,Afghanistan,1980,
1,Albania,1980,2.672
2,Algeria,1980,18.666
3,Andorra,1980,
4,Angola,1980,8.272
...,...,...,...
11167,Major advanced economies (G7),2028,784.666
11168,Middle East and Central Asia,2028,926.964
11169,Other advanced economies,2028,179.94
11170,Sub-Saharan Africa,2028,1289.432


In [217]:
cursor.execute("CREATE TABLE population (country VARCHAR(255), year INT, population FLOAT,PRIMARY KEY (country, year));")

In [218]:
connection.commit()

In [219]:
global_population_melted.columns = ["country","year","population"]

In [220]:
global_population_melted.to_sql("population", con=engine, if_exists="append", index=False)

172

In [251]:
cursor.execute("ALTER TABLE olympicmedals DROP COLUMN discipline, DROP COLUMN hostid;")

In [252]:
connection.commit()

In [253]:
cursor.execute("ALTER TABLE olympicmedals DROP COLUMN event, DROP COLUMN gender, DROP COLUMN participanttype;")

In [254]:
connection.commit()

In [255]:
cursor.execute("ALTER TABLE olympicmedals DROP COLUMN participanttitle, DROP COLUMN url, DROP COLUMN name;")

In [256]:
connection.commit()

In [249]:
cursor.execute("ALTER TABLE olympicmedals ADD COLUMN id SERIAL;")

In [250]:
connection.commit()

In [257]:
cursor.execute("ALTER TABLE olympicmedals ADD PRIMARY KEY (id);")

In [258]:
connection.commit()

In [259]:
cursor.execute("ALTER TABLE countries ALTER country SET NOT NULL")

In [262]:
connection.commit()

In [261]:
cursor.execute("ALTER TABLE countries ALTER continent SET NOT NULL")

In [12]:
cursor.execute("ALTER TABLE economicdata ALTER year SET NOT NULL")

In [13]:
connection.commit()

In [14]:
cursor.execute("ALTER TABLE economicdata ALTER yearcode SET NOT NULL")

In [15]:
cursor.execute("ALTER TABLE economicdata ALTER country SET NOT NULL")

In [16]:
cursor.execute("ALTER TABLE economicdata ALTER code SET NOT NULL")

In [17]:
connection.commit()

In [37]:
cursor.execute("ROLLBACK;")

In [21]:
cursor.execute("ALTER TABLE lifeexpectancy ALTER country SET NOT NULL,ALTER code SET NOT NULL,ALTER year SET NOT NULL,ALTER lifeexpectancy SET NOT NULL;")

In [43]:
connection.commit()

In [23]:
cursor.execute("ALTER TABLE mentalillness ALTER country SET NOT NULL,ALTER code SET NOT NULL,ALTER year SET NOT NULL;")

In [25]:
cursor.execute("ALTER TABLE olympicmedals ALTER medal SET NOT NULL,ALTER country SET NOT NULL,ALTER code3 SET NOT NULL;")

In [27]:
cursor.execute("ALTER TABLE olympicmedals DROP COLUMN code;")

In [31]:
cursor.execute("ALTER TABLE olympicmedals RENAME COLUMN code3 TO code;")

In [35]:
cursor.execute("ALTER TABLE olympichosts DROP CONSTRAINT olympichosts_pkey;")

In [38]:
cursor.execute("ALTER TABLE olympichosts ADD CONSTRAINT olympichosts_pkey PRIMARY KEY (hostid);")

In [40]:
cursor.execute("ALTER TABLE olympichosts ALTER location SET NOT NULL,ALTER year SET NOT NULL,ALTER season SET NOT NULL;")

In [42]:
cursor.execute("ALTER TABLE olympichosts DROP COLUMN name;")

In [45]:
cursor.execute("CREATE TABLE factolympicmental ( year INT, code VARCHAR(255),country VARCHAR(255), anxietystat FLOAT, depressionstat FLOAT,medals INT);")

In [46]:
connection.commit()

In [84]:
factolympicdata = []
factolympiccolumns =  ["year","code","country","medalid"]

In [124]:
factdf = pd.DataFrame(factolympicdata,columns=factolympiccolumns)

In [126]:
factdf

Unnamed: 0,year,code,country,medalid


In [130]:
medals_aggregated = medals.groupby(['country']).size().reset_index(name='TotalMedals')

In [131]:
medals_aggregated

Unnamed: 0,country,TotalMedals
0,Afghanistan,2
1,Algeria,17
2,Argentina,87
3,Armenia,18
4,Australasia,12
...,...,...
149,"Virgin Islands, US",1
150,West Indies Federation,2
151,Yugoslavia,94
152,Zambia,2


In [136]:
medal_types = medals.groupby(['medal','country']).size().reset_index(name='TotalMedals')

In [141]:
medal_types

Unnamed: 0,medal,country,TotalMedals
0,BRONZE,Afghanistan,2
1,BRONZE,Algeria,8
2,BRONZE,Argentina,37
3,BRONZE,Armenia,8
4,BRONZE,Australasia,5
...,...,...,...
376,SILVER,Vietnam,3
377,SILVER,"Virgin Islands, US",1
378,SILVER,Yugoslavia,35
379,SILVER,Zambia,1


In [142]:
olympic_medals

Unnamed: 0,discipline,hostid,event,gender,medal,participanttype,participanttitle,url,name,country,code,code3
0,Curling,beijing-2022,Mixed Doubles,Mixed,GOLD,GameTeam,Italy,https://olympics.com/en/athletes/stefania-cons...,Stefania CONSTANTINI,Italy,IT,ITA
1,Curling,beijing-2022,Mixed Doubles,Mixed,GOLD,GameTeam,Italy,https://olympics.com/en/athletes/amos-mosaner,Amos MOSANER,Italy,IT,ITA
2,Curling,beijing-2022,Mixed Doubles,Mixed,SILVER,GameTeam,Norway,https://olympics.com/en/athletes/kristin-skaslien,Kristin SKASLIEN,Norway,NO,NOR
3,Curling,beijing-2022,Mixed Doubles,Mixed,SILVER,GameTeam,Norway,https://olympics.com/en/athletes/magnus-nedreg...,Magnus NEDREGOTTEN,Norway,NO,NOR
4,Curling,beijing-2022,Mixed Doubles,Mixed,BRONZE,GameTeam,Sweden,https://olympics.com/en/athletes/almida-de-val,Almida DE VAL,Sweden,SE,SWE
...,...,...,...,...,...,...,...,...,...,...,...,...
21692,Weightlifting,athens-1896,heavyweight - one hand lift men,Men,SILVER,Athlete,,https://olympics.com/en/athletes/viggo-jensen,Viggo JENSEN,Denmark,DK,DEN
21693,Weightlifting,athens-1896,heavyweight - one hand lift men,Men,BRONZE,Athlete,,,Alexandros Nikolopoulos,Greece,GR,GRE
21694,Weightlifting,athens-1896,heavyweight - two hand lift men,Men,GOLD,Athlete,,https://olympics.com/en/athletes/viggo-jensen,Viggo JENSEN,Denmark,DK,DEN
21695,Weightlifting,athens-1896,heavyweight - two hand lift men,Men,SILVER,Athlete,,https://olympics.com/en/athletes/launceston-el...,Launceston ELLIOT,Great Britain,GB,GBR


In [143]:
import re

In [144]:
 year_pattern = r'\b(18|19|20)\d{2}\b'

In [161]:
def extract_year(value):
    year_pattern = r'\b(18|19|20)\d{2}\b'
    match = re.search(year_pattern, value)
    if match:
        return int(match.group(0))  # Convert the matched year to an integer
    else:
        return None 

In [164]:
olympic_medals['hostid'] = olympic_medals['hostid'].apply(extract_year)

In [165]:
medalsolympic_medals

Unnamed: 0,discipline,hostid,event,gender,medal,participanttype,participanttitle,url,name,country,code,code3
0,Curling,2022,Mixed Doubles,Mixed,GOLD,GameTeam,Italy,https://olympics.com/en/athletes/stefania-cons...,Stefania CONSTANTINI,Italy,IT,ITA
1,Curling,2022,Mixed Doubles,Mixed,GOLD,GameTeam,Italy,https://olympics.com/en/athletes/amos-mosaner,Amos MOSANER,Italy,IT,ITA
2,Curling,2022,Mixed Doubles,Mixed,SILVER,GameTeam,Norway,https://olympics.com/en/athletes/kristin-skaslien,Kristin SKASLIEN,Norway,NO,NOR
3,Curling,2022,Mixed Doubles,Mixed,SILVER,GameTeam,Norway,https://olympics.com/en/athletes/magnus-nedreg...,Magnus NEDREGOTTEN,Norway,NO,NOR
4,Curling,2022,Mixed Doubles,Mixed,BRONZE,GameTeam,Sweden,https://olympics.com/en/athletes/almida-de-val,Almida DE VAL,Sweden,SE,SWE
...,...,...,...,...,...,...,...,...,...,...,...,...
21692,Weightlifting,1896,heavyweight - one hand lift men,Men,SILVER,Athlete,,https://olympics.com/en/athletes/viggo-jensen,Viggo JENSEN,Denmark,DK,DEN
21693,Weightlifting,1896,heavyweight - one hand lift men,Men,BRONZE,Athlete,,,Alexandros Nikolopoulos,Greece,GR,GRE
21694,Weightlifting,1896,heavyweight - two hand lift men,Men,GOLD,Athlete,,https://olympics.com/en/athletes/viggo-jensen,Viggo JENSEN,Denmark,DK,DEN
21695,Weightlifting,1896,heavyweight - two hand lift men,Men,SILVER,Athlete,,https://olympics.com/en/athletes/launceston-el...,Launceston ELLIOT,Great Britain,GB,GBR


In [167]:
medals.insert(0,"Year",olympic_medals["hostid"],True)

In [195]:
mental_illness

Unnamed: 0,country,code,year,depression,schizophrenia,bipolar,eatingdisorder,anxiety
0,Afghanistan,AFG,1990,895.22565,138.248250,147.64412,26.471115,440.33000
1,Afghanistan,AFG,1991,893.88434,137.761220,147.56696,25.548681,439.47202
2,Afghanistan,AFG,1992,892.34973,137.080300,147.13086,24.637949,437.60718
3,Afghanistan,AFG,1993,891.51587,136.486020,146.78812,23.863169,436.69104
4,Afghanistan,AFG,1994,891.39160,136.183230,146.58481,23.189074,436.76800
...,...,...,...,...,...,...,...,...
6835,Zimbabwe,ZWE,2015,546.46204,127.108720,115.32073,20.423056,302.20868
6836,Zimbabwe,ZWE,2016,547.27765,127.142105,114.98700,20.647228,302.68216
6837,Zimbabwe,ZWE,2017,547.62270,127.465050,115.32798,20.791725,302.88626
6838,Zimbabwe,ZWE,2018,546.57184,127.681210,115.42796,20.916480,301.58250


In [194]:
medals

Unnamed: 0,year,medal,country,code,id
0,2022,GOLD,Italy,ITA,1
1,2022,GOLD,Italy,ITA,2
2,2022,SILVER,Norway,NOR,3
3,2022,SILVER,Norway,NOR,4
4,2022,BRONZE,Sweden,SWE,5
...,...,...,...,...,...
21692,1896,SILVER,Denmark,DEN,21693
21693,1896,BRONZE,Greece,GRE,21694
21694,1896,GOLD,Denmark,DEN,21695
21695,1896,SILVER,Great Britain,GBR,21696


In [196]:
countrydf

Unnamed: 0,country,continent
0,country,region
1,India,Asia
2,China,Asia
3,United States,North America
4,Indonesia,Asia
...,...,...
230,Montserrat,North America
231,Falkland Islands,South America
232,Niue,Oceania
233,Tokelau,Oceania


In [172]:
cursor.execute("ROLLBACK")

In [181]:
cursor.execute("CREATE TABLE medalsnew ( year int, medal VARCHAR(255), country VARCHAR(255),code VARCHAR (255), id INT PRIMARY KEY);")

In [182]:
connection.commit()

In [190]:
medals.to_sql("medalsnew", con=engine, if_exists="append", index=False)

697

In [191]:
facttabletest = pd.DataFrame(factolympicdata,columns=factolympiccolumns)

In [192]:
facttabletest

Unnamed: 0,year,code,country,medalid


In [None]:
facttabletest

In [214]:
test = pd.merge(medals, mental_illness, on=['year', 'code','country'], how='inner')

In [215]:
test

Unnamed: 0,year,medal,country,code,id,depression,schizophrenia,bipolar,eatingdisorder,anxiety
0,2018,GOLD,Norway,NOR,1544,560.33820,180.55357,178.11180,100.930960,644.23810
1,2018,GOLD,Norway,NOR,1545,560.33820,180.55357,178.11180,100.930960,644.23810
2,2018,BRONZE,Norway,NOR,1558,560.33820,180.55357,178.11180,100.930960,644.23810
3,2018,GOLD,Norway,NOR,1562,560.33820,180.55357,178.11180,100.930960,644.23810
4,2018,GOLD,Norway,NOR,1568,560.33820,180.55357,178.11180,100.930960,644.23810
...,...,...,...,...,...,...,...,...,...,...
4215,1992,BRONZE,Suriname,SUR,10296,829.23750,173.13023,193.65190,40.801826,390.63160
4216,1992,BRONZE,Argentina,ARG,10380,458.25125,199.43263,211.82477,61.260650,485.97446
4217,1992,BRONZE,Argentina,ARG,10381,458.25125,199.43263,211.82477,61.260650,485.97446
4218,1992,SILVER,Luxembourg,LUX,10518,664.50543,177.03981,192.51715,97.331680,508.95790


In [216]:
test1 = pd.merge(test,countrydf,on=['country'],how="inner")

In [217]:
test1

Unnamed: 0,year,medal,country,code,id,depression,schizophrenia,bipolar,eatingdisorder,anxiety,continent
0,2018,GOLD,Norway,NOR,1544,560.33820,180.55357,178.11180,100.930960,644.2381,Europe
1,2018,GOLD,Norway,NOR,1545,560.33820,180.55357,178.11180,100.930960,644.2381,Europe
2,2018,BRONZE,Norway,NOR,1558,560.33820,180.55357,178.11180,100.930960,644.2381,Europe
3,2018,GOLD,Norway,NOR,1562,560.33820,180.55357,178.11180,100.930960,644.2381,Europe
4,2018,GOLD,Norway,NOR,1568,560.33820,180.55357,178.11180,100.930960,644.2381,Europe
...,...,...,...,...,...,...,...,...,...,...,...
4215,1992,BRONZE,Ghana,GHA,10154,751.65350,135.00745,117.25983,19.932873,283.0296,Africa
4216,1992,SILVER,Peru,PER,10177,396.24810,175.47664,199.75580,52.334090,521.2572,South America
4217,1992,BRONZE,Suriname,SUR,10296,829.23750,173.13023,193.65190,40.801826,390.6316,South America
4218,1992,SILVER,Luxembourg,LUX,10518,664.50543,177.03981,192.51715,97.331680,508.9579,Europe


In [218]:
cursor.execute("CREATE TABLE facttest (year INT,	medal VARCHAR,	country VARCHAR,	code VARCHAR,	id INT PRIMARY KEY	,depression FLOAT,	schizophrenia FLOAT,	bipolar FLOAT,	eatingdisorder FLOAT,	anxiety FLOAT,	continent VARCHAR);")

In [219]:
connection.commit()

In [220]:
test1.to_sql("facttest", con=engine, if_exists="append", index=False)

220