## CHECKPOINT 3 - CREATE TABLES

In [1]:
import matplotlib
import pandas as pd
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import numpy as np
import psycopg2, os

### Justifications

We create the users table to hold basic information about each user: user_id, user_name, and user_performance_tier. User_id is an integer and serves as the primary key, so we use INTEGER as its datatype. User_name is a character string and some names are long, so we use VARCHAR(100). User_performance_tier is an integer, so we use INTEGER as its datatype.

We then create the achievement table to describe the achievement types; it contains achievement_id and achievement_type. Achievement_id is an integer and serves as the primary key, so we use INTEGER as its datatype. Achievement_type is a character string, so we use VARCHAR(20).
To satisfy the 3NF normalization rules, we create user_achievement as the bridge table between the users table and the achievement table; it contains only user_id and achievement_id. Both are integers, so we use INTEGER as their datatype. We specify (user_id, achievement_id) as the composite primary key so that each record is uniquely identified. In addition, user_id and achievement_id are foreign keys that reference the primary keys of the users table and the achievement table, respectively.

The competition table stores the attributes of each competition: competition_id, slug, title, subtitle, deadline_date, has_leaderboard, max_dailysub, max_teamsize, reward_type, reward_quantity, total_teams, and total_subs. Competition_id, max_dailysub, max_teamsize, total_teams, and total_subs are integers, so we use INTEGER as their datatype. Slug, title, subtitle, and reward_type are character strings, so we use VARCHAR as their datatype. Deadline_date is stored as DATE and has_leaderboard as BOOLEAN. Reward_quantity is a fixed-point number, so we use NUMERIC(10,2) as its datatype. Competition_id is the primary key, and we declare slug, title, has_leaderboard, max_dailysub, max_teamsize, total_teams, and total_subs as NOT NULL.

The team table holds basic information about each team: team_id, team_name, public_leaderboard_rank, and private_leaderboard_rank. Team_id, public_leaderboard_rank, and private_leaderboard_rank are integers, so we use INTEGER as their datatype. Team_name is a character string, so we use VARCHAR as its datatype. Team_id is the primary key.


The submission table holds the basic information about each submission made by a team: submission_id, team_id, submission_date, is_after_deadline, and public_leaderboard_score. Submission_id and team_id are integers, so we use INTEGER as their datatype. Submission_date is stored as DATE and is_after_deadline as BOOLEAN. Public_leaderboard_score is numeric, so we use FLOAT as its datatype. Submission_id is the primary key, and team_id is a foreign key that references the primary key of the team table.
We then create team_submission as the bridge table between the team table and the submission table; it contains only team_id and submission_id. Both are integers, so we use INTEGER as their datatype. We specify (team_id, submission_id) as the composite primary key so that each record is uniquely identified. In addition, team_id and submission_id are foreign keys that reference the primary keys of the team table and the submission table, respectively.

The algorithm table contains algorithm_id, algorithm_abbr, algorithm_name, and algorithm_descrip. Algorithm_id is an integer, so we use INTEGER as its datatype. Algorithm_abbr, algorithm_name, and algorithm_descrip are character strings, so we use VARCHAR as their datatype. The primary key is algorithm_id.

We then create competition_algorithm as the bridge table between the competition table and the algorithm table; it contains only competition_id and algorithm_id. Both are integers, so we use INTEGER as their datatype. We specify (competition_id, algorithm_id) as the composite primary key so that each record is uniquely identified. In addition, competition_id and algorithm_id are foreign keys that reference the primary keys of the competition table and the algorithm table, respectively.

We create the tag table to hold the information about competitor types, so that users can easily search for the competition field and topic that fit them best. The unique tag_id is an integer and serves as the primary key. The tag name and the full path of the tag are both VARCHAR NOT NULL, since each tag needs a name and a path that leads to it. We set the length of the tag description to 300 characters to leave room for a full description. The dataset count, kernel count, and competition count are all INTEGER NOT NULL, so that users can see how many competitions, kernels, and datasets belong to each tag and gauge its popularity.


Since we need a connection between each tag and its related competitions, we create the competition_tag table. It does not have a primary key because it is a weak entity relationship. Its two columns, competition_id and tag_id, are foreign keys that reference the competition table and the tag table.
Next, we create the organization table so that users can read an introduction to each organization and decide, after reading its description, whether they want to take a closer look at the competitions it holds. The primary key is organization_id, which is an integer. Organization_name is VARCHAR NOT NULL because every organization needs a name. We give organization_descrip a long VARCHAR length so that users can get a full understanding of the organization.
Next, we need to connect organizations to competitions so that people can look up which companies are holding which competitions. For this we create competition_organization, the bridge table between competition and organization. We set competition_id as the primary key. Competition_id and organization_id are foreign keys that reference the competition table and the organization table, respectively.
We then create the host table to hold information about each host. The unique host_id is the primary key, and host_name cannot be null.

Next, we create competition_host, the bridge table between host and competition, which contains host_id and competition_id. The primary key is competition_id, and both columns are foreign keys that reference the competition table and the host table.
We then create the category table, which describes the type of a competition, such as research or recruitment. Category_id is an integer and serves as the primary key. Category_name is NOT NULL so that it can be used for searching later.
Finally, we create competition_category, the bridge table between competition and category. It contains competition_id and category_id, which together form the composite primary key. They are also foreign keys that reference the competition table and the category table.


### Connecting database

In [2]:
# Connection settings are defined once so that the SQLAlchemy URL and the
# psycopg2 connection cannot drift apart. Each value can be overridden with
# the standard libpq environment variables; the defaults reproduce the
# original local setup, so running without any variables set behaves as before.
from urllib.parse import quote_plus

db_host = os.environ.get("PGHOST", "localhost")
db_port = os.environ.get("PGPORT", "5432")
db_name = os.environ.get("PGDATABASE", "checkpoint5")
db_user = os.environ.get("PGUSER", "postgres")
db_password = os.environ.get("PGPASSWORD", "123")

# quote_plus keeps characters such as '@' or '/' in the credentials from
# being parsed as URL delimiters.
conn_url = (
    f"postgresql://{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}:{db_port}/{db_name}"
)
engine = create_engine(conn_url)
connection = engine.connect()

# Separate raw psycopg2 connection and cursor, used for executing SQL scripts.
conn = psycopg2.connect(
    host=db_host,
    port=db_port,
    database=db_name,
    user=db_user,
    password=db_password,
)
cur = conn.cursor()

### Creating tables

In [5]:
# DDL script for the whole schema: 17 CREATE TABLE statements held in one
# string and executed further down in this cell.
# Parent tables are declared before the bridge tables that reference them,
# so every FOREIGN KEY target already exists when it is used.
# NOTE(review): competition_tag declares no PRIMARY KEY, so nothing in this
# schema prevents duplicate (competition_id, tag_id) rows -- confirm intended.
# NOTE(review): competition_organization and competition_host are keyed on
# competition_id alone, which limits each competition to one organization
# and one host -- confirm intended.
# NOTE(review): submission already carries team_id, so team_submission
# repeats that relationship -- confirm both are needed.
createTB = """
CREATE TABLE users (
    user_id INTEGER,
    user_name VARCHAR(100),
    user_performance_tier INTEGER,
    PRIMARY KEY(user_id)
);

CREATE TABLE achievement (
    achievement_id INTEGER,
    achievement_type VARCHAR(20),
    PRIMARY KEY(achievement_id)
);

CREATE TABLE user_achievement (
    user_id INTEGER,
    achievement_id INTEGER,
    PRIMARY KEY (user_id, achievement_id),
    FOREIGN KEY(user_id) REFERENCES users(user_id),
    FOREIGN KEY(achievement_id) REFERENCES achievement(achievement_id)
);

CREATE TABLE competition (
    competition_id INTEGER,
    slug VARCHAR(100) NOT NULL,
    title VARCHAR(100) NOT NULL,
    subtitle VARCHAR(250),
    deadline_date DATE,
    has_leaderboard BOOLEAN NOT NULL,
    max_dailysub INTEGER NOT NULL,
    max_teamsize INTEGER NOT NULL,
    reward_type VARCHAR(20),
    reward_quantity NUMERIC(10,2),
    total_teams INTEGER NOT NULL,
    total_subs INTEGER NOT NULL,
    PRIMARY KEY(competition_id)
);

CREATE TABLE team (
    team_id INTEGER,
    team_name VARCHAR(500),
    public_leaderboard_rank INTEGER,
    private_leaderboard_rank INTEGER,
    PRIMARY KEY(team_id)
);

CREATE TABLE submission (
    submission_id INTEGER,
    team_id INTEGER,
    submission_date DATE,
    is_after_deadline BOOLEAN,
    public_leaderboard_score FLOAT,
    PRIMARY KEY(submission_id),
    FOREIGN KEY(team_id) REFERENCES team(team_id)
);

CREATE TABLE team_submission(
    team_id INTEGER,
    submission_id INTEGER,
    PRIMARY KEY(team_id, submission_id),
    FOREIGN KEY(team_id) REFERENCES team(team_id),
    FOREIGN KEY(submission_id) REFERENCES submission(submission_id)
);

CREATE TABLE algorithm (
    algorithm_id INTEGER,
    algorithm_abbr VARCHAR(100),
    algorithm_name VARCHAR(100),
    algorithm_descrip VARCHAR(250),
    PRIMARY KEY(algorithm_id)
);

CREATE TABLE competition_algorithm (
    competition_id INTEGER,
    algorithm_id INTEGER,
    PRIMARY KEY(competition_id, algorithm_id),
    FOREIGN KEY(competition_id) REFERENCES competition(competition_id),
    FOREIGN KEY(algorithm_id) REFERENCES algorithm(algorithm_id)
);


CREATE TABLE tag (
    tag_id INTEGER,
    tag_name VARCHAR(50) NOT NULL,
    fullpath VARCHAR(100) NOT NULL,
    tag_descrip VARCHAR(300),
    datasetcount INTEGER NOT NULL,
    competitioncount INTEGER NOT NULL,
    kernelcount INTEGER NOT NULL,
    PRIMARY KEY(tag_id)
);

CREATE TABLE competition_tag (
    competition_id INTEGER,
    tag_id INTEGER,
    FOREIGN KEY(competition_id) REFERENCES competition(competition_id),
    FOREIGN KEY(tag_id) REFERENCES tag(tag_id)
);

CREATE TABLE organization (
    organization_id INTEGER,
    organization_name VARCHAR(100) NOT NULL,
    organization_descrip VARCHAR(5000),
    PRIMARY KEY(organization_id)
);

CREATE TABLE competition_organization (
    competition_id INTEGER,
    organization_id INTEGER,
    PRIMARY KEY(competition_id),
    FOREIGN KEY(competition_id) REFERENCES competition(competition_id),
    FOREIGN KEY(organization_id) REFERENCES organization(organization_id)
);

CREATE TABLE host (
    host_id INTEGER,
    host_name VARCHAR(100) NOT NULL,
    PRIMARY KEY(host_id)
);

CREATE TABLE competition_host (
    competition_id INTEGER,
    host_id INTEGER,
    PRIMARY KEY(competition_id),
    FOREIGN KEY(competition_id) REFERENCES competition(competition_id),
    FOREIGN KEY(host_id) REFERENCES host(host_id)
);


CREATE TABLE category (
    category_id INTEGER,
    category_name VARCHAR(20) NOT NULL,
    PRIMARY KEY(category_id)
);

CREATE TABLE competition_category (
    competition_id INTEGER,
    category_id INTEGER,
    PRIMARY KEY(competition_id, category_id),
    FOREIGN KEY(competition_id) REFERENCES competition(competition_id),
    FOREIGN KEY(category_id) REFERENCES category(category_id)
);

"""

# Run the whole DDL script in one transaction: it is committed only if every
# statement succeeds.
try:
    cur.execute(createTB)
except psycopg2.Error:
    # A failed statement leaves the psycopg2 connection in an aborted
    # transaction, and every later command on it fails until a rollback.
    # Roll back so the connection stays usable, then surface the error.
    conn.rollback()
    raise
else:
    conn.commit()
#17 tables created