# PoC-boardgamegeeks

Use the "Run" button to execute the code.

In [1]:
!pip install opendatasets --upgrade --quiet
!pip install pandas --upgrade --quiet
!pip install pandas-profiling




In [2]:
import opendatasets as od
import os
import pandas as pd
import numpy as np
import psycopg2.extras as extras
from pandas_profiling import ProfileReport

### Retrieving Dataset from Kaggle 

In [3]:
#Setting a variable for the kaggle dataset link
dataset_url = 'https://www.kaggle.com/threnjen/board-games-database-from-boardgamegeek' 


#### - You will need a kaggle account and a kaggle key in order to download this data set
#### - For the kaggle key, log in to your kaggle account page, scroll down to API - Create New API Token.
#### - Download and open the file and your kaggle key will be there.

In [4]:
od.download(dataset_url)

Skipping, found downloaded files in ".\board-games-database-from-boardgamegeek" (use force=True to force download)


In [5]:
#Setting a variable to the path of the dataset folder
data_dir = './board-games-database-from-boardgamegeek'

In [6]:
#Listing the contents of the path directory to make sure we have all the cvs files
os.listdir(data_dir)

['artists_reduced.csv',
 'bgg_data_documentation.txt',
 'designers_reduced.csv',
 'games.csv',
 'mechanics.csv',
 'publishers_reduced.csv',
 'ratings_distribution.csv',
 'subcategories.csv',
 'themes.csv',
 'user_ratings.csv']

In [7]:
pd.set_option('display.max_columns', None) #Always show all columns
pd.set_option('display.max_rows', 10) #Always show upto 200 rows

In [8]:
games_raw_df=pd.read_csv('./board-games-database-from-boardgamegeek/games.csv')


### Examine the column names of the dataset

In [9]:
games_raw_df.columns


Index(['BGGId', 'Name', 'Description', 'YearPublished', 'GameWeight',
       'AvgRating', 'BayesAvgRating', 'StdDev', 'MinPlayers', 'MaxPlayers',
       'ComAgeRec', 'LanguageEase', 'BestPlayers', 'GoodPlayers', 'NumOwned',
       'NumWant', 'NumWish', 'NumWeightVotes', 'MfgPlaytime', 'ComMinPlaytime',
       'ComMaxPlaytime', 'MfgAgeRec', 'NumUserRatings', 'NumComments',
       'NumAlternates', 'NumExpansions', 'NumImplementations',
       'IsReimplementation', 'Family', 'Kickstarted', 'ImagePath',
       'Rank:boardgame', 'Rank:strategygames', 'Rank:abstracts',
       'Rank:familygames', 'Rank:thematic', 'Rank:cgs', 'Rank:wargames',
       'Rank:partygames', 'Rank:childrensgames', 'Cat:Thematic',
       'Cat:Strategy', 'Cat:War', 'Cat:Family', 'Cat:CGS', 'Cat:Abstract',
       'Cat:Party', 'Cat:Childrens'],
      dtype='object')

In [10]:
games_raw_df.columns = ['BGGId', 'Name', 'Description', 'YearPublished', 'GameWeight',
       'AvgRating', 'BayesAvgRating', 'StdDev', 'MinPlayers', 'MaxPlayers',
       'ComAgeRec', 'LanguageEase', 'BestPlayers', 'GoodPlayers', 'NumOwned',
       'NumWant', 'NumWish', 'NumWeightVotes', 'MfgPlaytime', 'ComMinPlaytime',
       'ComMaxPlaytime', 'MfgAgeRec', 'NumUserRatings', 'NumComments',
       'NumAlternates', 'NumExpansions', 'NumImplementations',
       'IsReimplementation', 'Family', 'Kickstarted', 'ImagePath',
       'RankBoardgame', 'RankStrategygames', 'RankAbstracts',
       'RankFamilygames', 'RankThematic', 'RankCgs', 'RankWargames',
       'RankPartygames', 'RankChildrensgames', 'CatThematic',
       'CatStrategy', 'CatWar', 'CatFamily', 'CatCGS', 'CatAbstract',
       'CatParty', 'CatChildrens']

In [11]:
games_raw_df.describe()

Unnamed: 0,BGGId,YearPublished,GameWeight,AvgRating,BayesAvgRating,StdDev,MinPlayers,MaxPlayers,ComAgeRec,LanguageEase,BestPlayers,NumOwned,NumWant,NumWish,NumWeightVotes,MfgPlaytime,ComMinPlaytime,ComMaxPlaytime,MfgAgeRec,NumUserRatings,NumComments,NumAlternates,NumExpansions,NumImplementations,IsReimplementation,Kickstarted,RankBoardgame,RankStrategygames,RankAbstracts,RankFamilygames,RankThematic,RankCgs,RankWargames,RankPartygames,RankChildrensgames,CatThematic,CatStrategy,CatWar,CatFamily,CatCGS,CatAbstract,CatParty,CatChildrens
count,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,16395.0,16034.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0
mean,117652.663216,1985.494914,1.982131,6.424922,5.685673,1.516374,2.007343,5.707868,10.004391,216.461819,0.311517,1467.848164,41.690946,228.457013,49.480137,90.513523,63.678586,90.513523,9.613409,861.668324,0.0,1.603786,1.380068,0.308734,0.116762,0.153341,11019.514071,19729.826956,20839.347184,19732.270011,20736.185815,21625.086796,18680.185678,21295.352201,21062.680274,0.055827,0.10577,0.161003,0.105633,0.01382,0.050855,0.02919,0.040182
std,104628.721777,212.486214,0.848983,0.932477,0.365311,0.285578,0.693093,15.014643,3.269157,236.595136,1.067002,5294.120574,117.255229,788.477151,205.762375,529.657389,443.916212,529.657389,3.64156,3638.680857,0.0,9.619364,7.701036,0.848095,0.321143,0.360324,6372.926817,6389.614077,4695.170448,6387.089598,4893.928466,2542.033794,7420.906104,3637.139987,4219.776597,0.229592,0.30755,0.367542,0.307374,0.116745,0.219707,0.168344,0.196391
min,1.0,-3500.0,0.0,1.04133,3.57481,0.196023,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12346.0,2001.0,1.3333,5.83696,5.5103,1.32072,2.0,4.0,8.0,24.027778,0.0,150.0,3.0,14.0,4.0,25.0,20.0,25.0,8.0,56.0,0.0,0.0,0.0,0.0,0.0,0.0,5488.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,105305.0,2011.0,1.9688,6.45395,5.54654,1.47688,2.0,4.0,10.0,138.0,0.0,320.0,9.0,39.0,9.0,45.0,30.0,45.0,10.0,123.0,0.0,0.0,0.0,0.0,0.0,0.0,11022.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,206169.0,2017.0,2.5252,7.05245,5.67989,1.66547,2.0,6.0,12.0,351.0,0.0,899.0,28.0,127.0,26.0,90.0,60.0,90.0,12.0,395.0,0.0,1.0,1.0,0.0,0.0,0.0,16544.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,349161.0,2021.0,5.0,9.91429,8.51488,4.27728,10.0,999.0,21.0,1757.0,15.0,166497.0,2031.0,19182.0,7673.0,60000.0,60000.0,60000.0,25.0,108101.0,0.0,850.0,525.0,38.0,1.0,1.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,21926.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


##### Dropping unwanted columns

In [12]:
games_updated_df = games_raw_df.drop(['StdDev',"ComAgeRec","LanguageEase","GameWeight","BayesAvgRating",'NumOwned',
       'NumWant', 'NumWish', 'NumWeightVotes', 'NumComments',
       'NumAlternates', 'NumExpansions', 'NumImplementations',
       'IsReimplementation', 'Family', 'Kickstarted', 'ImagePath'], axis = 1) 


In [13]:
games_updated_df1 = games_updated_df.round({"AvgRating":2}) 

In [14]:
import psycopg2

#establish client connection - specify param values
conn = psycopg2.connect("host=localhost dbname=bgg user=pocuser password=poc123")

#cursor facilitates processing and executing commands such as traverse, add, remove, retrieve in a database
cur = conn.cursor()

#cur.execute('SELECT * FROM notes')
# one = cur.fetchone()
# all = cur.fetchall()


### Create Tables


In [16]:
#Create the games flat table

import psycopg2
conn = psycopg2.connect("host=localhost dbname=bgg user=pocuser password=poc123")
cur = conn.cursor()
cur.execute("""
    CREATE TABLE games_flat(
        bggid integer PRIMARY KEY,
        name text,
        yearpublished int,
        category text,
        mfgplaytime int,
        minplayers int,
        maxplayers int,
        avgrating real,
        mfgagerec int,
        numuserratings int
)
""")
conn.commit()

DuplicateTable: relation "games_flat" already exists


In [17]:
#Create the games table

import psycopg2
conn = psycopg2.connect("host=localhost dbname=bgg user=pocuser password=poc123")
cur = conn.cursor()
cur.execute("""
    CREATE TABLE games(
        bggid integer PRIMARY KEY,
        name text,
        yearpublished int,
        mfgagerec int
)
""")
conn.commit()

DuplicateTable: relation "games" already exists


In [18]:
#Create the players table

import psycopg2
conn = psycopg2.connect("host=localhost dbname=bgg user=pocuser password=poc123")
cur = conn.cursor()
cur.execute("""
    CREATE TABLE users(
        bggid integer PRIMARY KEY,
        minplayers integer,
        maxplayers integer
        )
""")
conn.commit()

DuplicateTable: relation "users" already exists


In [19]:
#Create the ratings table

import psycopg2
conn = psycopg2.connect("host=localhost dbname=bgg user=pocuser password=poc123")
cur = conn.cursor()
cur.execute("""
    CREATE TABLE ratings (
        bggid integer PRIMARY KEY,
        avgrating INTEGER,
        numuserratings INTEGER
)
""")
conn.commit()



DuplicateTable: relation "ratings" already exists


In [20]:
#Create the playtime table

import psycopg2
conn = psycopg2.connect("host=localhost dbname=bgg user=pocuser password=poc123")
cur = conn.cursor()
cur.execute("""
    CREATE TABLE playtime(
        BGGId integer PRIMARY KEY,
        MfgPlaytime INTEGER
)
""")
conn.commit()

DuplicateTable: relation "playtime" already exists


In [21]:
#Create the playtime table

import psycopg2
conn = psycopg2.connect("host=localhost dbname=bgg user=pocuser password=poc123")
cur = conn.cursor()
cur.execute("""
    CREATE TABLE category(
        BGGId integer PRIMARY KEY,
        category text
)
""")
conn.commit()

DuplicateTable: relation "category" already exists


## select_sql() and execute_values() functions

In [22]:
#defining a function to read the sql query and print out into jupyter notebook

def select_sql(sql):
    return pd.read_sql(sql, conn)

In [23]:
#Defining a funtion to export the dataframe from jupter into the postgres database - tables

def execute_values(conn, df, table):
    """
    Using psycopg2.extras.execute_values() to insert the dataframe
    """
    # Create a list of tupples from the dataframe values
    tuples = [tuple(x) for x in df.to_numpy()]
    # Comma-separated dataframe columns
    cols = ','.join(list(df.columns))
    # SQL quert to execute
    query  = "INSERT INTO %s(%s) VALUES %%s" % (table, cols)
    cursor = conn.cursor()
    try:
        extras.execute_values(cursor, query, tuples)
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
    print("execute_values() done")
    cursor.close()

## Games filtering for creating smaller dataframes

In [24]:
games_updated_df2 = games_updated_df1.filter(['BGGId'
                                              ,'Name'
                                              #,'Description'
                                              #,'YearPublished'
                                              #,'AvgRating'
                                              #,'MinPLayer'
                                              #,'MaxPlayers'
                                              #,'BestPlayers'
                                              #,'MfgPlaytime'
                                              #,"ComMinPlaytime"
                                              #,'ComMaxPLaytime'
                                              #,'MfgAgeRec'
                                              #,'NumUserRatings'
                                              #,'RankBoardgame'
                                              #,'RankStrategygames'
                                              #,'RankAbstracts'
                                              #,'RankFamilygames'
                                              #,'RankThematic'
                                              #,'RankCgs'
                                              #,'RankWargames'
                                              #,'RankPartygames'
                                              #,'RankChildrensgames'
                                              ,'CatThematic'
                                              ,'CatStrategy'
                                              , 'CatWar'
                                              , 'CatFamily'
                                              , 'CatCGS'
                                              , 'CatAbstract'
                                              ,'CatParty'
                                              , 'CatChildrens'
                                             ])

In [25]:
games_updated_df2

Unnamed: 0,BGGId,Name,CatThematic,CatStrategy,CatWar,CatFamily,CatCGS,CatAbstract,CatParty,CatChildrens
0,1,Die Macher,0,1,0,0,0,0,0,0
1,2,Dragonmaster,0,1,0,0,0,0,0,0
2,3,Samurai,0,1,0,0,0,0,0,0
3,4,Tal der Könige,0,0,0,0,0,0,0,0
4,5,Acquire,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
21920,347146,Salvage,0,0,0,0,0,0,0,0
21921,347521,Blitzkrieg!: World War Two in 20 Minutes,0,0,0,0,0,0,0,0
21922,348955,Rock Paper Scissors: Deluxe Edition,0,0,0,0,0,0,0,0
21923,349131,Splitter,0,0,0,0,0,0,0,0


In [26]:
def from_one_hot_to_category(row, names_array): #take a row of a dataframe and a array of names
    cat_array = np.array(row) #interpret the row as an np.array
    names_array = [" " + s + "," for s in names_array] #Add an " " at the beginning and a "," at the end of every name in names_array
    return ''.join(np.repeat(names_array, cat_array)) #Output names_array * cat_array.transpose, whereby string * int = string if int = 1 and 0 else

In [27]:
category_df = games_updated_df2
games_updated_df2['Category'] = games_updated_df2[['CatThematic',
       'CatStrategy', 'CatWar', 'CatFamily', 'CatCGS', 'CatAbstract',
       'CatParty', 'CatChildrens']].apply(lambda row: from_one_hot_to_category(row, list(row.index)), axis = 1) #apply from_one_hot_to_category rowwise

games_updated_df2['Category'] = games_updated_df2['Category'].str[1:-1].str.replace('Cat','') #Get rid of the first " " at left hand side and the last "," at the right hand side

category_df = category_df.merge(games_updated_df2[['BGGId', 'Category']], on = 'BGGId', how = 'left') #Add the new column Category to sub_df

In [28]:
category_df

Unnamed: 0,BGGId,Name,CatThematic,CatStrategy,CatWar,CatFamily,CatCGS,CatAbstract,CatParty,CatChildrens,Category_x,Category_y
0,1,Die Macher,0,1,0,0,0,0,0,0,Strategy,Strategy
1,2,Dragonmaster,0,1,0,0,0,0,0,0,Strategy,Strategy
2,3,Samurai,0,1,0,0,0,0,0,0,Strategy,Strategy
3,4,Tal der Könige,0,0,0,0,0,0,0,0,,
4,5,Acquire,0,1,0,0,0,0,0,0,Strategy,Strategy
...,...,...,...,...,...,...,...,...,...,...,...,...
21920,347146,Salvage,0,0,0,0,0,0,0,0,,
21921,347521,Blitzkrieg!: World War Two in 20 Minutes,0,0,0,0,0,0,0,0,,
21922,348955,Rock Paper Scissors: Deluxe Edition,0,0,0,0,0,0,0,0,,
21923,349131,Splitter,0,0,0,0,0,0,0,0,,


In [29]:
category_df = category_df.drop(['Name','CatThematic', 'CatStrategy', 'CatWar','CatFamily', 'CatAbstract',"CatCGS",'CatParty','CatChildrens', "Category_y"], axis = 1)

In [30]:
#df.rename(columns={"A": "a", "B": "c"})
category_df=category_df.rename(columns={"Category_x": "Category"})
category_df

Unnamed: 0,BGGId,Category
0,1,Strategy
1,2,Strategy
2,3,Strategy
3,4,
4,5,Strategy
...,...,...
21920,347146,
21921,347521,
21922,348955,
21923,349131,


In [31]:
category_df.Category.unique()

array(['Strategy', '', 'Abstract', 'Family', 'Strategy, Family',
       'Thematic, Strategy', 'CGS', 'Thematic, War', 'Thematic', 'War',
       'Family, Party', 'Family, Abstract', 'Strategy, War', 'Party',
       'Strategy, Abstract', 'Strategy, Party', 'Thematic, Family',
       'War, Family', 'Thematic, Strategy, War', 'Family, Childrens',
       'Childrens', 'Thematic, Party', 'War, Abstract', 'Thematic, CGS',
       'War, CGS', 'Abstract, Childrens', 'War, Childrens',
       'Party, Childrens', 'CGS, Abstract', 'War, Abstract, Childrens',
       'Strategy, CGS', 'Abstract, Party', 'Family, Party, Childrens',
       'Thematic, War, CGS', 'Thematic, Strategy, Abstract',
       'Thematic, Abstract', 'Thematic, Strategy, Family', 'War, Party',
       'Thematic, Family, Party', 'Strategy, Family, Party'], dtype=object)

### As you can see the categories has a blank category as well as more than one for some, we will clean this up to only look at games with one category only

In [32]:
category_df.replace('', np.nan, inplace=True)

category_df.dropna(subset = ['Category'], inplace=True)


In [33]:
category_df.Category.unique()

array(['Strategy', 'Abstract', 'Family', 'Strategy, Family',
       'Thematic, Strategy', 'CGS', 'Thematic, War', 'Thematic', 'War',
       'Family, Party', 'Family, Abstract', 'Strategy, War', 'Party',
       'Strategy, Abstract', 'Strategy, Party', 'Thematic, Family',
       'War, Family', 'Thematic, Strategy, War', 'Family, Childrens',
       'Childrens', 'Thematic, Party', 'War, Abstract', 'Thematic, CGS',
       'War, CGS', 'Abstract, Childrens', 'War, Childrens',
       'Party, Childrens', 'CGS, Abstract', 'War, Abstract, Childrens',
       'Strategy, CGS', 'Abstract, Party', 'Family, Party, Childrens',
       'Thematic, War, CGS', 'Thematic, Strategy, Abstract',
       'Thematic, Abstract', 'Thematic, Strategy, Family', 'War, Party',
       'Thematic, Family, Party', 'Strategy, Family, Party'], dtype=object)

In [34]:
#This options variable is used to select on the one category types

options = ['Thematic'
           ,'Strategy'
           , 'War'
           , 'Family'
           , 'CGS'
           , 'Abstract'
           ,'Party'
           , 'Childrens']
# selecting rows based on condition
category_cleaned_df = category_df[category_df['Category'].isin(options)]

In [35]:
category_cleaned_df.Category.unique()

array(['Strategy', 'Abstract', 'Family', 'CGS', 'Thematic', 'War',
       'Party', 'Childrens'], dtype=object)

In [36]:
games_id_name_year_df = games_updated_df1.filter(['BGGId', 'Name', 'YearPublished'], axis = 1)

### Remove any anomalies in the table with the yearpublished as 0. This could have been a place holder.

In [37]:
games_id_name_year_df.replace(0, np.nan, inplace=True)

games_id_name_year_df.dropna(subset = ['YearPublished'], inplace=True)

In [38]:
games_id_name_year_df

Unnamed: 0,BGGId,Name,YearPublished
0,1,Die Macher,1986.0
1,2,Dragonmaster,1981.0
2,3,Samurai,1998.0
3,4,Tal der Könige,1992.0
4,5,Acquire,1964.0
...,...,...,...
21919,346965,Azul: Queen's Garden,2021.0
21920,347146,Salvage,2021.0
21921,347521,Blitzkrieg!: World War Two in 20 Minutes,2019.0
21922,348955,Rock Paper Scissors: Deluxe Edition,2021.0


### Convert the year datatype as there seems to be a decimal 0 at the end

In [39]:
games_id_name_year_df = games_id_name_year_df.convert_dtypes()
games_id_name_year_df

Unnamed: 0,BGGId,Name,YearPublished
0,1,Die Macher,1986
1,2,Dragonmaster,1981
2,3,Samurai,1998
3,4,Tal der Könige,1992
4,5,Acquire,1964
...,...,...,...
21919,346965,Azul: Queen's Garden,2021
21920,347146,Salvage,2021
21921,347521,Blitzkrieg!: World War Two in 20 Minutes,2019
21922,348955,Rock Paper Scissors: Deluxe Edition,2021


### Now to merge with the category_cleaned_df

In [40]:
#df = pd.merge(df1, df2, on="ID")
games_id_name_year_cat_df = pd.merge(games_id_name_year_df, category_cleaned_df, on="BGGId",  how="inner")

In [41]:
games_id_name_year_cat_df

Unnamed: 0,BGGId,Name,YearPublished,Category
0,1,Die Macher,1986,Strategy
1,2,Dragonmaster,1981,Strategy
2,3,Samurai,1998,Strategy
3,5,Acquire,1964,Strategy
4,7,Cathedral,1978,Abstract
...,...,...,...,...
9113,339214,HIT !,2021,Family
9114,340466,Unfathomable,2021,Thematic
9115,342942,Ark Nova,2021,Strategy
9116,343562,Horrified: American Monsters,2021,Strategy


### Build the playtime_df in order to clean (remove large play times of 0 and greater than 210 minutes) and merge

In [42]:
games_id_playtime_df = games_updated_df1.filter(['BGGId', 'MfgPlaytime'], axis = 1)

In [43]:
games_id_playtime_df.replace(0, np.nan, inplace=True)

games_id_playtime_df.dropna(subset = ['MfgPlaytime'], inplace=True)

In [44]:
games_id_playtime_df2 = games_id_playtime_df[games_id_playtime_df['MfgPlaytime']<210]

In [45]:
games_id_playtime_df2

Unnamed: 0,BGGId,MfgPlaytime
1,2,30.0
2,3,60.0
3,4,60.0
4,5,90.0
6,7,20.0
...,...,...
21920,347146,40.0
21921,347521,45.0
21922,348955,1.0
21923,349131,15.0


In [46]:
games_id_name_cat_year_playtime_df = pd.merge(games_id_name_year_cat_df, games_id_playtime_df2, on="BGGId",  how="inner")

In [47]:
games_id_name_cat_year_playtime_df

Unnamed: 0,BGGId,Name,YearPublished,Category,MfgPlaytime
0,2,Dragonmaster,1981,Strategy,30.0
1,3,Samurai,1998,Strategy,60.0
2,5,Acquire,1964,Strategy,90.0
3,7,Cathedral,1978,Abstract,20.0
4,9,El Caballero,1998,Strategy,90.0
...,...,...,...,...,...
7642,338628,TRAILS,2021,Family,40.0
7643,339214,HIT !,2021,Family,20.0
7644,342942,Ark Nova,2021,Strategy,150.0
7645,343562,Horrified: American Monsters,2021,Strategy,60.0


In [48]:
games_id_minplayers_df = games_updated_df1.filter(['BGGId', 'MinPlayers'], axis = 1)

games_id_minplayers_df.replace(0, np.nan, inplace=True)

games_id_minplayers_df.dropna(subset = ['MinPlayers'], inplace=True)

games_incypm_df = pd.merge(games_id_name_cat_year_playtime_df, games_id_minplayers_df, on="BGGId",  how="inner")

games_incypm_df

Unnamed: 0,BGGId,Name,YearPublished,Category,MfgPlaytime,MinPlayers
0,2,Dragonmaster,1981,Strategy,30.0,3.0
1,3,Samurai,1998,Strategy,60.0,2.0
2,5,Acquire,1964,Strategy,90.0,2.0
3,7,Cathedral,1978,Abstract,20.0,2.0
4,9,El Caballero,1998,Strategy,90.0,2.0
...,...,...,...,...,...,...
7638,338628,TRAILS,2021,Family,40.0,2.0
7639,339214,HIT !,2021,Family,20.0,2.0
7640,342942,Ark Nova,2021,Strategy,150.0,1.0
7641,343562,Horrified: American Monsters,2021,Strategy,60.0,1.0


In [49]:
games_id_maxplayers_df = games_updated_df1.filter(['BGGId', 'MaxPlayers'], axis = 1)

games_id_maxplayers_df.replace(0, np.nan, inplace=True)

games_id_maxplayers_df.dropna(subset = ['MaxPlayers'], inplace=True)

games_incypmx_df = pd.merge(games_incypm_df, games_id_maxplayers_df, on="BGGId",  how="inner")

games_incypmx2_df = games_incypmx_df[games_incypmx_df['MaxPlayers']<21]

games_incypmx2_df

Unnamed: 0,BGGId,Name,YearPublished,Category,MfgPlaytime,MinPlayers,MaxPlayers
0,2,Dragonmaster,1981,Strategy,30.0,3.0,4.0
1,3,Samurai,1998,Strategy,60.0,2.0,4.0
2,5,Acquire,1964,Strategy,90.0,2.0,6.0
3,7,Cathedral,1978,Abstract,20.0,2.0,2.0
4,9,El Caballero,1998,Strategy,90.0,2.0,4.0
...,...,...,...,...,...,...,...
7614,338628,TRAILS,2021,Family,40.0,2.0,4.0
7615,339214,HIT !,2021,Family,20.0,2.0,5.0
7616,342942,Ark Nova,2021,Strategy,150.0,1.0,4.0
7617,343562,Horrified: American Monsters,2021,Strategy,60.0,1.0,5.0


In [50]:
games_id_ageuser_df = games_updated_df1.filter(['BGGId', 'AvgRating', 'MfgAgeRec', 'NumUserRatings'], axis = 1)

games_id_ageuser_df.replace(0, np.nan, inplace=True)

games_id_ageuser_df.dropna(subset = ['AvgRating', 'MfgAgeRec','NumUserRatings'], inplace=True)

games_cleaner_df = pd.merge(games_incypmx2_df, games_id_ageuser_df, on="BGGId",  how="inner")

games_cleaner_df

Unnamed: 0,BGGId,Name,YearPublished,Category,MfgPlaytime,MinPlayers,MaxPlayers,AvgRating,MfgAgeRec,NumUserRatings
0,2,Dragonmaster,1981,Strategy,30.0,3.0,4.0,6.65,12.0,562
1,3,Samurai,1998,Strategy,60.0,2.0,4.0,7.46,10.0,15146
2,5,Acquire,1964,Strategy,90.0,2.0,6.0,7.34,12.0,18655
3,7,Cathedral,1978,Abstract,20.0,2.0,2.0,6.52,8.0,3320
4,9,El Caballero,1998,Strategy,90.0,2.0,4.0,6.45,13.0,1389
...,...,...,...,...,...,...,...,...,...,...
7138,338628,TRAILS,2021,Family,40.0,2.0,4.0,7.29,10.0,554
7139,339214,HIT !,2021,Family,20.0,2.0,5.0,7.33,8.0,31
7140,342942,Ark Nova,2021,Strategy,150.0,1.0,4.0,8.48,14.0,618
7141,343562,Horrified: American Monsters,2021,Strategy,60.0,1.0,5.0,7.87,10.0,334


### Here is the flattened and cleaned table which will be used to do some analysis

In [51]:
games_cleaner_df = games_cleaner_df.convert_dtypes()
games_cleaner_df

Unnamed: 0,BGGId,Name,YearPublished,Category,MfgPlaytime,MinPlayers,MaxPlayers,AvgRating,MfgAgeRec,NumUserRatings
0,2,Dragonmaster,1981,Strategy,30,3,4,6.65,12,562
1,3,Samurai,1998,Strategy,60,2,4,7.46,10,15146
2,5,Acquire,1964,Strategy,90,2,6,7.34,12,18655
3,7,Cathedral,1978,Abstract,20,2,2,6.52,8,3320
4,9,El Caballero,1998,Strategy,90,2,4,6.45,13,1389
...,...,...,...,...,...,...,...,...,...,...
7138,338628,TRAILS,2021,Family,40,2,4,7.29,10,554
7139,339214,HIT !,2021,Family,20,2,5,7.33,8,31
7140,342942,Ark Nova,2021,Strategy,150,1,4,8.48,14,618
7141,343562,Horrified: American Monsters,2021,Strategy,60,1,5,7.87,10,334


### NOW, we can export these dataframes into the postgres tables using the function execute_values defined above

In [52]:
execute_values(conn, games_cleaner_df, 'games_flat')

Error: current transaction is aborted, commands ignored until end of transaction block



1

## Do a little profiling with pandas-profiling. Find the report_games.html in your directory or you can view the one in tthe github repository.

In [53]:
profile = ProfileReport(games_cleaner_df, title = 'Games Flattened and Cleaned', explorative= True)
#profile.to_notebook_iframe()
profile.to_file('report_games.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Now lets make sure it has properly loaded into the database. If it has, we can retrieve it from inside this norebook using the select_sql function defined earlier! Don't mind the warning popping up, we just need to see if it works or not.

In [54]:
sql = '''select * from games_flat limit 10'''
select_sql(sql)



Unnamed: 0,bggid,name,yearpublished,category,mfgplaytime,minplayers,maxplayers,avgrating,mfgagerec,numuserratings
0,2,Dragonmaster,1981,Strategy,30,3,4,6.65,12,562
1,3,Samurai,1998,Strategy,60,2,4,7.46,10,15146
2,5,Acquire,1964,Strategy,90,2,6,7.34,12,18655
3,7,Cathedral,1978,Abstract,20,2,2,6.52,8,3320
4,9,El Caballero,1998,Strategy,90,2,4,6.45,13,1389
5,10,Elfenland,1998,Family,60,2,6,6.7,10,8324
6,11,Bohnanza,1997,Family,45,2,7,7.04,13,39886
7,12,Ra,1999,Strategy,60,2,5,7.48,12,19685
8,16,MarraCash,1996,Strategy,60,3,4,6.83,12,964
9,17,Button Men,1999,CGS,5,2,2,6.37,10,804


### Now lets chop this cleaned up flat dataframe into smaller dataframes for exporting into the seperate tables

In [55]:
games_df = games_cleaner_df.filter(['BGGId', 'Name', 'YearPublished', 'MfgAgeRec'])
users_df = games_cleaner_df.filter(['BGGId', 'MinPlayers', 'MaxPlayers'])
ratings_df = games_cleaner_df.filter(['BGGId', 'AvgRating', 'NumUserRatings'])
playtime_df = games_cleaner_df.filter(['BGGId', 'MfgPlaytime'])
category_df = games_cleaner_df.filter(['BGGId', 'Category'])


In [56]:
execute_values(conn, games_df, 'games')

Error: duplicate key value violates unique constraint "games_pkey"
DETAIL:  Key (bggid)=(2) already exists.



1

In [57]:
execute_values(conn, users_df, 'users')

Error: duplicate key value violates unique constraint "users_pkey"
DETAIL:  Key (bggid)=(2) already exists.



1

In [58]:
execute_values(conn, ratings_df, 'ratings')

Error: duplicate key value violates unique constraint "ratings_pkey"
DETAIL:  Key (bggid)=(2) already exists.



1

In [59]:
execute_values(conn, playtime_df, 'playtime')

Error: duplicate key value violates unique constraint "playtime_pkey"
DETAIL:  Key (bggid)=(2) already exists.



1

In [60]:
execute_values(conn, category_df, 'category')

Error: duplicate key value violates unique constraint "category_pkey"
DETAIL:  Key (bggid)=(2) already exists.



1

### Let's Query each one just to be sure

In [61]:
sql = '''select * from games limit 10'''
select_sql(sql)



Unnamed: 0,bggid,name,yearpublished,mfgagerec
0,2,Dragonmaster,1981,12
1,3,Samurai,1998,10
2,5,Acquire,1964,12
3,7,Cathedral,1978,8
4,9,El Caballero,1998,13
5,10,Elfenland,1998,10
6,11,Bohnanza,1997,13
7,12,Ra,1999,12
8,16,MarraCash,1996,12
9,17,Button Men,1999,10


In [62]:
sql = '''select * from users limit 10'''
select_sql(sql)



Unnamed: 0,bggid,minplayers,maxplayers
0,2,3,4
1,3,2,4
2,5,2,6
3,7,2,2
4,9,2,4
5,10,2,6
6,11,2,7
7,12,2,5
8,16,3,4
9,17,2,2


In [63]:
sql = '''select * from playtime limit 10'''
select_sql(sql)



Unnamed: 0,bggid,mfgplaytime
0,2,30
1,3,60
2,5,90
3,7,20
4,9,90
5,10,60
6,11,45
7,12,60
8,16,60
9,17,5


In [64]:
sql = '''select * from category limit 10'''
select_sql(sql)



Unnamed: 0,bggid,category
0,2,Strategy
1,3,Strategy
2,5,Strategy
3,7,Abstract
4,9,Strategy
5,10,Family
6,11,Family
7,12,Strategy
8,16,Strategy
9,17,CGS


In [66]:
import psycopg2
#establish connection
conn = psycopg2.connect("host=localhost dbname=bgg user=pocuser password=poc123")
#cursor facilitates processing such as traverse, add, remove, retrieve in a database
cur = conn.cursor()
cur.execute('SELECT * FROM ratings')