In [1]:
import os
from pathlib import Path

import pandas as pd
import psycopg2
# Let wide frames show up to 40 columns before pandas truncates the display.
pd.set_option('display.max_columns', 40)

### Import raw dataset

#### Challenge - Build code to collect data directly from hockey-reference.com

In [2]:
# Root folder that holds the project's csv/in and csv/out directories.
# Set the HOCKEY_PROJECT_DIR environment variable to run the notebook on
# another machine; the fallback is the location this notebook was written for.
PROJECT_DIR = Path(os.environ.get(
    'HOCKEY_PROJECT_DIR',
    '/Users/mattrende/Documents/Coding/hockey-reference/YouTube Project',
))

# Raw scoring log for the tracked player (input file).
df = pd.read_csv(PROJECT_DIR / 'csv' / 'in' / 'jack_hughes_scoring_2023.csv')

# Game log written to csv/out; merged onto the scoring log by Date further down.
team_games = pd.read_csv(PROJECT_DIR / 'csv' / 'out' / 'fact_gamelog.csv')

# Stored in the tracked_player_name column of every row built below.
player_name = 'Hughes, Jack'

In [3]:
# Strip the dashes from the date text and store it as an integer
# (e.g. 20221018) so it can be used as the merge key against team_games.
# Presumably the raw values look like '2022-10-18' - confirm against the CSV.
df['Date'] = df['Date'].astype(str).str.replace('-', '').astype(int)

In [4]:
# Attach the game-level columns from team_games to every scoring row.
# The join key is Date alone, so each date must appear at most once in
# team_games; validate='many_to_one' makes pandas raise a MergeError if it
# does not, instead of silently duplicating scoring rows that would later be
# inserted into the database.
df = df.merge(team_games, on=['Date'], how='left', validate='many_to_one')

In [5]:
# Replace the unnamed / terse source headers with descriptive column names.
column_renames = {
    'Unnamed: 3': 'Home',
    'Unnamed: 7': 'Goal_Strength',
    'Key': 'Team_Game_Key',
    'Rk': 'Point_Number',
}
df = df.rename(columns=column_renames)

In [6]:
def _last_first(full_name):
    """Reformat 'First Last' as 'Last, First'.

    Everything after the first word is treated as the surname, so a name with
    more than two words keeps all of its parts. A single-word name is returned
    unchanged and a blank name yields None.
    """
    words = full_name.split()
    if not words:
        return None
    if len(words) == 1:
        return words[0]
    return ', '.join([' '.join(words[1:]), words[0]])


def _parse_scoring_description(desc):
    """Split a scoring description into (goal_scorer, primary_assist, secondary_assist).

    Expected text: 'Goal by <scorer>[ assisted by <primary>[ and <secondary>]]'.
    Each name is returned as 'Last, First'. A part that is absent from the
    description is returned as None, and so is every part when `desc` is not a
    string (for example a missing value).
    """
    if not isinstance(desc, str):
        return None, None, None

    scorer_part, _, assist_part = desc.partition(' assisted by ')
    primary_part, _, secondary_part = assist_part.partition(' and ')

    # Skip the two leading words ('Goal by'); the rest is the scorer's name.
    scorer_name = ' '.join(scorer_part.split()[2:])

    return (
        _last_first(scorer_name),
        _last_first(primary_part),
        _last_first(secondary_part),
    )


goal_scorer = []
primary_assist = []
secondary_assist = []

# Names are located by the literal separators in the text rather than by fixed
# word positions, so an unassisted goal gives None (not an empty string) and a
# multi-word name does not shift the names that follow it.
for desc in df['Description']:
    scorer, primary, secondary = _parse_scoring_description(desc)
    goal_scorer.append(scorer)
    primary_assist.append(primary)
    secondary_assist.append(secondary)

df['goal_scorer'] = goal_scorer
df['primary_assist'] = primary_assist
df['secondary_assist'] = secondary_assist
df['tracked_player_name'] = player_name

In [7]:
# Preview the first rows to spot-check the merged columns and the parsed
# goal_scorer / primary_assist / secondary_assist values.
df.head()

Unnamed: 0,Point_Number,Date,Tm,Home,Opp,P,Time,Goal_Strength,Description,Team_Game_Key,team_id,Season_Year,GP,Season,Home.1,Opp_Name,GF,GA,Result,OT,Team_S,Team_PIM,Team_PPG,Team_PPO,Team_SHG,Opp_S,Opp_PIM,Opp_PPG,Opp_PPO,Opp_SHG,goal_scorer,primary_assist,secondary_assist,tracked_player_name
0,1,20221018,NJD,,ANA,2,04:17,EV,Goal by Ondřej Palát assisted by Ryan Graves a...,18-2023-R3,18,2023,3,R,1,Anaheim Ducks,4,2,W,0,41,15,0,3,0,20,23,0,4,0,"Palát, Ondřej","Graves, Ryan","Hughes, Jack","Hughes, Jack"
1,2,20221018,NJD,,ANA,3,12:49,EV,Goal by Dawson Mercer assisted by Yegor Sharan...,18-2023-R3,18,2023,3,R,1,Anaheim Ducks,4,2,W,0,41,15,0,3,0,20,23,0,4,0,"Mercer, Dawson","Sharangovich, Yegor","Hughes, Jack","Hughes, Jack"
2,3,20221020,NJD,@,NYI,2,01:35,EV,Goal by Jack Hughes assisted by Damon Severson,18-2023-R4,18,2023,4,R,0,New York Islanders,4,1,W,0,43,6,0,3,0,17,8,0,2,0,"Hughes, Jack","Severson, Damon",,"Hughes, Jack"
3,4,20221025,NJD,@,DET,1,15:16,EV,Goal by Jack Hughes assisted by Erik Haula and...,18-2023-R7,18,2023,7,R,0,Detroit Red Wings,6,2,W,0,41,6,1,1,1,22,2,0,3,0,"Hughes, Jack","Haula, Erik","Bratt, Jesper","Hughes, Jack"
4,5,20221025,NJD,@,DET,2,00:43,PP,Goal by Jesper Bratt assisted by Jack Hughes a...,18-2023-R7,18,2023,7,R,0,Detroit Red Wings,6,2,W,0,41,6,1,1,1,22,2,0,3,0,"Bratt, Jesper","Hughes, Jack","Hamilton, Dougie","Hughes, Jack"


#### Connect to database

In [8]:
# The password is read from the HOCKEY_DB_PASSWORD environment variable so it
# does not have to live in the notebook; the fallback keeps the existing local
# setup working unchanged. Keyword arguments are used instead of a DSN string
# so a password containing spaces or quotes needs no escaping.
try:
    conn = psycopg2.connect(
        host="127.0.0.1",
        dbname="hockey_yt_project",
        user="postgres",
        password=os.environ.get("HOCKEY_DB_PASSWORD", "password"),
    )
except psycopg2.Error as e:
    print("Error: Could not connect to database")
    print(e)
    # Without a connection the next statement would fail with an unrelated
    # NameError on `conn`, so stop here with the real cause.
    raise

try:
    cur = conn.cursor()
except psycopg2.Error as e:
    print("Error: Could not get cursor to the database")
    print(e)
    raise

# Commit every statement immediately; no explicit commit() is issued later on.
conn.set_session(autocommit=True)

#### Find last game in database

In [9]:
# Latest game date already stored for the tracked player. The lookup is
# restricted to player_name so rows loaded for other tracked players cannot
# push the cut-off date past games this player still needs.
try:
    cur.execute("""select max(g.date)
                   from fact_player_scoring_log p
                   join fact_gamelog g
                     on g.key = p.team_year_gp_key
                   where p.tracked_player_name = %s;""", (player_name,))
except psycopg2.Error as e:
    print("Select Error")
    print(e)
    # fetchone() below cannot succeed after a failed query; surface this error.
    raise

# An aggregate query returns exactly one row; max() is NULL (None) when the
# player has no rows yet. Fall back to 0, which is lower than any YYYYMMDD
# date, so the filter on df.Date keeps every row in that case.
row = cur.fetchone()
last_game_in_db = row[0] if row is not None and row[0] is not None else 0

In [10]:
# Display the cut-off date fetched from the database as a quick sanity check.
last_game_in_db

20230413

#### Filter for rows after last game in db

In [11]:
# Columns to load, in the same order as the column list of the INSERT
# statement (the values are passed to it positionally).
insert_columns = [
    'tracked_player_name',
    'Point_Number',
    'Team_Game_Key',
    'P',
    'Time',
    'Goal_Strength',
    'Description',
    'goal_scorer',
    'primary_assist',
    'secondary_assist',
]

# Keep only the points scored after the latest game already in the database.
is_new_point = df['Date'] > last_game_in_db
missing_points = df.loc[is_new_point, insert_columns]
missing_points.head()

Unnamed: 0,tracked_player_name,Point_Number,Team_Game_Key,P,Time,Goal_Strength,Description,goal_scorer,primary_assist,secondary_assist


In [12]:
# Parameterised INSERT for one row of fact_player_scoring_log. The ten %s
# placeholders are filled positionally from the sequence given to
# cur.execute(), so the values must arrive in the column order listed here.
player_scoring_insert = ("""INSERT INTO fact_player_scoring_log(
                  tracked_player_name,
                  point_number,
                  team_year_gp_key,
                  p,
                  time,
                  goal_strength,
                  description,
                  goal_scorer,
                  primary_assist,
                  secondary_assist)
               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
               """)

#### Insert into DB

In [13]:
# Write every missing scoring row to the database, one INSERT per row.
# NOTE(review): the connection runs in autocommit mode, so rows inserted
# before a failure stay in the table.
try:
    for _, point in missing_points.iterrows():
        values = list(point)
        cur.execute(player_scoring_insert, values)
except psycopg2.Error as insert_error:
    print("Error: could not insert rows")
    print(insert_error)