In [265]:
#imports
#standard library
import csv
import os
from urllib.parse import quote_plus #escapes credentials placed inside the DB URL

#third-party
import boto3
from botocore.exceptions import ClientError #for debugging
from dotenv import load_dotenv #for env variables
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
print("psycopg2 version:", psycopg2.__version__)

psycopg2 version: 2.9.10 (dt dec pq3 ext lo64)


In [266]:
#pull the settings stored in the local .env file into the process environment
load_dotenv()

#AWS credentials; each one is None when the variable is not set
access_key_id = os.environ.get('ACCESS_KEY_ID')
secret_key = os.environ.get('SECRET_ACCESS_KEY')

#S3 bucket to read from and the local folder the downloaded files are saved into
bucket_name = "mindex-data-analytics-code-challenge"
data_folder = "/Users/mady/mindex_data_analytics_challenge/data/"

#uncomment to check that the keys hold the values that were provided
#print('access_key_id: ', access_key_id)
#print('secret_key: ', secret_key)


In [267]:

#create s3 client
s3 = boto3.client(
    's3',
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_key,
    region_name='us-east-1',
)

#file names to download
files = ['bengals.csv', 'boyd_receiving.csv', 'chase_receiving.csv', 'higgins_receiving.csv']

#make sure the target folder exists, otherwise the downloads have nowhere to be written
os.makedirs(data_folder, exist_ok=True)

#download each file; a failure is reported and the loop moves on to the next file
for file in files:
    try:
        s3.download_file(bucket_name, file, os.path.join(data_folder, file))
        print(file + " downloaded successfully")
    except Exception as e:
        #space before "error" so the file name and the message do not run together
        print(file + " error: ", e)

 

bengals.csv downloaded successfully
boyd_receiving.csv downloaded successfully
chase_receiving.csv downloaded successfully
higgins_receiving.csv downloaded successfully


In [268]:
#Use the pandas library to load each CSV into its own dataframe.
dataframes = {} #dictionary to hold one dataframe per player, plus the team schedule
player_files = [
    os.path.join(data_folder, file_name)
    for file_name in ('boyd_receiving.csv', 'chase_receiving.csv', 'higgins_receiving.csv')
]

#put each file into a pandas dataframe
for file in player_files:
    #"boyd_receiving.csv" -> "Boyd"; a separate local name is used so the
    #imported csv module is not overwritten
    file_name = os.path.basename(file)
    player_name = file_name.split('_')[0].capitalize()
    player_df = pd.read_csv(file)
    #add player name as a column
    player_df['Player'] = player_name
    dataframes[player_name] = player_df
    print(player_name + " dataframe created")

#test if that worked
print('****** Boyd df ****** \n ', dataframes['Boyd'].head())
print('****** Higgins df ****** \n ',dataframes['Higgins'].head())
print('****** Chase df ****** \n ',dataframes['Chase'].head())

#load bengals data into a pandas dataframe
#the key is spelled 'Bangals' on purpose: the merge cell looks it up under that name
dataframes['Bangals'] = pd.read_csv(os.path.join(data_folder, 'bengals.csv'))
print('****** Bengals df ****** \n ',dataframes['Bangals'].head())



Boyd dataframe created
Chase dataframe created
Higgins dataframe created
****** Boyd df ****** 
     Week  Yards  TD Player
0  REG1     32   0   Boyd
1  REG2     73   0   Boyd
2  REG3     36   1   Boyd
3  REG4    118   0   Boyd
4  REG5     24   0   Boyd
****** Higgins df ****** 
     Week  Yards  TD   Player
0  REG1     58   1  Higgins
1  REG2     60   1  Higgins
2  REG5     32   0  Higgins
3  REG6     44   0  Higgins
4  REG7     62   0  Higgins
****** Higgins df ****** 
     Week  Yards  TD Player
0  REG1    101   1  Chase
1  REG2     54   1  Chase
2  REG3     65   2  Chase
3  REG4     77   0  Chase
4  REG5    159   1  Chase
****** Bengals df ****** 
     Week Opponent Location  Result
0  PRE1       TB     Away     1.0
1  PRE2      WSH     Away     0.0
2  PRE3      MIA     Home     0.0
3  REG1      MIN     Home     1.0
4  REG2      CHI     Away     0.0


In [None]:
#Goal: one global table showing each receiver's yards and touchdown (TD) data
#next to the game information (Opponent, Location, Result) from bengals.csv,
#with the Result values 1.0 / 0.0 displayed as 'Win' / 'Loss'.

#stack the three receiver frames on top of each other
receiver_frames = [dataframes[player] for player in ('Boyd', 'Higgins', 'Chase')]
receivers = pd.concat(receiver_frames)

#attach the game info by week (merge defaults to an inner join), then relabel the result
result_labels = {1.0: 'Win', 0.0: 'Loss'}
df = receivers.merge(dataframes['Bangals'], on='Week').replace({'Result': result_labels})

#number the rows from 1, both as the index and as a real column so it ends up in the CSV
row_numbers = range(1, len(df) + 1)
df.index = row_numbers
df['Index'] = row_numbers
print(df)

#save the global dataframe to a csv file
df.to_csv('/Users/mady/mindex_data_analytics_challenge/data/global.csv', index=False)


     Week  Yards  TD   Player Opponent Location Result  Index
1    REG1     32   0     Boyd      MIN     Home    Win      1
2    REG2     73   0     Boyd      CHI     Away   Loss      2
3    REG3     36   1     Boyd      PIT     Away    Win      3
4    REG4    118   0     Boyd      JAX     Home    Win      4
5    REG5     24   0     Boyd       GB     Home   Loss      5
6    REG6      7   0     Boyd      DET     Away    Win      6
7    REG7     39   0     Boyd      BAL     Away    Win      7
8    REG8     69   1     Boyd      NYJ     Away   Loss      8
9    REG9     11   0     Boyd      CLE     Home   Loss      9
10  REG11     49   0     Boyd       LV     Away    Win     10
11  REG12     13   0     Boyd      PIT     Home    Win     11
12  REG13     85   0     Boyd      LAC     Home   Loss     12
13  REG14     55   0     Boyd       SF     Home   Loss     13
14  REG15     96   1     Boyd      DEN     Away    Win     14
15  REG16     85   1     Boyd      BAL     Home    Win     15
16  REG1

In [270]:
#remove any empty rows (csv have empty last row)
#dropna returns a new frame, so the result has to be assigned back to df
#NOTE: global.csv was already written in the previous cell, so this cleanup
#only affects the in-memory frame
df = df.replace('', pd.NA).dropna(how='any')
df

Unnamed: 0,Week,Yards,TD,Player,Opponent,Location,Result,Index
1,REG1,32,0,Boyd,MIN,Home,Win,1
2,REG2,73,0,Boyd,CHI,Away,Loss,2
3,REG3,36,1,Boyd,PIT,Away,Win,3
4,REG4,118,0,Boyd,JAX,Home,Win,4
5,REG5,24,0,Boyd,GB,Home,Loss,5
6,REG6,7,0,Boyd,DET,Away,Win,6
7,REG7,39,0,Boyd,BAL,Away,Win,7
8,REG8,69,1,Boyd,NYJ,Away,Loss,8
9,REG9,11,0,Boyd,CLE,Home,Loss,9
10,REG11,49,0,Boyd,LV,Away,Win,10


In [263]:
#get variables from the .env file
#NOTE(review): USERNAME and PASSWORD are very generic names; if the OS or shell
#already exports USERNAME, load_dotenv() keeps that value by default instead of
#the one in .env - confirm the expected user is picked up
db_host = os.getenv('DB_HOST')
db_username = os.getenv('USERNAME')
db_password = os.getenv('PASSWORD')
db_name = os.getenv('DB_NAME')
db_table_name = os.getenv('TABLE_NAME')
db_address = os.getenv('ADDRESS')

#fail early with a readable message instead of a TypeError from str + None below
required_settings = {
    'USERNAME': db_username,
    'PASSWORD': db_password,
    'DB_NAME': db_name,
    'ADDRESS': db_address,
}
missing_settings = [name for name, value in required_settings.items() if value is None]
if missing_settings:
    raise RuntimeError("missing settings in .env: " + ", ".join(missing_settings))

#db connection details
#note address is the IP address from terminal running nslookup and the host given
#because the host was not working to let me connect
db = {
    'dbname': db_name,
    'user': db_username,
    'password': db_password,
    'host': db_address,
    'port': '5432'
}

#user and password are URL-escaped so characters such as '@', ':' or '/' in them
#cannot be mistaken for URL separators; the port matches the one in the db dict
connection_string = (
    'postgresql+psycopg2://'
    + quote_plus(db_username) + ':' + quote_plus(db_password)
    + '@' + db_address + ':' + db['port']
    + '/' + db_name
)
#create SQLalchemy engine
engine = create_engine(connection_string)

#for debugging, prints my table
def print_my_table():
    """Print every row of the table named by ``db_table_name``.

    Opens a connection from the module-level ``engine``, reads the whole table
    into a DataFrame with pandas and prints it. Any exception is caught and
    printed instead of being raised.
    """
    try:
        with engine.connect() as db_conn:
            #pandas is given the raw DBAPI connection held by the SQLAlchemy connection
            table_df = pd.read_sql(
                f"SELECT * FROM {db_table_name};",
                con=db_conn.connection,
            )
            print("connection to DB successful, table to print below")
            print(table_df)
    except Exception as err:
        print("error connecting to PostgreSQL DB: ", err)

print_my_table()


connection to DB successful, table to print below
Empty DataFrame
Columns: []
Index: []


  mb = pd.read_sql(query, con=connection.connection)


In [None]:
#helper method
def execute_query_helper(query):
    """Run one SQL statement on a fresh connection and commit it.

    Args:
        query: SQL text, passed unchanged to ``cursor.execute``.

    Returns:
        None. Any exception (connecting, executing or committing) is printed
        as ``Error: ...`` rather than raised. The cursor and connection are
        closed in every case.
    """
    #bind both names before the try so the finally block can check them even
    #when psycopg2.connect() itself fails
    conn = None
    cursor = None
    try:
        conn = psycopg2.connect(**db)
        cursor = conn.cursor()
        cursor.execute(query)
        conn.commit()
    except Exception as e:
        print(f"Error: {e}")
    finally:
        #close connection
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

#add columns
#IF NOT EXISTS keeps this cell re-runnable: a second run leaves the existing
#columns alone instead of failing with "column already exists"
execute_query_helper("""  
        ALTER TABLE madison_banaszak
        ADD COLUMN IF NOT EXISTS Week TEXT,
        ADD COLUMN IF NOT EXISTS Yards INT,
        ADD COLUMN IF NOT EXISTS TD INT,
        ADD COLUMN IF NOT EXISTS Player TEXT,
        ADD COLUMN IF NOT EXISTS Opponent TEXT,
        ADD COLUMN IF NOT EXISTS Location TEXT,
        ADD COLUMN IF NOT EXISTS Result TEXT,
        ADD COLUMN IF NOT EXISTS Index INT;
                     """)
print_my_table()


connection to DB successful, table to print below
Empty DataFrame
Columns: [week, yards, td, player, opponent, location, result, index]
Index: []


  mb = pd.read_sql(query, con=connection.connection)


In [286]:
#delete all rows from table first
print("about to clean out table first before insert")
execute_query_helper(""" DELETE FROM madison_banaszak; """)
print_my_table()

#get connection to db
conn = psycopg2.connect(**db)
try:
    #the cursor and the file are both closed when the with block ends,
    #whether or not the copy succeeds
    with conn.cursor() as curr, open('/Users/mady/mindex_data_analytics_challenge/data/global.csv', 'r', newline='') as f:
        #load the file as real CSV (quoted fields are parsed correctly);
        #HEADER makes the server skip the column-name row
        curr.copy_expert(
            "COPY madison_banaszak FROM STDIN WITH (FORMAT csv, HEADER true)",
            f,
        )
    conn.commit()
    print("CSV file copied into table successfully :)")
finally:
    #closing without a commit discards a partial load, so a failed copy leaves no rows behind
    conn.close()

print_my_table()


about to clean out table first before insert


  mb = pd.read_sql(query, con=connection.connection)


connection to DB successful, table to print below
Empty DataFrame
Columns: [week, yards, td, player, opponent, location, result, index]
Index: []
CSV file copied into table successfully :)
connection to DB successful, table to print below
     week  yards  td   player opponent location result  index
0    REG1     32   0     Boyd      MIN     Home    Win      1
1    REG2     73   0     Boyd      CHI     Away   Loss      2
2    REG3     36   1     Boyd      PIT     Away    Win      3
3    REG4    118   0     Boyd      JAX     Home    Win      4
4    REG5     24   0     Boyd       GB     Home   Loss      5
5    REG6      7   0     Boyd      DET     Away    Win      6
6    REG7     39   0     Boyd      BAL     Away    Win      7
7    REG8     69   1     Boyd      NYJ     Away   Loss      8
8    REG9     11   0     Boyd      CLE     Home   Loss      9
9   REG11     49   0     Boyd       LV     Away    Win     10
10  REG12     13   0     Boyd      PIT     Home    Win     11
11  REG13     85 

  mb = pd.read_sql(query, con=connection.connection)
