# Lab 4 - Pitching Aggregation

## Setup

#### `pandas`

In [2]:
import pandas as pd
from dfply import *
pitching_raw = pd.read_csv('./data/baseball/core/Pitching.csv') 
pitching = (pitching_raw >>
             mutate(id = pitching_raw.index))
pitching.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,...,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP,id
0,bechtge01,1871,1,PH1,,1,2,3,3,2,...,7,,0,146.0,0,42,,,,0
1,brainas01,1871,1,WS3,,12,15,30,30,30,...,7,,0,1291.0,0,292,,,,1
2,fergubo01,1871,1,NY2,,0,0,1,0,0,...,2,,0,14.0,0,9,,,,2
3,fishech01,1871,1,RC1,,4,16,24,24,22,...,20,,0,1080.0,1,257,,,,3
4,fleetfr01,1871,1,NY2,,0,1,1,1,1,...,0,,0,57.0,0,21,,,,4


#### `sqlalchemy`

In [3]:
from sqlalchemy import create_engine
from sqlalchemy.ext.automap import automap_base
from sqlalchemy import select as select_sql

pitching_eng = create_engine("sqlite:///databases/baseball_2_5.db") 
Base = automap_base()
Base.prepare(pitching_eng, reflect=True)
Pitching = Base.classes.pitching
Pitching

sqlalchemy.ext.automap.pitching

In [4]:
stmt = select_sql('*').select_from(Pitching).limit(5)
pd.read_sql_query(stmt, con=pitching_eng)

Unnamed: 0,playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,...,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP,id
0,bechtge01,1871,1,PH1,,1,2,3,3,2,...,7,,0,146.0,0,42,,,,0
1,brainas01,1871,1,WS3,,12,15,30,30,30,...,7,,0,1291.0,0,292,,,,1
2,fergubo01,1871,1,NY2,,0,0,1,0,0,...,2,,0,14.0,0,9,,,,2
3,fishech01,1871,1,RC1,,4,16,24,24,22,...,20,,0,1080.0,1,257,,,,3
4,fleetfr01,1871,1,NY2,,0,1,1,1,1,...,0,,0,57.0,0,21,,,,4


#### `pyspark`

In [5]:
from pyspark.sql import SparkSession
from more_pyspark import get_spark_types, to_pandas

spark = SparkSession.builder.appName('Ops').getOrCreate()

schema = get_spark_types(pitching, keys=['id'])

pitching_spark = spark.createDataFrame(pitching, schema=schema)
(pitching_spark.
   take(5)) >> to_pandas

Unnamed: 0,playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,...,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP,id
0,bechtge01,1871,1,PH1,,1,2,3,3,2,...,7,,0,146.0,0,42,,,,0
1,brainas01,1871,1,WS3,,12,15,30,30,30,...,7,,0,1291.0,0,292,,,,1
2,fergubo01,1871,1,NY2,,0,0,1,0,0,...,2,,0,14.0,0,9,,,,2
3,fishech01,1871,1,RC1,,4,16,24,24,22,...,20,,0,1080.0,1,257,,,,3
4,fleetfr01,1871,1,NY2,,0,1,1,1,1,...,0,,0,57.0,0,21,,,,4


## Task 1

Compute and plot the average home runs per game allowed per year for all years in the pitching table.  To accomplish this task, you should

1. Aggregate total `HR` and `IPouts` for each year.
1. Create a `games_pitched` column, which is computed by dividing total inning pitched outs `IPouts` by 27, the number of outs in a game.
2. Create a `hr_per_game` column by dividing the total `HR` for each year by `games_pitched` for each year.
4. Make line plot of the results. You will need to create a new column `HR_per_inning` then perform the group and aggregate. You can use [seaborn's lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html) to make the graph.

Solve this problem in each framework.

## <font color="red"> Problem 1 </font>

Explain why we might want to total the home runs and outs pitched before dividing.

> *you want to make sure to take into account the amount of games because the more games the more homeruns*

## <font color="red"> Problem 2 </font>

Complete the above tasks using `pandas` and `dfply`

In [13]:
hr_per_year = (pitching 
                >>select(X.HR, X.IPouts, X.yearID)
                >>group_by(X.yearID)
                >>summarise(total_hr = X.HR.sum(),
                            total_outs = X.IPouts.sum())
                >>mutate(games_pitched = X.total_outs/27)
                >>mutate(hr_per_games = X.total_hr/X.games_pitched)
              )
hr_per_year

Unnamed: 0,yearID,total_hr,total_outs,games_pitched,hr_per_games
0,1871,47,6750,250.000000,0.188000
1,1872,37,9858,365.111111,0.101339
2,1873,47,10754,398.296296,0.118003
3,1874,40,12509,463.296296,0.086338
4,1875,40,18571,687.814815,0.058155
5,1876,40,14218,526.592593,0.075960
6,1877,24,9723,360.111111,0.066646
7,1878,23,9972,369.333333,0.062274
8,1879,58,17391,644.111111,0.090047
9,1880,62,18094,670.148148,0.092517


## <font color="red"> Problem 3 </font>

Complete the above tasks using `sqlalchemy`

In [71]:
from sqlalchemy.sql import select as select_sql
from sqlalchemy import func
from sqlparse import format
from sqlalchemy import cast, Float

In [70]:
totals = (select_sql([func.sum(Pitching.HR).label('total_hr'), 
                    func.sum(Pitching.IPouts).label('total_outs'),
                   (Pitching.yearID)])
     .group_by(Pitching.yearID)
     .alias('tw_table'))
hr_per_year = (select_sql([((cast((totals.c.total_hr),Float)/(totals.c.total_outs/27)).label('hr_per_year')),
                          totals.c.yearID]))                 
pd.read_sql_query(hr_per_year, con=pitching_eng)

Unnamed: 0,hr_per_year,yearID
0,0.188000,1871
1,0.101370,1872
2,0.118090,1873
3,0.086393,1874
4,0.058224,1875
5,0.076046,1876
6,0.066667,1877
7,0.062331,1878
8,0.090062,1879
9,0.092537,1880


## <font color="red"> Problem 4 </font>

Complete the above tasks using `pyspark`

In [67]:
import pyspark.sql.functions as fn
hrs= fn.sum(pitching_spark.HR).alias('total_hrs')
outs= fn.sum(pitching_spark.IPouts).alias('total_out')
totals = (pitching_spark
             .select([pitching_spark.yearID, 
                      pitching_spark.HR,
                      pitching_spark.IPouts])
             .groupby(pitching_spark.yearID)
             .agg((hrs/(outs/27)).alias('hr_per_game'))
             .orderBy(pitching_spark.yearID, ascending=True)
             .collect()
        ) >> to_pandas
totals

Unnamed: 0,yearID,hr_per_game
0,1871,0.188000
1,1872,0.101339
2,1873,0.118003
3,1874,0.086338
4,1875,0.058155
5,1876,0.075960
6,1877,0.066646
7,1878,0.062274
8,1879,0.090047
9,1880,0.092517


## <font color="blue"> Key for Problem 4</font>

## Task 2

For each year, determine the team that had the most home runs (using the `Pitching.csv` file).

## <font color="red"> Problem 5 </font>

Solve **Task 2** with the framework of your choice.

In [66]:
top_team_hrs = (pitching 
               >>select(X.HR, X.IPouts, X.yearID, X.teamID)
               >>group_by(X.yearID, X.teamID)
               >>summarise(total_hr = X.HR.sum())
               >>ungroup
               >> group_by(X.yearID)
               >>filter_by(X.total_hr == colmax(X.total_hr))
              )
top_team_hrs

Unnamed: 0,teamID,yearID,total_hr
2,CL1,1871,13
10,BR1,1872,7
28,WS5,1873,11
30,BR2,1874,15
37,BR2,1875,6
45,PH2,1875,6
49,WS6,1875,6
52,CN1,1876,9
59,CHN,1877,7
64,BSN,1878,6
