# Baseball Predictions

In [42]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [43]:
# Columns whose values are capped at CAP_VALUE in both frames.
cleaned = ['hsera', 'asera', 'hera', 'aera']
# Any value above this in a `cleaned` column is replaced by this value.
CAP_VALUE = 18

data_path = 'raw/train.csv'
data = pd.read_csv(data_path)

testing_path = 'raw/data21.csv'
testing = pd.read_csv(testing_path)

# Apply the same cap to every listed column of each frame. The boolean mask is
# built from the frame being modified, so the two frames never need to share
# an index or a row count, and `testing` gets all four columns capped rather
# than only the last one.
for frame in (data, testing):
    for item in cleaned:
        frame.loc[frame[item] > CAP_VALUE, item] = CAP_VALUE

data.describe()

Unnamed: 0,hrs,hra,hsera,hr,hera,ars,ara,asera,ar,aera,winner
count,4027.0,4027.0,4027.0,4027.0,4027.0,4027.0,4027.0,4027.0,4027.0,4027.0,4027.0
mean,4.640405,4.628354,4.459668,1.066056,4.30154,4.632575,4.635302,4.411378,1.064336,4.288438,0.535138
std,0.512721,0.572279,1.79736,0.380181,1.478321,0.512028,0.570607,1.703785,0.377345,1.475169,0.498826
min,3.41,3.04,0.0,0.369231,0.0,3.43,3.04,0.0,0.375,0.0,0.0
25%,4.3,4.19,3.323864,0.805195,3.44,4.31,4.2,3.299255,0.803922,3.44,0.0
50%,4.65,4.62,4.22449,1.016393,4.16,4.64,4.64,4.215886,1.016129,4.13,1.0
75%,4.98,5.06,5.329428,1.254237,4.91,4.97,5.07,5.302115,1.254391,4.9,1.0
max,6.03,6.45,18.0,2.527778,18.0,6.0,6.42,18.0,2.558824,18.0,1.0


In [44]:
class DataFrameColumnTransformer(TransformerMixin):
    """Wrap a ``ColumnTransformer`` so that ``transform`` returns a DataFrame.

    The wrapped transformer is built from ``stages`` and stored on
    ``self.col_trans``. Output columns are labelled with the wrapped
    transformer's feature names, with the ``"<stage>__"`` prefix removed.
    """

    def __init__(self, stages):
        """Build the wrapped ``ColumnTransformer``.

        Args:
            stages: Value passed unchanged as the first argument of
                ``ColumnTransformer``.
        """
        self.col_trans = ColumnTransformer(stages)

    def fit(self, X: pd.DataFrame, y=None):
        """Fit the wrapped ``ColumnTransformer``.

        Args:
            X: Input data to fit on.
            y: Optional targets, forwarded to the wrapped transformer. Accepting
                it lets ``fit_transform(X, y)`` and pipelines that pass targets
                call this method without a ``TypeError``.

        Returns:
            ``self``, so calls can be chained.
        """
        self.col_trans.fit(X, y)
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Transform ``X`` and return the result as a DataFrame.

        The row index of ``X`` is carried over to the result when ``X`` has
        one; otherwise the result gets a default index.
        """
        output_arr = self.col_trans.transform(X)

        return self.to_dataframe(output_arr, index=getattr(X, 'index', None))

    def to_dataframe(self, arr: np.ndarray, index=None) -> pd.DataFrame:
        """Convert the output of the wrapped transformer into a DataFrame.

        Args:
            arr: Array returned by ``self.col_trans.transform``.
            index: Optional row index for the result. ``None`` (the default)
                gives a default index.

        Returns:
            A DataFrame holding ``arr`` with cleaned-up column names.
        """
        # ColumnTransformer reports names as "<stage>__<column>"; keep only the
        # part after the first "__". Names without "__" come back unchanged.
        feature_names = [
            name.split('__', 1)[-1]
            for name in self.col_trans.get_feature_names_out()
        ]

        return pd.DataFrame(arr, columns=feature_names, index=index)

In [45]:
class LogTransformer(BaseEstimator, TransformerMixin):
    """Transformer that applies ``np.log1p`` to its input.

    Nothing is learned from the data; the only state kept is the column labels
    of the most recent input, exposed through ``get_feature_names_out``.
    """

    def __init__(self):
        # Column labels of the last DataFrame seen by fit/transform, or None
        # if no labelled input has been seen yet.
        self.feature_names = None

    def _remember_columns(self, X) -> None:
        """Store ``X.columns`` on ``self.feature_names`` when ``X`` has columns."""
        columns = getattr(X, 'columns', None)
        if columns is not None:
            self.feature_names = columns

    def fit(self,
            X: pd.DataFrame,
            y: pd.DataFrame = None):
        """Record the input's column labels and return ``self``.

        Recording the labels here means ``get_feature_names_out`` already works
        after ``fit``, without needing a ``transform`` call first.
        Pipeline-style callers rely on ``self`` being returned.
        """
        self._remember_columns(X)
        return self

    def transform(self,
                  X: pd.DataFrame,
                  y: pd.DataFrame = None) -> pd.DataFrame:
        """Return ``np.log1p(X)``.

        The column labels of ``X`` are recorded when present, so calling
        ``transform`` without a prior ``fit`` still populates
        ``feature_names``. Inputs without a ``columns`` attribute are accepted.
        """
        self._remember_columns(X)

        return np.log1p(X)

    def get_feature_names_out(self, names=None):
        """Return the output column names as a list.

        Args:
            names: Fallback names, used only when no column labels have been
                recorded from an input yet.

        Raises:
            TypeError: If no labels were recorded and ``names`` is ``None``.
        """
        source = self.feature_names if self.feature_names is not None else names
        if source is None:
            raise TypeError(
                "LogTransformer has no feature names yet: call fit() or "
                "transform() with a DataFrame, or pass `names`."
            )
        return list(source)

In [46]:
def before_pipeline(data):
    """Log-transform four columns of ``data`` and pass the rest through.

    Columns 'hera', 'aera', 'hsera' and 'asera' are routed through
    ``LogTransformer``; every other column of ``data`` is passed through
    unchanged. A fresh ``DataFrameColumnTransformer`` is fitted on ``data`` and
    applied to it in a single step.

    Args:
        data: DataFrame containing at least the four columns named above.

    Returns:
        The result of ``fit_transform``: passthrough columns first, followed
        by the four log-transformed columns.
    """
    log_cols = ['hera', 'aera', 'hsera', 'asera']
    # Everything that is not log-transformed is forwarded as-is.
    untouched_cols = data.drop(columns=log_cols).columns

    transformer = DataFrameColumnTransformer([
        ('pass', 'passthrough', untouched_cols),
        ('log', LogTransformer(), log_cols),
    ])

    return transformer.fit_transform(data)

In [47]:
# Run both frames through the same column preparation.
# NOTE: `data` and `testing` are rebound to the transformed frames, so
# executing this cell a second time would apply the transform again.
data = before_pipeline(data)
testing = before_pipeline(testing)

# Predictor columns handed to the models, in this order.
features = ['hrs', 'hra', 'hsera', 'hr', 'hera', 'ars', 'ara', 'asera', 'ar', 'aera']

# Training set: predictors plus the `winner` target we want to predict.
train_X = data[features]
train_y = data['winner']

# Validation set, laid out the same way as the training set.
val_X = testing[features]
val_y = testing['winner']

In [48]:
# Baseline: logistic regression fitted on the prepared training features.
lr = LogisticRegression(random_state=41)
lr.fit(train_X, train_y)

# Predict on the validation features and report accuracy against `val_y`.
lr_preds = lr.predict(val_X)
print("Logistic Regression: ", accuracy_score(val_y, lr_preds), sep="\n")

Logistic Regression: 
0.5732531930879038


In [49]:
# Logistic regression preceded by a StandardScaler step, fitted as one pipeline.
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(train_X, train_y)

# Predict on the validation features and report accuracy against `val_y`.
p_preds = pipe.predict(val_X)
print("Logistic Regression w/ Pipeline: ", accuracy_score(val_y, p_preds), sep="\n")

Logistic Regression w/ Pipeline: 
0.5740045078888054
