### Importing Packages

In [121]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from sklearn import svm
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import random

In [123]:
# Load the D3 softball spreadsheet (first column becomes the index), turn the
# '---' placeholder cells into NaN, and drop every row that has a missing value.
df = (
    pd.read_excel(
        'C:/Users/aceme/OneDrive/Documents/SIAM Simons Summer Opportunity/Datasets/D3Softball.xlsx',
        index_col=[0],
    )
    .replace('---', np.nan)
    .dropna()
)

### Splitting Data

In [126]:
# Split the dataset into training and testing data.
# Features are every column except the last one; the target is the last column.
# test_size: fraction of rows held out for testing (0.4, i.e. 40%), so the
#            remaining 60% is used for training.
# random_state: fixed seed so the same split is produced on every run.
# NOTE(review): this cell runs before the "Data cleaning" cell, so the frames
# returned here presumably do not receive the numeric conversion applied to
# `df` later - confirm the intended cell order.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1],
    df.iloc[:, -1],
    test_size=0.4,
    random_state=52,
)

### Data cleaning

In [129]:
# Names of the statistic columns that must hold numeric values.
columns_to_convert = [
    'G', 'AB', 'H (Offensive)', 'H/G', 'BA',
    '2B (Offensive)', '2B/G', '3B (Offensive)', '3B/G',
    'Innings Pitched', 'K (Def)', 'K/G (Def)',
    'BB Allowed', 'BB/G (Def)', 'K/BB',
    'HA', 'HA/G', 'Runs Allowed', 'Runs Allowed/G',
    'ER Allowed', 'ERA', 'WHIP',
    'PO', 'A', 'E', 'E/G', 'FPCT',
    'HBP (Offensive)', 'HBP/G', 'BB (Offensive)', 'BB/G (Off)',
    'SF (Offensive)', 'SH (Offensive)', 'OBP',
    'SHO', 'SHO %', 'SB', 'SB/G',
    'TB', 'TB/G', 'SLG PCT', 'R', 'R/G',
    'DP', 'DP/G', 'K (Off)', 'K/G (Off)',
]

# Parse each listed column as numbers; with errors='coerce' any value that
# cannot be parsed becomes NaN instead of raising.
df[columns_to_convert] = df[columns_to_convert].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)

# FIC

In [132]:
class FeatureClassifier:
    """Ensemble of one-feature classifiers whose votes are weighted by chi-squared score.

    ``fit`` trains one estimator per input column, scores every column with
    ``SelectKBest(score_func=chi2)`` and keeps the columns whose score is at
    least ``reqAcc``. ``predict`` lets every kept estimator vote for a label;
    a vote is worth the column's chi-squared score plus the bias of the voted
    label, and the label with the largest total wins.

    Predicted labels are used directly as indices into the vote array, so the
    labels are expected to be the integers ``0 .. num_lables - 1``.

    Parameters
    ----------
    reqAcc : float
        Cut-off compared against each feature's chi-squared score; features
        scoring below it are discarded.
    classifier : str
        Key of ``self.dic`` selecting the per-feature estimator:
        'DecisionTree', 'LinearRegression', 'SVM' or 'LogisticRegression'.
    bias : sequence of float, optional
        One additive vote bias per label. Empty/None means no bias.
    control :
        Forwarded as ``max_depth`` (DecisionTree), ``gamma`` (SVM) or
        ``penalty`` (LogisticRegression); not used by LinearRegression.
    n_jobs :
        Forwarded to ``LinearRegression``.
    random_state :
        Forwarded to the estimators that accept it and to the internal
        ``train_test_split``.
    """

    def __init__(self, reqAcc=0.01, classifier='DecisionTree', bias=None, control=None, n_jobs=None, random_state=None):
        self.featureClassifiers = []  # (estimator, chi2 score, column index), best score first
        self.reqAcc = reqAcc  # user specified cut-off value
        self.indexLs = []  # column index of each kept entry in featureClassifiers
        self.flag = 0  # 1 when every feature passed the cut-off, else 0
        # A fresh list per instance: a shared mutable default would leak
        # between instances.
        self.bias = [] if bias is None else bias
        self.control = control  # over-fitting control forwarded to the estimator
        self.classifier = classifier  # name of the estimator preferred by the user
        # maps the estimator name to its index
        self.dic = {'DecisionTree': 0, 'LinearRegression': 1, 'SVM': 2, 'LogisticRegression': 3}
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.num_lables = None  # number of distinct labels, set by fit()

    def _make_classifier(self):
        """Build a fresh, unfitted estimator of the configured kind.

        Raises KeyError when ``self.classifier`` is not a key of ``self.dic``.
        """
        kind = self.dic[self.classifier]
        if kind == 0:
            return DecisionTreeClassifier(max_depth=self.control, random_state=self.random_state)
        if kind == 1:
            return LinearRegression(n_jobs=self.n_jobs)
        if kind == 2:
            return SVC(gamma=self.control, random_state=self.random_state)
        return LogisticRegression(penalty=self.control, random_state=self.random_state)

    def finIndex(self):
        """Rebuild ``indexLs`` and locate the first entry below the cut-off.

        ``featureClassifiers`` must already be sorted by descending score.
        Returns the position of the first entry whose score is below
        ``reqAcc`` (``flag`` stays 0). When every entry passes, ``flag`` is
        set to 1 and the last position is returned (0 for an empty list).
        """
        # Start from a clean mapping so repeated calls do not duplicate entries.
        self.indexLs = []
        self.flag = 0
        for position, (_, score, column) in enumerate(self.featureClassifiers):
            if score < self.reqAcc:
                return position
            self.indexLs.append(column)
        self.flag = 1
        return max(len(self.featureClassifiers) - 1, 0)

    def fit(self, x, y):
        """Train one estimator per column of ``x`` and keep the well-scoring ones.

        ``x`` is a 2-D array-like of features (chi2 scoring requires
        non-negative values); ``y`` holds the labels and may be 1-D or a
        single column. Any state from a previous fit is discarded.
        Returns None.
        """
        x = np.asarray(x)
        y = np.asarray(y).ravel()

        self.featureClassifiers = []
        self.num_lables = len(np.unique(y))
        scores = SelectKBest(score_func=chi2, k=1).fit(x, y).scores_

        # The split does not depend on the feature, so do it once. Only the
        # training part is used; the held-out third is never evaluated here.
        X_train, _, y_train, _ = train_test_split(
            x, y, test_size=0.33, random_state=self.random_state
        )

        for column in range(x.shape[1]):
            clf = self._make_classifier()
            clf.fit(X_train[:, column:column + 1], y_train)
            self.featureClassifiers.append((clf, scores[column], column))

        self.featureClassifiers.sort(key=lambda entry: entry[1], reverse=True)
        index = self.finIndex()
        if self.flag == 0:
            self.featureClassifiers = self.featureClassifiers[:index]
        return

    def predict(self, x):
        """Predict a label for every row of ``x`` by weighted voting.

        Returns a list with one label index per row. Raises AttributeError
        when the model has not been fitted or when ``bias`` has fewer entries
        than there are labels.
        """
        if self.num_lables is None:
            raise AttributeError('predict called before fit')

        # Use a local bias array instead of overwriting self.bias, so the
        # emptiness check keeps working on later calls.
        if self.bias is None or len(self.bias) == 0:
            bias = np.zeros(self.num_lables)
        else:
            bias = np.asarray(self.bias, dtype=float)
        if len(bias) < self.num_lables:
            raise AttributeError('Please check the lenth of bias list')

        x = np.asarray(x)
        n_samples = len(x)
        if n_samples == 0:
            return []

        votes = np.zeros((n_samples, self.num_lables))
        rows = np.arange(n_samples)
        for (clf, score, _), column in zip(self.featureClassifiers, self.indexLs):
            # One batched call per feature instead of one call per sample.
            raw = np.asarray(clf.predict(x[:, column:column + 1]), dtype=float).ravel()
            # Regressors return floats that may fall outside the label range:
            # round, clamp to a valid label and convert to integer indices.
            labels = np.clip(np.round(raw), 0, self.num_lables - 1).astype(int)
            votes[rows, labels] += score + bias[labels]
        return list(np.argmax(votes, axis=1))

# ACCURACY

In [135]:
# Train a FeatureClassifier built from depth-limited decision trees on the
# training set; the labels are passed as an integer column vector.
clf1 = FeatureClassifier(reqAcc=0, classifier='DecisionTree', control=3, random_state=52)
clf1.fit(
    np.array(X_train),
    np.array(y_train).astype(int).reshape(-1, 1),
)

# Predict the test set and report the accuracy (share of correct predictions).
print(
    "Accuracy:",
    accuracy_score(np.array(y_test).astype(int), clf1.predict(np.array(X_test))),
)

# Show the column indices of the kept features, ordered by descending score.
print(clf1.indexLs)
# Bare expression in the middle of the cell: evaluated, but nothing is displayed.
clf1.featureClassifiers
print(clf1.indexLs)

Accuracy: 0.9745222929936306
[38, 41, 1, 2, 22, 36, 10, 29, 17, 34, 27, 5, 19, 9, 23, 24, 31, 15, 0, 7, 43, 18, 45, 42, 20, 16, 39, 14, 37, 46, 25, 35, 3, 28, 30, 11, 13, 12, 21, 6, 32, 40, 8, 33, 4, 44, 26]
[38, 41, 1, 2, 22, 36, 10, 29, 17, 34, 27, 5, 19, 9, 23, 24, 31, 15, 0, 7, 43, 18, 45, 42, 20, 16, 39, 14, 37, 46, 25, 35, 3, 28, 30, 11, 13, 12, 21, 6, 32, 40, 8, 33, 4, 44, 26]


# F1-SCORES

In [137]:
# Re-create and re-train the decision-tree FeatureClassifier, this time with
# the labels passed as a flat integer array.
clf1 = FeatureClassifier(reqAcc=0, classifier='DecisionTree', control=3, random_state=52)
clf1.fit(
    np.array(X_train),
    np.array(y_train).astype(int),
)

# Predicted labels for the held-out test rows.
y_pred = clf1.predict(np.array(X_test))

# F1 score of the predictions against the true integer test labels.
f1 = metrics.f1_score(np.array(y_test).astype(int), y_pred)
print("F1 Score:", f1)

# Column indices of the kept features, useful when debugging the selection.
print(clf1.indexLs)

F1 Score: 0.5
[38, 41, 1, 2, 22, 36, 10, 29, 17, 34, 27, 5, 19, 9, 23, 24, 31, 15, 0, 7, 43, 18, 45, 42, 20, 16, 39, 14, 37, 46, 25, 35, 3, 28, 30, 11, 13, 12, 21, 6, 32, 40, 8, 33, 4, 44, 26]
