# Classifying MLB Free Agents

Now we're going to actually try classifying. We'll bring in the final data (which may change from time to time), format it correctly, and then try some machine-learning models.

In [39]:
# Bring in data
import pandas as pd
import pickle

# Read the prepared dataset back from disk.
# NOTE: pickle.load can execute arbitrary code while deserializing -- only open a
# pickle that was produced by this project.
with open('final_data.pickle', mode='rb') as pickle_fh:
    final_data = pickle.load(pickle_fh)

In [41]:
# Features: drop the identifier/descriptive columns (names, player and year IDs,
# position, origin) and also 'Destination' itself -- it is the label being
# predicted, so leaving it in X would leak the answer into the features.
X = final_data.drop(['playerID', 'nameFirst', 'nameLast', 'name',
                     'Origin', 'Position', 'yearID', 'Destination'], axis = 1).values

# Target: the 'Destination' team code for each row
y = final_data['Destination'].values


# Split the data
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. stratify=y keeps each team's share the
# same in both parts; the fixed random_state makes the split -- and therefore every
# score computed from it -- repeatable after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, stratify = y, random_state = 42)

## Make the naive method

Classify every player as going to the most common team: the Yankees.

In [72]:
# Size of the largest class, then the total number of rows.
# Only the final expression of a cell is echoed, so the largest-class count on the
# first line is evaluated but never shown; the row count is what gets displayed.
destination_col = final_data['Destination']
destination_col.value_counts().max()
len(destination_col)

1897

In [73]:
# Accuracy obtained by always predicting the single most frequent destination,
# measured over the whole dataset.
destination_counts = final_data['Destination'].value_counts()
most_freq = float(destination_counts.max())          # rows in the largest class
total_freq = float(len(final_data['Destination']))   # all rows

# Both operands are floats, so this is true division on any Python version
naive_accuracy = most_freq / total_freq
print("Naive Accuracy = {}".format(naive_accuracy))

Naive Accuracy = 0.04533473906167633


## Attempt 1: K-Nearest Neighbors

This is probably the simplest approach; let's see how it works

In [61]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Build a 6-neighbor k-NN classifier and train it in one step; fit() hands back the
# estimator itself, so `knn` ends up bound to the fitted model.
# NOTE(review): k-NN is distance based -- confirm the feature columns are on
# comparable scales before trusting these results.
knn = KNeighborsClassifier(n_neighbors = 6).fit(X_train, y_train)

# Echo the fitted estimator and its parameters
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='uniform')

In [62]:
# Predict the held-out rows up front, then report overall accuracy followed by
# the per-team precision / recall / F1 table.
y_pred = knn.predict(X_test)

test_accuracy = knn.score(X_test, y_test)
print(test_accuracy)
print(classification_report(y_test, y_pred))

0.0421052631579
             precision    recall  f1-score   support

        ARI       0.02      0.08      0.03        12
        ATL       0.00      0.00      0.00        12
        BAL       0.00      0.00      0.00        14
        BOS       0.04      0.08      0.06        13
        CHA       0.14      0.27      0.18        11
        CHN       0.04      0.07      0.05        14
        CIN       0.00      0.00      0.00        12
        CLE       0.00      0.00      0.00        13
        COL       0.10      0.12      0.11        16
        DET       0.00      0.00      0.00        12
        HOU       0.17      0.09      0.12        11
        KCA       0.15      0.20      0.17        10
        LAA       0.00      0.00      0.00         7
        LAN       0.06      0.06      0.06        17
        MIA       0.00      0.00      0.00        11
        MIL       0.00      0.00      0.00        12
        MIN       0.00      0.00      0.00         9
        NYA       0.00      0

  'precision', 'predicted', average, warn_for)


In [67]:
# Get actual probabilities: one row per test sample, one column per class
# (columns are in the order given by knn.classes_).
blah = knn.predict_proba(X_test)

# A bare assignment displays nothing, so show the result explicitly -- and only
# the first few rows, rather than dumping the entire array into the notebook.
blah[:5]

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.33333333,  0.        ,  0.        ,  0.        ,  0.16666667,
         0.        ,  0.16666667,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.16666667,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.16666667,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.16666667,  0.        ,
         0.        ,  0.        ,  0.        ,  0.16666667,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.16666667,
         0.16666667,  0.        ,  0.        ,  0.33333333,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.16666667,  0.16666667,  0.        ,  0

## Attempt 2: Random Forest Classifier

In [68]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with otherwise-default hyperparameters. random_state is pinned
# because the forest's bootstrap sampling and per-split feature selection are
# random -- without it the score changes on every run.
rf = RandomForestClassifier(random_state = 42)

# Fit the classifier to the data
rf.fit(X_train, y_train)

# Overall accuracy on the held-out set
print(rf.score(X_test, y_test))

# NOTE: this rebinds y_pred, replacing the k-NN predictions made in the earlier cell
y_pred = rf.predict(X_test)

# Per-team precision / recall / F1
print(classification_report(y_test, y_pred))

0.0184210526316
             precision    recall  f1-score   support

        ARI       0.00      0.00      0.00        12
        ATL       0.00      0.00      0.00        12
        BAL       0.00      0.00      0.00        14
        BOS       0.14      0.15      0.15        13
        CHA       0.00      0.00      0.00        11
        CHN       0.00      0.00      0.00        14
        CIN       0.00      0.00      0.00        12
        CLE       0.00      0.00      0.00        13
        COL       0.00      0.00      0.00        16
        DET       0.05      0.08      0.06        12
        HOU       0.00      0.00      0.00        11
        KCA       0.10      0.10      0.10        10
        LAA       0.00      0.00      0.00         7
        LAN       0.00      0.00      0.00        17
        MIA       0.00      0.00      0.00        11
        MIL       0.00      0.00      0.00        12
        MIN       0.00      0.00      0.00         9
        NYA       0.08      0