In [1]:
import pandas as pd
import sklearn
import numpy as np

from sklearn.utils import shuffle
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import NearestNeighbors

### Data ingestion for ML

In [2]:
# Load the player table, keep the five attribute columns plus the target
# ('Overall') and the player name, and discard rows with missing values.
df = (
    pd.read_csv('data.csv', index_col=0)
    .loc[:, ['Reactions', 'Composure', 'Vision', 'ShortPassing', 'BallControl', 'Overall', 'Name']]
    .dropna()
)

# Fixed seed so the shuffled row order is reproducible between runs.
df = shuffle(df, random_state=10)
print(df.head(n=5))

       Reactions  Composure  Vision  ShortPassing  BallControl  Overall  \
16133       46.0       48.0    42.0          55.0         64.0       58   
6285        66.0       66.0    63.0          61.0         66.0       69   
15884       55.0       51.0    47.0          53.0         55.0       58   
9156        56.0       55.0    37.0          59.0         49.0       66   
495         77.0       61.0    57.0          37.0         34.0       80   

              Name  
16133     A. Bakir  
6285   João Victor  
15884     B. Singh  
9156      G. Milan  
495        J. Zoet  


### Train/Test Split

In [3]:
def split_df(df, train_frac=0.7):
    """Split a player DataFrame into train/test feature and target arrays.

    The split is positional: the first ``train_frac`` share of the rows (in
    their current order) becomes the training set and the remaining rows the
    test set. Nothing is shuffled here, so shuffle ``df`` beforehand if a
    random split is wanted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'Overall' column (used as the target) and a 'Name'
        column (dropped). Every other column is used as a feature.
    train_frac : float, default 0.7
        Fraction of rows, between 0 and 1 inclusive, assigned to training.
        The row count is truncated with ``int(len(df) * train_frac)``.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature and target arrays taken from ``.values``.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside the range [0, 1].
    """
    if not 0.0 <= train_frac <= 1.0:
        raise ValueError(
            "train_frac must be between 0 and 1, got {!r}".format(train_frac)
        )

    X = df.drop(columns=['Overall', 'Name']).values
    y = df['Overall'].values

    # Index of the first test row; everything before it is training data.
    split = int(len(X) * train_frac)
    return X[:split], X[split:], y[:split], y[split:]

# Positional 70/30 split of the shuffled frame.
X_train, X_test, y_train, y_test = split_df(df)

# Show the training features followed by the training targets.
print(X_train, y_train, sep='\n')

[[46. 48. 42. 55. 64.]
 [66. 66. 63. 61. 66.]
 [55. 51. 47. 53. 55.]
 ...
 [55. 67. 64. 67. 69.]
 [60. 50. 50. 65. 60.]
 [66. 61. 69. 78. 78.]]
[58 69 58 ... 65 67 72]


### Fit a Linear Regression Model

In [4]:
# Fit a linear regression of 'Overall' on the attribute columns, training rows only.
reg = LinearRegression()
reg.fit(X_train, y_train);  # trailing ';' keeps the cell output empty

In [5]:
# Attribute values in training-column order:
# Reactions, Composure, Vision, ShortPassing, BallControl.
new_player = [95, 96, 94, 90, 96]

# predict() expects a batch of rows, so the single player is wrapped in a list.
print(reg.predict(X=[new_player]))

[89.19505682]


In [6]:
# Attribute values in training-column order:
# Reactions, Composure, Vision, ShortPassing, BallControl.
new_player = [66, 68, 64, 67, 70]

def calc_overall(new_player, model):
    """Return ``model``'s prediction for a single player's attribute vector.

    ``new_player`` is wrapped in a one-element list so the model receives a
    batch containing exactly one row. Whatever ``model.predict`` returns for
    that batch is passed back unchanged.
    """
    single_row_batch = [new_player]
    return model.predict(single_row_batch)

# Print the regression's prediction for new_player (predict returns one value per input row).
print(calc_overall(new_player, reg))

[69.8491762]


### Nearest Neighbours: full dataset

In [7]:
# Reload the full table for the nearest-neighbour search; 'Overall' is not
# needed here, only the five attributes and the player name.
df = (
    shuffle(pd.read_csv('data.csv', index_col=0), random_state=10)
    .loc[:, ['Reactions', 'Composure', 'Vision', 'ShortPassing', 'BallControl', 'Name']]
    .dropna()
)

# X holds the attribute columns; y holds the names, row-aligned with X.
X = df.drop(columns='Name')
y = df.loc[:, 'Name']

In [8]:
# Build the 3-nearest-neighbour index over the attribute table.
# fit() hands back the estimator, so the bare name below displays the same repr.
neigh = NearestNeighbors(n_neighbors=3).fit(X)
neigh

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                 radius=1.0)

In [9]:
# Attribute values in the column order of X:
# Reactions, Composure, Vision, ShortPassing, BallControl.
new_player = [95, 96, 94, 90, 96]

# `neigh` was fitted on the DataFrame X. Querying it with a bare list makes
# recent scikit-learn versions warn about missing feature names, so the query
# is wrapped in a one-row DataFrame carrying the same columns.
query = pd.DataFrame([new_player], columns=X.columns)
preds = neigh.kneighbors(query, 3, return_distance=False)

# preds has one row of positional indices per query point; take the names for
# the single query in one positional lookup.
closest_players = y.iloc[preds[0]].tolist()
print(closest_players)

['L. Messi', 'David Silva', 'E. Hazard']
