In [50]:
import pandas as pd
import sklearn
import numpy as np

from sklearn.utils import shuffle
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

### Data ingestion for ML

In [51]:
df = pd.read_csv('data.csv', index_col=0)
df = df[['Reactions', 'Composure', 'Vision', 'ShortPassing', 'BallControl', 'Overall', 'Name']]
df = df.dropna()

df = shuffle(df, random_state=10)
print(df.head())

       Reactions  Composure  Vision  ShortPassing  BallControl  Overall  \
16133       46.0       48.0    42.0          55.0         64.0       58   
6285        66.0       66.0    63.0          61.0         66.0       69   
15884       55.0       51.0    47.0          53.0         55.0       58   
9156        56.0       55.0    37.0          59.0         49.0       66   
495         77.0       61.0    57.0          37.0         34.0       80   

              Name  
16133     A. Bakir  
6285   Jo√£o Victor  
15884     B. Singh  
9156      G. Milan  
495        J. Zoet  


### Train Test split

In [55]:
X = df.drop(['Overall', 'Name'], axis=1).values
#name = df['Name'].values
Y = df['Overall'].values
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0)

### Fit a Linear Regression Model

In [56]:
from sklearn import svm
reg = LinearRegression()
reg.fit(X_train, y_train)
reg.coef_


#y_pred = reg.predict(X_test)
#print('Accuracy of linear regression classifier on test set: {:.2f}'.format(reg.score(X_test, y_test)))

Accuracy of linear regression classifier on test set: 0.77


In [57]:
import pickle
filename='Regressor_model.sav'
pickle.dump(reg, open(filename, 'wb'))
load_lr_model =pickle.load(open(filename, 'rb'))
load_lr_model.coef_

array([ 0.50230703,  0.17814874, -0.00090128,  0.031064  , -0.0373743 ])

In [58]:
new_player = [95, 96, 94, 90, 96]
print(load_lr_model.predict([new_player]))

[89.07502619]


In [59]:
new_player = [66, 68, 64, 67, 70]

def calc_overall(new_player, model):
    return model.predict([new_player])

print(calc_overall(new_player, reg))

[69.80425594]


### Nearest Neighbour- full dataset

In [60]:
df = pd.read_csv('data.csv', index_col=0)
df = shuffle(df, random_state=10)

df = df[['Reactions', 'Composure', 'Vision', 'ShortPassing', 'BallControl', 'Name']]
df = df.dropna()

X = df.drop('Name', axis=1)
y = df['Name']

In [61]:
neigh = NearestNeighbors(n_neighbors=3)
neigh.fit(X)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=3, p=2, radius=1.0)

In [62]:
new_player = [95, 96, 94, 90, 96]
preds = neigh.kneighbors([new_player], 3, return_distance=False)

closest_players = [y.iloc[pred] for pred in preds[0]]
print(closest_players)

['L. Messi', 'David Silva', 'E. Hazard']
