# IMPORTS

**Libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
plt.style.use('ggplot')

**Loading the data into a dataframe**

In [None]:
# Read the exported query result into a DataFrame (comma is read_csv's default separator).
csv_path = "../input/nba-general-data/nbalogreg-logistic-regression-exercise-1-QueryResult.csv"
df = pd.read_csv(csv_path)
# Optional mapping of the numeric target to readable labels; left disabled so the target stays numeric.
#df.target_5yrs = df.target_5yrs.map({0.0:"career length < 5 years", 1.0:"career length >= 5 years"})
# Print column names, dtypes and non-null counts.
df.info()

# EDA

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Pairwise correlation matrix of the numeric columns.
# The numeric/bool columns are selected explicitly because, from pandas 2.0 on,
# DataFrame.corr() no longer skips non-numeric columns silently and raises on
# them instead (e.g. a text column such as a player name, if present).
corr = df.select_dtypes(include=["number", "bool"]).corr()
corr

In [None]:
# Single wide figure; with one subplot, `axes` is a single Axes object.
f, axes = plt.subplots(figsize=(17, 6))

# Bar chart of the number of rows per value of the target column.
sns.countplot(data=df, x='target_5yrs', ax=axes)

We see here that the dataset is slightly imbalanced: there are somewhat more players whose career lasted at least 5 years than players whose career was shorter.

In [None]:
# Draw the correlation matrix `corr` as a colour-coded grid.
sns.heatmap(corr)

Here we see that the correlation coefficients between target_5yrs and the other variables don't tell us much, because they are all quite close to one another. They are all positive, so higher values of these statistics are associated with a longer career, but no variable really stands out from the others. The variable with the highest correlation coefficient with target_5yrs is gp, the number of games played.

With this in mind, we will determine the variables to favor by studying the stripplots of the different variables with target_5yrs.

In [None]:
# One strip plot per statistic, split by the career-length target.
# The original drew 18 plots on a 5x3 grid and used row 3 twice, so the
# ftm/fta/ft plots were drawn on top of 3p_made/3pa/3p. The grid is now sized
# from the number of statistics, and every statistic gets its own axis.
stat_columns = [
    'pts', 'min', 'reb',
    'ast', 'blk', 'tov',
    'gp', 'fgm', 'fg',
    '3p_made', '3pa', '3p',
    'ftm', 'fta', 'ft',
    'oreb', 'dreb', 'stl',
]
n_cols = 3
n_rows = -(-len(stat_columns) // n_cols)  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 3 * n_rows))

fig.suptitle('Répartition des variables en fonction de leur durée de carrière')

flat_axes = axes.ravel()
for ax, stat in zip(flat_axes, stat_columns):
    sns.stripplot(ax=ax, data=df, x='target_5yrs', y=stat)

# Hide any leftover axes when the number of statistics does not fill the grid.
for ax in flat_axes[len(stat_columns):]:
    ax.set_visible(False)

Here we can clearly see that eleven variables are really relevant for identifying a player whose career will last more than 5 years: pts, reb, ast, tov, fgm, ftm, fta, oreb, dreb, stl, blk.

We choose to express some of these statistics "per game played": each selected variable is divided by gp, so that, for example, pts becomes pts_by_game.

In [None]:
def by_game(df, variable):
    """Add a per-game version of a column to ``df`` in place.

    A new column named ``"<variable>_by_game"`` is created, holding the
    values of ``variable`` divided element-wise by the ``gp`` column.

    :param df: DataFrame containing ``variable`` and a ``gp`` column
    :param variable: name of the column to convert
    :return: None (``df`` is modified in place)
    """
    per_game_name = f"{variable}_by_game"
    df[per_game_name] = df[variable].div(df["gp"])

# Statistics to convert to per-game values. Each raw column is discarded once
# its "<name>_by_game" counterpart has been added.
per_game_vars = [
    "pts", "min", "tov", "blk", "ast", "stl",
    "reb", "fgm", "ftm", "fta", "oreb", "dreb",
]
for var in per_game_vars:
    by_game(df, var)
df = df.drop(columns=per_game_vars)

df.head()

Now, we remove the non-useful variables:

In [None]:
# Keep only the engineered per-game features and the target column:
# every column whose name contains neither "_by_game" nor "target_5yrs" is dropped.
unused_columns = [
    name
    for name in df.columns
    if "_by_game" not in name and "target_5yrs" not in name
]
df = df.drop(columns=unused_columns)

df.columns

In [None]:
# Shuffle the rows. The permutation is built over the positions
# 0..len(df)-1 because .iloc is position-based. The original referenced an
# undefined `players` frame here, which raised NameError.
shuffled_rows = np.random.permutation(len(df))
df = df.iloc[shuffled_rows]
df.head()
# Number of missing values per column (displayed as the cell output).
df.isnull().sum()

In [None]:
# Correlation of every remaining column with points per game.
df.corr()["pts_by_game"]

In [None]:
# Pairwise plots of the columns of df, coloured by the target value.
sns.pairplot(df, hue="target_5yrs")

## Implementation of models and tests

Here we use the test function provided with the assignment, with one modification: it has been split into two functions. The first evaluates the predictions on the test folds, and the second evaluates them on the training folds. The second is used to determine whether a model is overfitting.

In [None]:
# Names of the feature columns (every column except the target).
df.drop(['target_5yrs'],axis=1).columns.values

In [None]:
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.svm import SVC

def score_classifier_test(dataset, classifier, labels, n_splits=3):
    """
    Cross-validate ``classifier`` and score it on the held-out folds.

    The rows are split with a shuffled ``KFold`` using a fixed seed, so the
    folds are identical from one call to the next. For every fold the
    classifier is refitted on the training part and evaluated on the test part.
    Nothing is printed; the scores are returned.

    :param dataset: 2-D feature array, indexable by an array of row positions
    :param classifier: estimator exposing ``fit`` and ``predict``; it is
        refitted in place on every fold
    :param labels: 1-D array of binary labels aligned with the rows of ``dataset``
    :param n_splits: number of folds (defaults to 3, the previously fixed value)
    :return: tuple ``(confusion_mat, recall)`` where ``confusion_mat`` is the
        2x2 confusion matrix summed over the folds and ``recall`` is the
        recall averaged over the folds
    """
    kf = KFold(n_splits=n_splits, random_state=50, shuffle=True)
    confusion_mat = np.zeros((2, 2))
    recall = 0
    for training_ids, test_ids in kf.split(dataset):
        classifier.fit(dataset[training_ids], labels[training_ids])
        test_labels = labels[test_ids]
        predicted_labels = classifier.predict(dataset[test_ids])
        confusion_mat += confusion_matrix(test_labels, predicted_labels)
        recall += recall_score(test_labels, predicted_labels)
    # Average over the folds actually used rather than a hard-coded 3.
    recall /= n_splits
    return (confusion_mat, recall)

def score_classifier_train(dataset, classifier, labels, n_splits=3):
    """
    Cross-validate ``classifier`` and score it on its own training folds.

    The rows are split with a shuffled ``KFold`` using a fixed seed. For every
    fold the classifier is refitted on the training part and then evaluated on
    that same training part; the held-out part is not used. Comparing these
    scores with the held-out scores shows whether a model overfits.
    Nothing is printed; the scores are returned.

    :param dataset: 2-D feature array, indexable by an array of row positions
    :param classifier: estimator exposing ``fit`` and ``predict``; it is
        refitted in place on every fold
    :param labels: 1-D array of binary labels aligned with the rows of ``dataset``
    :param n_splits: number of folds (defaults to 3, the previously fixed value)
    :return: tuple ``(confusion_mat, recall)`` where ``confusion_mat`` is the
        2x2 confusion matrix of the training predictions summed over the folds
        and ``recall`` is the training recall averaged over the folds
    """
    kf = KFold(n_splits=n_splits, random_state=50, shuffle=True)
    confusion_mat = np.zeros((2, 2))
    recall = 0
    # Only the training indices are needed; the test indices are ignored.
    for training_ids, _ in kf.split(dataset):
        training_set = dataset[training_ids]
        training_labels = labels[training_ids]
        classifier.fit(training_set, training_labels)
        predicted_labels = classifier.predict(training_set)
        confusion_mat += confusion_matrix(training_labels, predicted_labels)
        recall += recall_score(training_labels, predicted_labels)
    # Average over the folds actually used rather than a hard-coded 3.
    recall /= n_splits
    return (confusion_mat, recall)

**Set up of models and test function**

Similarly, we have modified this part by removing the code that dropped the 'name' column, since that column has already been removed above.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Candidate classifiers, all created with their default hyper-parameters.
# NOTE(review): no random_state is set, so results of the randomised models
# may differ from run to run.
lr = LogisticRegression()
mlp = MLPClassifier()
rf = RandomForestClassifier()
svc = SVC()

def test_model(model, type):
    """
    Score ``model`` on the global ``df`` using the k-fold scoring helpers.

    The target column ``target_5yrs`` is used as labels and every other
    column as features. NaN feature values are replaced by 0 and the features
    are min-max scaled before being passed to the scoring helper.

    :param model: estimator exposing ``fit`` and ``predict``
    :param type: ``'test'`` to score on the held-out folds; any other value
        scores on the training folds (used to check for overfitting).
        The name shadows the builtin ``type`` but is kept for existing callers.
    :return: the ``(confusion_mat, recall)`` tuple of the selected helper
    """
    labels = df['target_5yrs'].values
    # Work on an explicit copy so the in-place NaN replacement below can never
    # write into (or be blocked by) the DataFrame's own buffer.
    df_vals = df.drop(['target_5yrs'], axis=1).to_numpy(copy=True)

    # Replace every NaN cell with 0. The boolean mask touches only the NaN
    # cells. The previous loop indexed df_vals with each [row, col] array from
    # np.argwhere, which selects whole rows `row` and `col` and zeroed them.
    df_vals[np.isnan(df_vals)] = 0.0

    # NOTE(review): the scaler is fitted on all rows before the folds are
    # split, so the held-out rows influence the scaling range.
    X = MinMaxScaler().fit_transform(df_vals)

    if type == 'test':
        # Predictions on the held-out folds.
        return score_classifier_test(X, model, labels)
    # Predictions on the training folds, to judge overfitting.
    return score_classifier_train(X, model, labels)



**Model test**

In [None]:
# test_model(...)[1] is the second element of the (confusion matrix, recall)
# tuple, i.e. the fold-averaged recall, so the output is labelled as recall
# rather than accuracy. The accuracy_* variable names are kept unchanged for
# any other cell that reads them.
accuracy_lr = test_model(lr,'test')[1]
accuracy_mlp = test_model(mlp,'test')[1]
accuracy_rf = test_model(rf,'test')[1]
accuracy_svc = test_model(svc,'test')[1]

print('Test Recall of Logistic Regression: %.3f' % accuracy_lr)
print('Test Recall of MLP: %.3f' % accuracy_mlp)
print('Test Recall of Random Forest: %.3f' % accuracy_rf)
print('Test Recall of SVC: %.3f' % accuracy_svc)

**Overfitting tests**

In [None]:
# test_model(...)[1] is the second element of the (confusion matrix, recall)
# tuple, i.e. the fold-averaged recall on the training folds, so the output is
# labelled as recall rather than accuracy. The accuracy_* variable names are
# kept unchanged for any other cell that reads them.
accuracy_lr = test_model(lr,'train')[1]
accuracy_mlp = test_model(mlp,'train')[1]
accuracy_rf = test_model(rf,'train')[1]
accuracy_svc = test_model(svc,'train')[1]

print('Train Recall of Logistic Regression: %.3f' % accuracy_lr)
print('Train Recall of MLP: %.3f' % accuracy_mlp)
print('Train Recall of Random Forest: %.3f' % accuracy_rf)
print('Train Recall of SVC: %.3f' % accuracy_svc)

The only model that seems to overfit is the Random Forest. It seems obvious here that the model to choose is therefore the logistic regression model.

**Save the model**

In [None]:
import pickle

# At this point `lr` was last fitted inside the cross-validation helper, i.e.
# on a single training fold of min-max scaled features. Refit it on every row
# so the saved model uses all the data, and persist the fitted scaler as well:
# inputs must be scaled the same way at prediction time.
features = df.drop(['target_5yrs'], axis=1).to_numpy(copy=True)
features[np.isnan(features)] = 0.0  # same NaN handling as during evaluation
scaler = MinMaxScaler().fit(features)
lr.fit(scaler.transform(features), df['target_5yrs'].values)

Pkl_Filename = "lr.pkl"
Scaler_Filename = "scaler.pkl"

# lr.pkl still contains the LogisticRegression object itself.
with open(Pkl_Filename, 'wb') as file:
    pickle.dump(lr, file)

# Companion file: the scaler to apply to raw features before calling lr.predict.
with open(Scaler_Filename, 'wb') as file:
    pickle.dump(scaler, file)