## Goals

- better understand the use of unsupervised learning techniques
- use a variety of methods and models
- explore hyperparameter tuning

## The Dataset

Here I will use the [FIFA2019 dataset](https://www.kaggle.com/karangadiya/fifa19) from Kaggle. I will import it from Thinkful's PostgreSQL server.

### Import Libraries and Define Functions

In [5]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.cluster import DBSCAN, MeanShift
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
import umap
from sklearn.preprocessing import PowerTransformer
from sklearn import metrics
from sqlalchemy import create_engine

In [2]:
def print_full(x):
    '''
    Print x (anything with a length that pandas renders, e.g. a Series
    or DataFrame) without truncating its rows.

    The 'display.max_rows' option is raised to len(x) only for the
    duration of the print. The previous setting is restored afterwards,
    even if printing raises, instead of being reset to the pandas default.
    '''
    with pd.option_context('display.max_rows', len(x)):
        print(x)
    
def print_nulls(df):
    '''
    Print the percentage of null values in every column of df.

    The output is a single Series: a two-row text header followed by one
    row per column holding nulls * 100 / number of rows. Returns whatever
    print_full returns.
    '''
    header = pd.Series({'Column': 'Percentage of nulls', '-----': '-----'})
    null_pct = df.isnull().sum() * 100 / len(df)

    # Series.append was removed in pandas 2.0; pd.concat is the replacement.
    out = pd.concat([header, null_pct])

    return print_full(out)

In [3]:
def plot_hist_boxplot(df):
    '''
    Draw a histogram and a boxplot side by side for every column of df,
    one row of plots per column, then show the figure.

    Accepts only numerical columns; filter the frame (e.g. with a mask)
    before calling.
    '''
    n_rows = df.shape[1]
    plt.figure(figsize=(20, n_rows * 5))

    for row, name in enumerate(df.columns):
        values = df[name]
        left_cell = 2 * row + 1

        # left cell of the row: histogram, labelled with the column name
        plt.subplot(n_rows, 2, left_cell)
        plt.hist(values)
        plt.ylabel(name, size='xx-large')

        # right cell of the row: boxplot whose whiskers sit at the 2.5th and
        # 97.5th percentiles, so fliers are the points outside the central 95%
        plt.subplot(n_rows, 2, left_cell + 1)
        plt.boxplot(values, whis=[2.5, 97.5])
    plt.show()

### Load the dataset

In [None]:
import os
from urllib.parse import quote_plus

# Connection settings. Each one can be overridden through an environment
# variable so credentials do not have to live in the notebook; the literals
# are only fallbacks that keep the notebook runnable as before.
postgres_user = os.environ.get('FIFA19_PG_USER', 'dsbc_student')
postgres_pw = os.environ.get('FIFA19_PG_PW', '7*.8G9QH21')
postgres_host = os.environ.get('FIFA19_PG_HOST', '142.93.121.174')
postgres_port = os.environ.get('FIFA19_PG_PORT', '5432')
postgres_db = os.environ.get('FIFA19_PG_DB', 'fifa19')


# User name and password are URL-escaped because characters such as '@', ':'
# or '/' in them would otherwise corrupt the connection URL.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    quote_plus(postgres_user), quote_plus(postgres_pw),
    postgres_host, postgres_port, postgres_db))

query1 = '''
SELECT
    *
FROM
    fifa19
'''


# Release the connection pool even when the query fails.
try:
    df = pd.read_sql_query(query1, con=engine)
finally:
    engine.dispose()

In [4]:
df.head()

NameError: name 'df' is not defined

In [None]:

# Make column names lowercase for ease and consistency
df.columns = df.columns.str.lower()

In [None]:
df.columns

In [None]:



# Drop some of the columns.
#
# The first two lines of the list are unneeded columns such as links to images.
# The last two lines are position scores.
#
# All keepers are missing values for the position-score columns. It may be
# better to separate those columns out, but I also suspect that they are not
# valuable features and are adding noise to the model.
# To test this I'm dropping them in this iteration of my modeling.

# Make column names lowercase for ease and consistency. This is done BEFORE the
# drop so the lowercase names below match whether or not an earlier cell has
# already lowercased the columns (dropping the capitalised names after that
# cell raised a KeyError).
df.columns = df.columns.str.lower()

df.drop(['id', 'name', 'photo', 'flag',
         'potential', 'club logo', 'real face', 'jersey number', 'loaned from', 'contract valid until',

         'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm',
         'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb'], axis=1, inplace=True)

In [None]:
print_nulls(df)

In [None]:
df['joined']

In [None]:
# Latest joining date. pd.to_datetime parses the date strings; casting with a
# unit-less astype('datetime64') is rejected by pandas 2.x.
pd.to_datetime(df['joined']).max()

In [None]:
df.info()

In [None]:
# Define a function to apply derived functions for this dataset 

# Multipliers for the magnitude suffixes used in the euro amount strings.
_EURO_SUFFIXES = {'K': 1000, 'M': 1000000}

# Per-position score columns; they are only cleaned when still present.
_POSITION_SCORE_COLUMNS = [
    'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram',
    'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb',
    'lcb', 'cb', 'rcb', 'rb',
]


def _parse_euro(amount):
    '''
    Convert a euro string such as '€110.5M', '€565K' or '€0' to a float.

    Values that are not strings (already numeric) are returned unchanged.
    '''
    if not isinstance(amount, str):
        return amount
    text = amount.replace('€', '').strip()
    multiplier = _EURO_SUFFIXES.get(text[-1:].upper(), 1)
    if multiplier != 1:
        text = text[:-1]
    return float(text) * multiplier


def _height_to_inches(height):
    '''Convert a feet'inches string such as "5'7" to a whole number of inches.'''
    parts = str(height).split('\'')
    return int(parts[0]) * 12 + int(parts[1])


def _log_keep_zero(series):
    '''Natural log of a numeric series, mapping 0 to 0 instead of -inf.'''
    return np.log(series.where(series != 0, 1))


def clean(df):
    '''
    Return a cleaned copy of the (lowercase-column) FIFA dataframe.

    Steps, in order:
      * drop every row that contains a null
      * convert 'wage', 'value' and 'release clause' from euro strings
        to numbers
      * convert 'height' to inches and 'weight' to an integer
      * replace 'preferred foot' with a 0/1 'right preferred' column
      * sum the 'NN+M' position scores for any position column present
      * log-transform 'release clause' and 'value'
      * drop 'work rate', 'body type', 'club', 'nationality' and 'joined'

    The input frame is not modified. A missing required column raises
    KeyError; an amount that is not a parsable number raises ValueError.
    '''
    # Drop the small portion of remaining nulls
    df = df.dropna().copy()

    # Values for wage and value need to be converted from strings to numerical.
    # Wage stays an integer column; parsing through float first means decimal
    # amounts such as '€1.5K' no longer crash the conversion.
    df['wage'] = df['wage'].map(_parse_euro).astype(float).round().astype('int64')
    df['value'] = df['value'].map(_parse_euro).astype(float)

    # The release clause value presents a difficulty for filling. Presumably
    # these are players that do not have a release clause. This includes
    # players at both the high and low end of the pay scale. Filling these
    # nulls with 0 makes no sense and infinity is not supported by algorithms
    # despite being logically closer. Perhaps turning this into a categorical
    # variable would help? Luckily this is not a regression problem where the
    # value would be more important. For now these values are filled with 0
    # because all that is needed right now is clustering and this gives these
    # observations a valid common value.
    #
    # NOTE(review): dropna() above has already removed every row with a null
    # release clause, so this fillna currently never fills anything. Confirm
    # whether the fill was meant to happen before the dropna.
    df['release clause'] = (df['release clause'].fillna(0)
                            .map(_parse_euro).astype(float))

    # Convert height to int (inches)
    df['height'] = df['height'].map(_height_to_inches)

    # Convert weight to int by removing the 'lbs' unit
    df['weight'] = df['weight'].map(lambda x: int(str(x).replace('lbs', '')))

    # Encode preferred foot as numerical: 1 for 'Right', 0 for anything else
    df['right preferred'] = (df['preferred foot'].astype(str) == 'Right').astype('int64')

    # For the fielders their position scores ('NN+M') need to be cleaned and
    # typed properly. These position scores lead to some unstable solutions,
    # so in this pass they are normally dropped before clean() is called;
    # whichever of them are still present are summed into a single integer.
    for col in _POSITION_SCORE_COLUMNS:
        if col in df.columns:
            df[col] = df[col].map(
                lambda x: sum(int(part) for part in str(x).split('+')))

    # Release clause and value are heavily skewed; both benefit greatly from a
    # log transformation and become roughly normal. Zeros are mapped to 0.
    #
    # Wage does not normalize as easily. It is strongly correlated with value
    # (.85), so it can be dropped by the caller with most of that information
    # retained.
    df['release clause'] = _log_keep_zero(df['release clause'])
    df['value'] = _log_keep_zero(df['value'])

    # Drop the columns that are not used as features:
    #   * 'preferred foot' is replaced by 'right preferred'
    #   * work rate is highly subjective and may cause unstable solutions
    #   * club and nationality in particular add too many dimensions
    # 'position' is deliberately kept for the caller.
    df = df.drop(['preferred foot', 'work rate', 'body type', 'club',
                  'nationality', 'joined'], axis=1)

    return df

In [None]:
df['body type'].nunique()

In [None]:
df = clean(df)

In [None]:
# Split the players into goalkeepers and everyone else; each side is an
# independent copy so later edits do not touch df.
is_keeper = df['position'] == 'GK'

keepers = df[is_keeper].copy()

fielders = df[~is_keeper].copy()

In [None]:
plot_hist_boxplot(keepers.select_dtypes(include='number'))

In [None]:
keepers.drop('skill moves', axis=1, inplace=True)

In [None]:
# NOTE(review): clean() already log-transforms 'value', and keepers is taken
# from the cleaned df, so this applies the log a second time - confirm that a
# double log is intended.
keepers['value'] = keepers['value'].apply(lambda x: np.log(x) if x !=0 else np.log(x+1))

In [None]:
# NOTE(review): clean() already log-transforms 'release clause', and keepers is
# taken from the cleaned df, so this applies the log a second time - confirm
# that a double log is intended.
keepers['release clause'] = keepers['release clause'].apply(lambda x: np.log(x) if x !=0 else np.log(x+1))

In [None]:
keepers.drop('wage', axis=1, inplace=True)

In [None]:
plt.hist(np.log(keepers['international reputation']))

In [None]:
from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer()

In [None]:
X = pt.fit_transform(keepers.drop('position', axis=1))

In [None]:
X.shape

In [None]:
# One histogram per transformed feature (i.e. per column of X)
for feature_values in X.T:
    plt.hist(feature_values)
    plt.show()

In [None]:
# Define a function to print a 2D PCA

def pca_2d(df):
    '''
    Project df onto its first two principal components, show the result
    as a bare scatter plot (no ticks, no axes) and return the projected
    points as an array with two columns.
    '''
    projection = PCA(n_components=2).fit_transform(df)
    first_pc, second_pc = projection[:, 0], projection[:, 1]

    # 2D representation on a square canvas with all axis decoration removed
    plt.figure(figsize=(12, 12))
    plt.scatter(first_pc, second_pc)
    plt.xticks([])
    plt.yticks([])
    plt.axis('off')
    plt.show()

    return projection

In [None]:
pca_2d(X)

In [None]:
df

In [None]:
# NOTE(review): clean() already log-transforms 'release clause' and df was
# replaced by clean(df) above, so this applies the log a second time - confirm
# that a double log is intended.
df['release clause'] = df['release clause'].apply(lambda x: np.log(x) if x !=0 else np.log(x+1))

In [None]:
df = pd.concat([df, pd.get_dummies(df['position'], prefix='position')], axis=1)

In [None]:
df.info()

In [None]:
df.drop(['wage', 'position'], axis=1, inplace=True)

In [None]:
X = pt.fit_transform(df)

In [None]:
# One histogram per transformed feature (i.e. per column of X)
for feature_values in X.T:
    plt.hist(feature_values)
    plt.show()

In [None]:

scipy.stats.describe(df)

In [None]:
df.describe()

In [None]:
print_nulls(df)

In [None]:
pca_2d(X)

In [None]:
# Define a function to print a UMAP

def umap_2d(df, n_neighbors=200, min_dist=.5, metric='cosine'):
    '''
    Embed df in two dimensions with UMAP, show the result as a bare
    scatter plot (no ticks, no axes) and return the embedded points.

    n_neighbors, min_dist and metric are passed straight to umap.UMAP.
    Their defaults are the values that used to be hard-coded, so
    umap_2d(df) behaves as before while other settings can now be
    explored without editing the function.
    '''
    umap_components = umap.UMAP(n_neighbors=n_neighbors,
                                min_dist=min_dist,
                                metric=metric).fit_transform(df)

    # plot the 2D representation
    plt.figure(figsize=(12, 12))
    plt.scatter(umap_components[:, 0], umap_components[:, 1])
    plt.xticks([])
    plt.yticks([])
    plt.axis('off')
    plt.show()

    return umap_components

In [None]:
umap_2d(X)

In [None]:
# Elbow method: fit KMeans for every candidate cluster count and record the
# resulting inertia, then plot inertia against the number of clusters.
seed = 0
candidate_ks = range(2, 50)
elbow = dict.fromkeys(candidate_ks)
for k in candidate_ks:
    estimator = KMeans(random_state=seed, n_clusters=k)
    res = estimator.fit_predict(X)
    inertia = elbow[k] = estimator.inertia_

elbow_df = pd.Series(elbow)
ax = elbow_df.plot(title='Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Inertia')

In [None]:
fielders['position'].nunique()

In [None]:
# Elbow method on a narrow range: fit KMeans for every candidate cluster count
# and record the resulting inertia, then plot inertia against cluster count.
seed = 0
candidate_ks = range(2, 5)
elbow = dict.fromkeys(candidate_ks)
for k in candidate_ks:
    estimator = KMeans(random_state=seed, n_clusters=k)
    res = estimator.fit_predict(X)
    inertia = elbow[k] = estimator.inertia_

elbow_df = pd.Series(elbow)
ax = elbow_df.plot(title='Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Inertia')

In [None]:
estimator = KMeans(n_clusters = 3,random_state=seed)
res = estimator.fit_predict(X)

In [None]:
# NOTE(review): pca_components is first assigned in the cell below this one, so
# on a fresh top-to-bottom run this line raises NameError - run the PCA cell
# first or move this check after it.
pca_components.shape

In [None]:
# Project the same transformed feature matrix that KMeans was fitted on, so
# the 2D picture reflects the space the clusters were found in. (Fitting on the
# raw df instead would let the unscaled columns dominate the components.)
pca = PCA(n_components=2)
pca_components = pca.fit_transform(X)

In [None]:
colors = ['red', 'blue', 'black']

# Colour every point by its cluster label and draw them all with a single
# scatter call; issuing one call per row creates one artist per player and
# is extremely slow.
point_colors = [colors[int(label)] for label in res]
plt.scatter(pca_components[:, 0], pca_components[:, 1], c=point_colors)

In [None]:
df.shape

In [None]:
res.shape

In [None]:
df['cluster'] = res

In [None]:
df[df['cluster'] == 0].describe()

In [None]:
res

In [None]:
# Cluster 0 is already summarised two cells above and cluster 2 in the cell
# below; summarise the remaining cluster 1 here instead of repeating cluster 0.
df[df['cluster'] == 1].describe()

In [None]:
df[df['cluster'] == 2].describe()

In [None]:
# Elbow method on a narrow range: fit KMeans for every candidate cluster count
# and record the resulting inertia, then plot inertia against cluster count.
seed = 0
candidate_ks = range(2, 5)
elbow = dict.fromkeys(candidate_ks)
for k in candidate_ks:
    estimator = KMeans(random_state=seed, n_clusters=k)
    res = estimator.fit_predict(X)
    inertia = elbow[k] = estimator.inertia_

elbow_df = pd.Series(elbow)
ax = elbow_df.plot(title='Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Inertia')

In [None]:
estimator = KMeans(n_clusters = 3,random_state=seed)
res = estimator.fit_predict(X)

In [None]:
pca_components.shape

In [None]:
# Project the same transformed feature matrix that KMeans was fitted on. By
# this point df also carries the 'cluster' label column assigned earlier, so
# fitting PCA on df would feed the labels back into the picture.
pca = PCA(n_components=2)
pca_components = pca.fit_transform(X)

In [None]:
colors = ['red', 'blue', 'black']

# Colour every point by its cluster label and draw them all with a single
# scatter call; issuing one call per row creates one artist per player and
# is extremely slow.
point_colors = [colors[int(label)] for label in res]
plt.scatter(pca_components[:, 0], pca_components[:, 1], c=point_colors)

In [None]:
df.shape

In [None]:
res.shape

In [None]:
df['cluster'] = res

In [None]:
df[df['cluster'] == 0].describe()

In [None]:
res

In [None]:
# Cluster 0 is already summarised two cells above and cluster 2 in the cell
# below; summarise the remaining cluster 1 here instead of repeating cluster 0.
df[df['cluster'] == 1].describe()

In [None]:
df[df['cluster'] == 2].describe()

In [None]:
fielders