In [90]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import ecopy as ep

In [2]:
input_data = pd.read_csv('/home/llevin/Desktop/capstone2/cleaned_data/input_data_vF2.csv')

In [48]:
def dist_gower(datamtx, strict=True):
    """returns a row-row gower dist matrix
    
    see for example, Faith et al., 1987
    
    
    * note that the comparison between any two rows is dependent on the entire
    data matrix, d_ij is a fn of all of datamtx, not just i,j
    * comparisons are between rows (samples)
    * any column containing identical data for all rows is ignored (this
    prevents a 0/0 error in the formula for gower distance
    * input: 2D numpy array.  Limited support for non-2D arrays if 
    strict==False
    * output: numpy 2D array float ('d') type.  shape (inputrows, inputrows)
    for sane input data
    * two rows of all zeros returns 0 distance between them
    * if strict==True, raises ValueError if any of the input data is
    not finite, or if the input data is not a rank 2 array (a matrix).
    * if strict==False, assumes input data is a 2d matrix.  
    If rank of input data is < 2, returns an empty 2d array (shape:
    (0, 0) ).  If 0 rows or 0 colunms, also returns an empty 2d array.
    """
    if strict:
        if not np.isfinite(datamtx).any():
            raise ValueError("non finite number in input matrix")
        if np.ndim(datamtx) != 2:
            raise ValueError("input matrix not 2D")
        numrows, numcols = np.shape(datamtx)
    else:
        try:
            numrows, numcols = np.shape(datamtx)
        except ValueError:
            return np.zeros((0,0),'d')
    if numrows == 0 or numcols == 0:
        return np.zeros((0,0),'d')
    dists = np.zeros((numrows,numrows),'d')
    coldiffs = datamtx.max(axis=0) - datamtx.min(axis=0)
    for i in range(numcols):
        if coldiffs[i] == 0.0:
            coldiffs[i] = 1.0 # numerator will be zero anyway
    for i in range(numrows):
        r1 = datamtx[i]
        for j in range(i):
            r2 = datamtx[j]
            rowdiff = r2 - r1
            dist = sum(abs(r1 - r2) / coldiffs)
            dists[i,j] = dists[j,i] = dist

    return dists

In [108]:
num_cols = input_data.select_dtypes(include=['int64','float64']).columns
num_cols = [col for col in num_cols if col not in ['home_team_api_id','away_team_api_id','result']]
X = input_data[num_cols].values
X.shape

(902, 97)

In [99]:
scale = StandardScaler()

norm_data = scale.fit_transform(X)

In [138]:
X_gower = dist_gower(norm_data)

In [139]:
norm_data

array([[-0.63162026,  2.27752825, -1.53313494, ..., -0.11111111,
        -0.17898775, -0.1255618 ],
       [ 0.09832397, -0.24656381, -0.27195143, ..., -0.11111111,
        -0.17898775, -0.1255618 ],
       [-0.26664815,  1.7166189 , -0.27195143, ..., -0.11111111,
        -0.17898775, -0.1255618 ],
       ..., 
       [ 0.09832397, -1.64883717,  0.56883757, ..., -0.11111111,
        -0.17898775, -0.1255618 ],
       [ 0.82826821, -1.92929185,  0.77903482, ..., -0.11111111,
        -0.17898775, -0.1255618 ],
       [-1.3615645 , -0.52701848, -1.32293769, ..., -0.11111111,
        -0.17898775, -0.1255618 ]])

In [140]:
X_gower

array([[  0.        ,  13.86936993,  14.28157725, ...,  15.97415305,
         21.48005184,  15.20866514],
       [ 13.86936993,   0.        ,  12.97425243, ...,  12.99448479,
         14.74668601,  15.27299494],
       [ 14.28157725,  12.97425243,   0.        , ...,  13.89881453,
         16.38711021,  18.7381474 ],
       ..., 
       [ 15.97415305,  12.99448479,  13.89881453, ...,   0.        ,
         12.39525836,  13.86546378],
       [ 21.48005184,  14.74668601,  16.38711021, ...,  12.39525836,
          0.        ,  19.4987074 ],
       [ 15.20866514,  15.27299494,  18.7381474 , ...,  13.86546378,
         19.4987074 ,   0.        ]])

In [141]:
X_gower.shape

(902, 902)

In [142]:
X_gower.mean(), X_gower.max(), X_gower.min()

(14.742523884733441, 28.382698153217625, 0.0)

In [224]:
db = DBSCAN(eps=11,min_samples=2,metric='precomputed').fit(X_gower)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print('Estimated number of clusters: %d' % n_clusters_)
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(norm_data, labels))

Estimated number of clusters: 2
Silhouette Coefficient: 0.170


In [193]:
# import matplotlib.pyplot as plt

# # Black removed and is used for noise instead.
# unique_labels = set(labels)
# colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
# for k, col in zip(unique_labels, colors):
#     if k == -1:
#         # Black used for noise.
#         col = 'k'

#     class_member_mask = (labels == k)

#     xy = X[class_member_mask & core_samples_mask]
#     plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
#              markeredgecolor='k', markersize=14)

#     xy = X[class_member_mask & ~core_samples_mask]
#     plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
#              markeredgecolor='k', markersize=6)

# plt.title('Estimated number of clusters: %d' % n_clusters_)
# plt.show()

In [204]:
def dbscan_gower(X,eps=range(1,20),min_samples=range(1,10)):
    for i in eps:
        for j in min_samples:
            try:
                db = DBSCAN(eps=i,min_samples=j,metric='precomputed').fit(X)
                labels = db.labels_
                n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
                print 'Eps: %d' % i
                print 'Min_Samples: %d' % j
                print('Estimated number of clusters: %d' % n_clusters_)
                print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(norm_data, labels))
            except:
                pass

In [205]:
dbscan_gower(X_gower)

Eps: 1
Min_Samples: 1
Estimated number of clusters: 902
Eps: 1
Min_Samples: 2
Estimated number of clusters: 0
Eps: 1
Min_Samples: 3
Estimated number of clusters: 0
Eps: 1
Min_Samples: 4
Estimated number of clusters: 0
Eps: 1
Min_Samples: 5
Estimated number of clusters: 0
Eps: 1
Min_Samples: 6
Estimated number of clusters: 0
Eps: 1
Min_Samples: 7
Estimated number of clusters: 0
Eps: 1
Min_Samples: 8
Estimated number of clusters: 0
Eps: 1
Min_Samples: 9
Estimated number of clusters: 0
Eps: 2
Min_Samples: 1
Estimated number of clusters: 902
Eps: 2
Min_Samples: 2
Estimated number of clusters: 0
Eps: 2
Min_Samples: 3
Estimated number of clusters: 0
Eps: 2
Min_Samples: 4
Estimated number of clusters: 0
Eps: 2
Min_Samples: 5
Estimated number of clusters: 0
Eps: 2
Min_Samples: 6
Estimated number of clusters: 0
Eps: 2
Min_Samples: 7
Estimated number of clusters: 0
Eps: 2
Min_Samples: 8
Estimated number of clusters: 0
Eps: 2
Min_Samples: 9
Estimated number of clusters: 0
Eps: 3
Min_Samples: 1
Es

In [229]:
def apply_clusters(df,cols,eps,min_samples):
    X = df[cols].values
    scale = StandardScaler()
    norm_data = scale.fit_transform(X)
    X_gower = dist_gower(norm_data)
    db = DBSCAN(eps=eps,min_samples=min_samples,metric='precomputed').fit(X_gower)
    labels = db.labels_
    clusters = []
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print set(labels)
    print('Estimated number of clusters: %d' % n_clusters_)
    for i in labels:
        if i == -1:
            clusters.append('noise')
        else:
            clusters.append(str(i))
    df['cluster'] = clusters
    cluster_group = df.groupby('cluster').mean().reset_index()
    return cluster_group

In [230]:
apply_clusters(input_data,num_cols,8.6,2)

set([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, -1])
Estimated number of clusters: 17


Unnamed: 0,cluster,away_corners,away_fouls,away_shots,away_shots_target,away_yellow_cards,away_red_cards,full_time_away_goals,full_time_home_goals,home_corners,...,at_King Power Stadium,at_KC Stadium,at_Old Trafford,at_Stadium of Light,at_Selhurst Park,at_White Hart Lane,at_Carrow Road,at_Cardiff City Stadium,at_Craven Cottage,at_Madejski Stadium
0,noise,4.730599,10.879157,11.293792,4.914634,1.801552,0.101996,1.169623,1.603104,6.2051,...,0.013304,0.02439,0.053215,0.047672,0.027716,0.052106,0.039911,0.012195,0.031042,0.015521
