In [3]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import ecopy as ep

In [4]:
input_data = pd.read_csv('/home/llevin/Desktop/capstone2/cleaned_data/input_data_vF2.csv')

In [6]:
def dist_gower(datamtx, strict=True):
    """returns a row-row gower dist matrix
    
    see for example, Faith et al., 1987
    
    
    * note that the comparison between any two rows is dependent on the entire
    data matrix, d_ij is a fn of all of datamtx, not just i,j
    * comparisons are between rows (samples)
    * any column containing identical data for all rows is ignored (this
    prevents a 0/0 error in the formula for gower distance
    * input: 2D numpy array.  Limited support for non-2D arrays if 
    strict==False
    * output: numpy 2D array float ('d') type.  shape (inputrows, inputrows)
    for sane input data
    * two rows of all zeros returns 0 distance between them
    * if strict==True, raises ValueError if any of the input data is
    not finite, or if the input data is not a rank 2 array (a matrix).
    * if strict==False, assumes input data is a 2d matrix.  
    If rank of input data is < 2, returns an empty 2d array (shape:
    (0, 0) ).  If 0 rows or 0 colunms, also returns an empty 2d array.
    """
    if strict:
        if not np.isfinite(datamtx).any():
            raise ValueError("non finite number in input matrix")
        if np.ndim(datamtx) != 2:
            raise ValueError("input matrix not 2D")
        numrows, numcols = np.shape(datamtx)
    else:
        try:
            numrows, numcols = np.shape(datamtx)
        except ValueError:
            return np.zeros((0,0),'d')
    if numrows == 0 or numcols == 0:
        return np.zeros((0,0),'d')
    dists = np.zeros((numrows,numrows),'d')
    coldiffs = datamtx.max(axis=0) - datamtx.min(axis=0)
    for i in range(numcols):
        if coldiffs[i] == 0.0:
            coldiffs[i] = 1.0 # numerator will be zero anyway
    for i in range(numrows):
        r1 = datamtx[i]
        for j in range(i):
            r2 = datamtx[j]
            rowdiff = r2 - r1
            dist = sum(abs(r1 - r2) / coldiffs)
            dists[i,j] = dists[j,i] = dist

    return dists

In [8]:
for col in input_data.columns:
    print col

away_corners
away_fouls
away_shots
away_shots_target
away_yellow_cards
away_red_cards
away_team
full_time_away_goals
full_time_home_goals
home_corners
home_fouls
home_red_cards
home_yellow_cards
home_shots
home_shots_target
half_time_away_goals
half_time_home_goals
half_time_result
home_team
Referee
result
City
Stadium
Capacity
Country
season
date
home_team_api_id
away_team_api_id
home_possession
home_passing
home_aerials
home_shots_against
home_tackles
home_interceptions
home_dribbles
home_fouls_for
away_possession
away_passing
away_aerials
away_shots_against
away_tackles
away_interceptions
away_dribbles
away_fouls_for
temp (F)
dew (F)
humidity
wind direction
wind speed (mph)
prec (in)
visibility (mi)
home_goal_headers
home_goal_freekicks
home_goal_volleys
home_goal_other
away_goal_headers
away_goal_freekicks
away_goal_volleys
away_goal_other
home_shoton_headers
home_shoton_freekicks
home_shoton_volleys
home_shoton_other
home_shoton_bigchance
away_shoton_headers
away_shoton_freekicks


In [9]:
num_cols = input_data.select_dtypes(include=['int64','float64']).columns
num_cols = [col for col in num_cols if col not in ['home_team_api_id','away_team_api_id','result','prec (in)',
                                                  'full_time_home_goals','full_time_away_goals','at_Emirates Stadium',
                                                   'at_Boleyn Ground','at_Loftus Road','at_Britannia Stadium',
                                                   'at_The Hawthorns','at_Anfield','at_Sports Direct Arena',
                                                   'at_Turf Moor','at_Villa Park','at_Goodison Park',
                                                   "at_St Mary's Stadium",'at_Liberty Stadium','at_Etihad Stadium',
                                                   'at_Stamford Bridge','at_King Power Stadium','at_KC Stadium',
                                                   'at_Old Trafford','at_Stadium of Light','at_Selhurst Park',
                                                   'at_White Hart Lane','at_Carrow Road','at_Cardiff City Stadium',
                                                   'at_Craven Cottage','at_Madejski Stadium']]
X = input_data[num_cols].values
X.shape

(902, 94)

In [10]:
scale = StandardScaler()

norm_data = scale.fit_transform(X)

In [11]:
X_gower = dist_gower(norm_data)

In [14]:
X_gower.shape

(902, 902)

In [15]:
X_gower.mean(), X_gower.max(), X_gower.min()

(14.356295244428486, 27.799364819884293, 0.0)

In [224]:
db = DBSCAN(eps=11,min_samples=2,metric='precomputed').fit(X_gower)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print('Estimated number of clusters: %d' % n_clusters_)
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(norm_data, labels))

Estimated number of clusters: 2
Silhouette Coefficient: 0.170


In [18]:
def dbscan_gower(X,eps=range(1,20),min_samples=range(1,10)):
    X_gower = dist_gower(X)
    for i in eps:
        for j in min_samples:
            db = DBSCAN(eps=i,min_samples=j,metric='precomputed').fit(X_gower)
            labels = db.labels_
            try:
                n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
                print 'Eps: %d' % i
                print 'Min_Samples: %d' % j
                print('Estimated number of clusters: %d' % n_clusters_)
                print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(norm_data, labels))
            except:
                pass

In [19]:
dbscan_gower(norm_data)

Eps: 1
Min_Samples: 1
Estimated number of clusters: 902
Eps: 1
Min_Samples: 2
Estimated number of clusters: 0
Eps: 1
Min_Samples: 3
Estimated number of clusters: 0
Eps: 1
Min_Samples: 4
Estimated number of clusters: 0
Eps: 1
Min_Samples: 5
Estimated number of clusters: 0
Eps: 1
Min_Samples: 6
Estimated number of clusters: 0
Eps: 1
Min_Samples: 7
Estimated number of clusters: 0
Eps: 1
Min_Samples: 8
Estimated number of clusters: 0
Eps: 1
Min_Samples: 9
Estimated number of clusters: 0
Eps: 2
Min_Samples: 1
Estimated number of clusters: 902
Eps: 2
Min_Samples: 2
Estimated number of clusters: 0
Eps: 2
Min_Samples: 3
Estimated number of clusters: 0
Eps: 2
Min_Samples: 4
Estimated number of clusters: 0
Eps: 2
Min_Samples: 5
Estimated number of clusters: 0
Eps: 2
Min_Samples: 6
Estimated number of clusters: 0
Eps: 2
Min_Samples: 7
Estimated number of clusters: 0
Eps: 2
Min_Samples: 8
Estimated number of clusters: 0
Eps: 2
Min_Samples: 9
Estimated number of clusters: 0
Eps: 3
Min_Samples: 1
Es

In [20]:
def apply_clusters(df,cols,eps,min_samples):
    X = df[cols].values
    scale = StandardScaler()
    norm_data = scale.fit_transform(X)
    X_gower = dist_gower(norm_data)
    db = DBSCAN(eps=eps,min_samples=min_samples,metric='precomputed').fit(X_gower)
    labels = db.labels_
    clusters = []
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print set(labels)
    print('Estimated number of clusters: %d' % n_clusters_)
    for i in labels:
        if i == -1:
            clusters.append('noise')
        else:
            clusters.append(str(i))
    df['cluster'] = clusters
    cluster_group = df.groupby('cluster').mean().reset_index()
    return cluster_group.T

In [21]:
apply_clusters(input_data,num_cols,10,2)

set([0, 1, -1])
Estimated number of clusters: 2


Unnamed: 0,0,1,2
cluster,0,1,noise
away_corners,4.72009,9,4.08333
away_fouls,10.8646,12.5,11.4167
away_shots,11.2675,13.5,12.5
away_shots_target,4.87923,9.5,6
away_yellow_cards,1.78668,3,2.5
away_red_cards,0.0981941,0,0.416667
full_time_away_goals,1.16366,1.75,1.41667
full_time_home_goals,1.60609,1,1.58333
home_corners,6.17043,7.25,8.41667


In [235]:
input_data['Stadium'].value_counts()

Stamford Bridge          53
Emirates Stadium         52
Etihad Stadium           51
Goodison Park            51
Sports Direct Arena      50
The Hawthorns            49
Britannia Stadium        49
Old Trafford             48
Villa Park               47
White Hart Lane          47
Anfield                  46
Liberty Stadium          46
Stadium of Light         43
Boleyn Ground            38
St Mary's Stadium        37
Carrow Road              36
Loftus Road              34
Craven Cottage           28
Selhurst Park            25
KC Stadium               22
Madejski Stadium         14
Turf Moor                13
King Power Stadium       12
Cardiff City Stadium     11
Name: Stadium, dtype: int64