In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import ecopy as ep
from sklearn.decomposition import PCA

In [2]:
input_data = pd.read_csv('/home/llevin/Desktop/capstone2/cleaned_data/input_data_vF3.csv')

In [3]:
def dist_gower(datamtx, strict=True):
    """returns a row-row gower dist matrix
    
    see for example, Faith et al., 1987
    
    
    * note that the comparison between any two rows is dependent on the entire
    data matrix, d_ij is a fn of all of datamtx, not just i,j
    * comparisons are between rows (samples)
    * any column containing identical data for all rows is ignored (this
    prevents a 0/0 error in the formula for gower distance
    * input: 2D numpy array.  Limited support for non-2D arrays if 
    strict==False
    * output: numpy 2D array float ('d') type.  shape (inputrows, inputrows)
    for sane input data
    * two rows of all zeros returns 0 distance between them
    * if strict==True, raises ValueError if any of the input data is
    not finite, or if the input data is not a rank 2 array (a matrix).
    * if strict==False, assumes input data is a 2d matrix.  
    If rank of input data is < 2, returns an empty 2d array (shape:
    (0, 0) ).  If 0 rows or 0 colunms, also returns an empty 2d array.
    """
    if strict:
        if not np.isfinite(datamtx).any():
            raise ValueError("non finite number in input matrix")
        if np.ndim(datamtx) != 2:
            raise ValueError("input matrix not 2D")
        numrows, numcols = np.shape(datamtx)
    else:
        try:
            numrows, numcols = np.shape(datamtx)
        except ValueError:
            return np.zeros((0,0),'d')
    if numrows == 0 or numcols == 0:
        return np.zeros((0,0),'d')
    dists = np.zeros((numrows,numrows),'d')
    coldiffs = datamtx.max(axis=0) - datamtx.min(axis=0)
    for i in range(numcols):
        if coldiffs[i] == 0.0:
            coldiffs[i] = 1.0 # numerator will be zero anyway
    for i in range(numrows):
        r1 = datamtx[i]
        for j in range(i):
            r2 = datamtx[j]
            rowdiff = r2 - r1
            dist = sum(abs(r1 - r2) / coldiffs)
            dists[i,j] = dists[j,i] = dist

    return dists

In [4]:
input_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1626 entries, 0 to 1625
Data columns (total 88 columns):
away_corners               1626 non-null int64
away_fouls                 1626 non-null int64
away_shots                 1626 non-null int64
away_shots_target          1626 non-null int64
away_yellow_cards          1626 non-null int64
away_red_cards             1626 non-null int64
away_team                  1626 non-null object
full_time_away_goals       1626 non-null int64
full_time_home_goals       1626 non-null int64
home_corners               1626 non-null int64
home_fouls                 1626 non-null int64
home_red_cards             1626 non-null int64
home_yellow_cards          1626 non-null int64
home_shots                 1626 non-null int64
home_shots_target          1626 non-null int64
half_time_away_goals       1626 non-null int64
half_time_home_goals       1626 non-null int64
half_time_result           1626 non-null int64
home_team                  1626 non-null objec

In [72]:
num_cols = input_data.select_dtypes(include=['int64','float64']).columns
num_cols = [col for col in num_cols if col not in ['result','full_time_home_goals','full_time_away_goals',
                                                   'half_time_home_goals','half_time_away_goals','half_time_results'
                                                   ,'at_Emirates Stadium','at_Boleyn Ground','at_King Power Stadium'
                                                   ,'at_Old Trafford','at_Loftus Road','at_Britannia Stadium'
                                                   ,'at_The Hawthorns','at_Anfield','at_Sports Direct Arena'
                                                   ,'at_Turf Moor','at_Villa Park','at_Stamford Bridge','at_Selhurst Park'
                                                   ,'at_Goodison Park',"at_St Mary's Stadium",'at_Liberty Stadium',
                                                   'at_KC Stadium','at_Stadium of Light','at_White Hart Lane'
                                                   ,'at_Etihad Stadium','at_Carrow Road','at_Molineux Stadium',
                                                   'at_Cardiff City Stadium','at_Craven Cottage','at_Madejski Stadium']]
pca_cols = [col for col in num_cols if col not in ['home_team_api_id','away_team_api_id']]
X = input_data[pca_cols].values
X.shape

(1626, 48)

In [17]:
scale = StandardScaler()

norm_data = scale.fit_transform(X)
norm_data.shape

(1626, 2)

In [None]:
soccer_pca = PCA()
soccer_pca.fit(norm_data)
# soccer_pcs = pca.transform(norm_data)
# soccer_pcs = pd.DataFrame(soccer_pcs,columns=['PC'+str(i) for i in range(1, soccer_pcs.shape[1]+1)])
# soccer_pcs.head()

In [73]:
def dbscan_gower(df,eps=range(1,20),min_samples=range(1,10)):
    scale = StandardScaler()
    clusters_info = []
    for n in range(len(pca_cols)):
        columns = pca_cols[n:n+4]
        X = input_data[columns].values
        X_n = scale.fit_transform(X)
        X_gower = dist_gower(X_n)
        for i in eps:
            for j in min_samples:
                db = DBSCAN(eps=i,min_samples=j,metric='euclidean').fit(X_n)
                labels = db.labels_
                try:
                    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
                    if n_clusters_>1 and i>1 and j>1:
                        clusters_info.append([n,i,j,n_clusters_,metrics.silhouette_score(X_n, labels)])
                except:
                    pass
    clusters_info = sorted(clusters_info,reverse=True,key=lambda x: x[4])
    max_sil = clusters_info[0]
    print 'Column Index: ', max_sil[0]
    print 'Max Distance: ', max_sil[1]
    print 'Min Samples: ', max_sil[2]
    print 'Number of Clusters: ', max_sil[3]
    print 'Silhouette Score: ', max_sil[4]

In [74]:
dbscan_gower(input_data)

KeyboardInterrupt: 

In [76]:
def dbscan_gower(df,eps=range(1,20),min_samples=range(1,10)):
    scale = StandardScaler()
    clusters_info = []
    X = input_data[pca_cols].values
    X_n = scale.fit_transform(X)
    for i in eps:
        for j in min_samples:
            db = DBSCAN(eps=i,min_samples=j,metric='euclidean').fit(X_n)
            labels = db.labels_
            try:
                n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
                if n_clusters_>1 and i>1 and j>1:
                    clusters_info.append([i,j,n_clusters_,metrics.silhouette_score(X_n, labels)])
            except:
                pass
    clusters_info = sorted(clusters_info,reverse=True,key=lambda x: x[3])
    max_sil = clusters_info[0]
    print 'Max Distance: ', max_sil[0]
    print 'Min Samples: ', max_sil[1]
    print 'Number of Clusters: ', max_sil[2]
    print 'Silhouette Score: ', max_sil[3]

dbscan_gower(input_data)

Max Distance:  8
Min Samples:  6
Number of Clusters:  2
Silhouette Score:  0.275857949339


In [47]:
def apply_clusters(df,cols,eps,min_samples):
    X = df[cols].values
    scale = StandardScaler()
    norm_data = scale.fit_transform(X)
    X_gower = dist_gower(norm_data)
    db = DBSCAN(eps=eps,min_samples=min_samples,metric='euclidean').fit(norm_data)
    labels = db.labels_
    clusters = []
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print set(labels)
    print('Estimated number of clusters: %d' % n_clusters_)
    for i in labels:
        if i == -1:
            clusters.append('noise')
        else:
            clusters.append(str(i))
    df['cluster'] = clusters
    print df['cluster'].value_counts()
    cluster_group = df.groupby('cluster').mean().reset_index()
    return cluster_group.T

In [67]:
pca_cols[16:20]

['humidity', 'wind direction', 'wind speed (mph)', 'visibility (mi)']

In [77]:
pd.set_option('display.max_rows', len(input_data))
cluster = apply_clusters(input_data,pca_cols,8,6)

set([0, 1, -1])
Estimated number of clusters: 2
0        1583
noise      35
1           8
Name: cluster, dtype: int64


In [78]:
cluster

Unnamed: 0,0,1,2
cluster,0.0,1.0,noise
away_corners,4.66709,5.0,6.28571
away_fouls,11.0891,11.0,11.8857
away_shots,11.0739,13.125,14.9429
away_shots_target,4.91661,7.5,8.14286
away_yellow_cards,1.81807,2.125,2.31429
away_red_cards,0.0947568,0.0,0.257143
full_time_away_goals,1.09602,0.875,2
full_time_home_goals,1.60139,1.75,1.62857
home_corners,6.34491,5.625,5.94286


In [70]:
cluster[0]=cluster[0].map(lambda x: float(x))
cluster[1]=cluster[1].map(lambda x: float(x))
cluster['01_delta'] = (cluster[0]-cluster[1])/cluster[0]

In [71]:
cluster['01_delta']

cluster                         -inf
away_corners               -0.401267
away_fouls                  0.009779
away_shots                 -0.262620
away_shots_target          -0.724562
away_yellow_cards          -0.132747
away_red_cards             -0.465878
full_time_away_goals       -0.219385
full_time_home_goals        0.109009
home_corners                0.121248
home_fouls                  0.143338
home_red_cards             -1.422556
home_yellow_cards          -0.060799
home_shots                 -0.000371
home_shots_target          -0.180977
half_time_away_goals       -0.458446
half_time_home_goals        0.104115
half_time_result            0.285566
result                      0.238218
Capacity                    0.440793
home_team_api_id           -0.089954
away_team_api_id           -0.017575
temp (F)                    0.996088
dew (F)                     0.995903
humidity                    0.995267
wind direction              0.994123
wind speed (mph)            0.997189
v