In [16]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn import manifold, datasets
from sklearn.model_selection import train_test_split
from time import time
from grakel.kernels import NeighborhoodSubgraphPairwiseDistance
from grakel.kernels import WeisfeilerLehman
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from scipy import sparse, io
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import mat4py
import glob
import pandas as pd
from grakel import Graph

In [17]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

In [18]:
path = 'PPI'
files = glob.glob(path + '/*.csv')

lists = []

for filename in files:
    df = pd.read_csv(filename, index_col=None, header=None)
    lists.append(df)
    
frame = pd.concat(lists, axis=0, ignore_index=True)
data_ = clean_dataset(frame)

labels = pd.read_csv('ppi_labels.csv').T
# labels = labels.values.flatten()

data_, labels = data_.align(labels, axis=1, fill_value=0)

labels = labels.T
print(labels.shape)
print(data_.shape)


(232, 1)
(232, 232)


In [19]:
print("Splitting dataset into train/test (1000/100 instances)")
graphs_train, graphs_test = data_[:150], data_[150:232]
y_train, y_test = labels[:150], labels[150:232]

# # split a dataset into train and test sets
# from sklearn.datasets import make_blobs
# from sklearn.model_selection import train_test_split
# # create dataset
# X, y = make_blobs(n_samples=1000)
# split into train test sets
# graphs_train, graphs_test, y_train, y_test = train_test_split(lists, labels, test_size=0.33)
print(graphs_train, graphs_test, y_train.shape, y_test.shape)

Splitting dataset into train/test (1000/100 instances)
      0    1    2    3    4    5    6    7    8    9    ...  222  223  224  \
2883  0.0  1.0  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  ...  1.0  0.0  0.0   
2884  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
2885  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
2886  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
2887  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
...   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
3028  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
3029  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
3030  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  1.0   
3031  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
3032  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   

      22

In [20]:
# Initialize a Weisfeiler-Lehman subtree kernel
gk = WeisfeilerLehman(n_iter=1, normalize=False)

# Construct kernel matrices
K_train = gk.fit_transform(graphs_train)
K_test = gk.transform(graphs_test)

# Train an SVM classifier and make predictions
clf = SVC(kernel='precomputed')
clf.fit(K_train, y_train) 
y_pred = clf.predict(K_test)

# Evaluate the predictions
print("Accuracy:", accuracy_score(y_pred, y_test))

TypeError: each element of X must be either a graph object or a list with at least a graph like object and node labels dict 


In [15]:
# Initialize neighborhood subgraph pairwise distance kernel
gk = NeighborhoodSubgraphPairwiseDistance(r=3, d=2)

print("Computing kernel matrics\n")
t0 = time()
K_train = gk.fit_transform(graphs_train)
K_test = gk.transform(graphs_test)
print("done in %0.3fs\n" % (time() - t0))

print("Classifying digits\n")
# Initialize SVM
clf = SVC(kernel='precomputed')

# Fit on the train Kernel
clf.fit(K_train, y_train)

# Predict and test.
y_pred = clf.predict(K_test)

# Calculate accuracy of classification.
print("Classification accuracy: %0.2f" % accuracy_score(y_test, y_pred))

Computing kernel matrics



TypeError: each element of X must have either a graph with labels for node and edge or 3 elements consisting of a graph type object, labels for vertices and labels for edges.

In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder

train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
sample_submission = pd.read_csv("../input/sampleSubmission.csv")
training_labels = LabelEncoder().fit_transform(train['target'])

# SVMs tend to like features that look similar to ~ N(0,1), so let's stabilise
# the long tails
train_features = train.drop('target', axis=1)
train_features[train_features > 4] = 4

model = LinearSVC().fit(train_features, training_labels)

scores = model.decision_function(test)
predictions = 1.0 / (1.0 + np.exp(-scores))
row_sums = predictions.sum(axis=1)
predictions_normalised = predictions / row_sums[:, np.newaxis]

# create submission file
prediction_DF = pd.DataFrame(predictions_normalised, index=sample_submission.id.values, columns=sample_submission.columns[1:])
prediction_DF.to_csv('svc_submission.csv', index_label='id')