In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
#from sklearn import SVC
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import accuracy_score
import time

In [2]:
def timeit(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int((te - ts) * 1000)
        else:
            print('%r  %2.2f ms' % \
                  (method.__name__, (te - ts) * 1000))
        return result
    return timed

In [3]:
class clust():
    def _load_data(self, sklearn_load_ds):
        data = sklearn_load_ds
        X = data.data
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, data.target, test_size=0.3, random_state=42)
        
    def __init__(self, sklearn_load_ds):
        self._load_data(sklearn_load_ds)
    
    @timeit
    def classify(self, model=LogisticRegression(random_state=42)):
        model.fit(self.X_train, self.y_train)
        y_pred = model.predict(self.X_test)
        print('Accuracy: {}'.format(accuracy_score(self.y_test, y_pred)))

    @timeit
    def Kmeans(self, output='add'):
        n_clusters = len(np.unique(self.y_train))
        clf = KMeans(n_clusters = 10, random_state=42)
        clf.fit(self.X_train)
        y_labels_train = clf.labels_
        y_labels_test = clf.predict(self.X_test)
        if output == 'add':
            self.X_train['km_clust'] = y_labels_train
            self.X_test['km_clust'] = y_labels_test
        elif output == 'replace':
            self.X_train = y_labels_train[:, np.newaxis]
            self.X_test = y_labels_test[:, np.newaxis]
        else:
            raise ValueError('output should be either add or replace')
        return self
    def HClust(self, output='add'):
        n_clusters = len(np.unique(self.y_train))
        clf = AgglomerativeClustering(n_clusters=10, affinity='euclidean', linkage='ward')
        clf.fit(self.X_train)
        y_labels_train = clf.labels_
        y_labels_test = clf.fit_predict(self.X_test)
        if output == 'add':
            self.X_train['hc_clust'] = y_labels_train
            self.X_test['hc_clust'] = y_labels_test
        elif output == 'replace':
            self.X_train = y_labels_train[:, np.newaxis]
            self.X_test = y_labels_test[:, np.newaxis]
        else:
            raise ValueError('output should be either add or replace')
        return self

In [4]:
class loaddata:
    def __init__(self, data, target):
        self.data = data
        self.target = target


In [5]:
#reading Data
X_data = pd.read_csv('../data/raw/Data Cleaning.csv')
y_data = pd.read_excel('../data/raw/Training outputs.xlsx')

X_data.drop(columns=['Unnamed: 0'],axis=1,inplace=True)
y_data.drop(columns=['Unnamed: 0'],axis=1,inplace=True)

In [6]:
load_data = loaddata(X_data,y_data)
clust(load_data).Kmeans(output='add').classify()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y = column_or_1d(y, warn=True)


'Kmeans'  2453.68 ms
Accuracy: 0.9025974025974026
'classify'  9012.03 ms




In [7]:
clust(load_data).Kmeans(output='add').classify(SVC())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y = column_or_1d(y, warn=True)


'Kmeans'  1975.90 ms
Accuracy: 0.9329004329004329
'classify'  2802.73 ms


In [8]:
clust(load_data).HClust(output='add').classify(SVC())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y = column_or_1d(y, warn=True)


Accuracy: 0.9329004329004329
'classify'  2567.15 ms
