In [148]:
import pandas as pd

In [149]:
training = pd.read_csv('training_dataset.csv')
testing_bodytrack = pd.read_csv('testing_bodytrack.csv')
testing_blackscholes = pd.read_csv('testing_blackscholes.csv')

In [150]:
# Make a label column. The label is "cluster active" if w_big > 1, and "cluster idle" otherwise.
training['y'] = training['w_big'].apply(lambda x: 0 if x > 1 else 1)
testing_bodytrack['y'] = testing_bodytrack['w_big'].apply(lambda x: 0 if x > 1 else 1)
testing_blackscholes['y'] = testing_blackscholes['w_big'].apply(lambda x: 0 if x > 1 else 1)

In [151]:
import numpy as np
from scipy import stats
zscore = np.abs(stats.zscore(training.select_dtypes(include=["float", "int"])))
ZSCORE_THREASHOLD = 4

is_inlier = ~ (zscore > ZSCORE_THREASHOLD).any(axis=1)
train_data = training[is_inlier]

In [152]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('pca', PCA(n_components=5)),
    ('scaler', StandardScaler())
])

In [None]:
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
pipeline = Pipeline(steps=[('preprocessor', numerical_transformer),
                            ('SVM', SVC(random_state=42))])

params = {
    'SVM__C': np.logspace(-3, 3, 10),
    'SVM__gamma': np.logspace(-3, 3, 10),
    'SVM__kernel': ['rbf', 'poly', 'sigmoid'],
    'SVM__class_weight': ['balanced', None],
    'SVM__decision_function_shape': ['ovo', 'ovr'],
    'SVM__random_state': [42], 
    'SVM__max_iter': [10000]
}

cv = StratifiedKFold(shuffle=True, random_state=42, n_splits=10)
rfr_random = RandomizedSearchCV(pipeline, param_distributions=params, n_iter = 10000, cv = cv, verbose=1, random_state=42, n_jobs=-1, return_train_score=True)
print("Fitting now")
X = train_data.drop("y", axis=1)
y = train_data["y"]
rfr_random.fit(X, y)
rfr_random.best_score_

In [154]:
rfr_random.best_params_

{'SVM__random_state': 42,
 'SVM__max_iter': 10000,
 'SVM__kernel': 'rbf',
 'SVM__gamma': 0.021544346900318832,
 'SVM__decision_function_shape': 'ovo',
 'SVM__class_weight': None,
 'SVM__C': 215.44346900318823}

In [155]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [156]:
# Predict the labels of the test set: y_pred
y_pred = rfr_random.predict(testing_bodytrack.drop('y', axis=1))

# Compute the confusion matrix: cm
print(confusion_matrix(testing_bodytrack['y'], y_pred))

# Print the accuracy
print(accuracy_score(testing_bodytrack['y'], y_pred))

# Print the classification report
print(classification_report(testing_bodytrack['y'], y_pred))

[[526 382]
 [296 250]]
0.5337001375515819
              precision    recall  f1-score   support

           0       0.64      0.58      0.61       908
           1       0.40      0.46      0.42       546

    accuracy                           0.53      1454
   macro avg       0.52      0.52      0.52      1454
weighted avg       0.55      0.53      0.54      1454



In [157]:
# Predict the labels of the test set: y_pred
y_pred = rfr_random.predict(testing_blackscholes.drop('y', axis=1))

# Compute the confusion matrix: cm
print(confusion_matrix(testing_blackscholes['y'], y_pred))

# Print the accuracy
print(accuracy_score(testing_blackscholes['y'], y_pred))

# Print the classification report
print(classification_report(testing_blackscholes['y'], y_pred))

[[611 487]
 [506  49]]
0.3992740471869328
              precision    recall  f1-score   support

           0       0.55      0.56      0.55      1098
           1       0.09      0.09      0.09       555

    accuracy                           0.40      1653
   macro avg       0.32      0.32      0.32      1653
weighted avg       0.39      0.40      0.40      1653

