# SVC and LinearSVC
In this notebook we evaluate the different feature sets with the SVC and the LinearSVC, each with and without upsampled training data. We optimize the hyperparameters of the SVC on the Doc2Vec Pretrained and the BOW TF feature sets. The LinearSVC is optimized in another notebook.

In [0]:
# Mount Google Drive in the Colab runtime; the feature files loaded later in
# this notebook are read from paths below /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
#package imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import svm, metrics, model_selection
from ast import literal_eval
import time
from sklearn.multiclass import OneVsRestClassifier
import statistics


In [0]:
# Function to split train valid test with the option to rebalance the train set
def train_test_valid_split(df, upsampling=True, print_distribution=False):
    """Split ``df`` into stratified train, validation and test sets.

    The data is split 70% / 15% / 15% (train / validation / test), stratified
    on the ``Reviewer_Score`` column and with a fixed random seed so that the
    split is reproducible.

    Args:
        df: DataFrame with a ``Reviewer_Score`` label column.
        upsampling: If true, rebalance the training set only: class 0 is
            randomly reduced to the size of class 1, classes 1 and 2 are kept
            as they are, and the result is shuffled. Note that, despite the
            parameter name, class 2 is NOT duplicated up to the size of
            class 1 -- rows with a label other than 0, 1 or 2 are dropped.
        print_distribution: If true, show a bar chart with the class
            frequencies of the three resulting sets.

    Returns:
        The tuple ``(train, valid, test)``. Validation and test are never
        rebalanced.
    """
    label_column = "Reviewer_Score"

    # First cut off 30% of the data, then halve that part.
    # In summary, the resulting split is:
    # 70% training, 15% validation, 15% test
    train, holdout = train_test_split(
        df, test_size=0.3, stratify=df[label_column], random_state=42)
    test, valid = train_test_split(
        holdout, test_size=0.5, stratify=holdout[label_column], random_state=42)

    if upsampling:
        is_class_0 = train[label_column] == 0
        is_class_1 = train[label_column] == 1
        is_class_2 = train[label_column] == 2
        # Count per label value instead of relying on the position inside the
        # np.unique() result, which silently shifts when a class is missing.
        n_class_0 = int(is_class_0.sum())
        n_class_1 = int(is_class_1.sum())
        # sample() without replacement cannot return more rows than it gets,
        # so the fraction is capped at 1.0 (class 0 is kept completely then).
        frac_class_0 = min(1.0, n_class_1 / n_class_0) if n_class_0 else 1.0
        train_0 = train[is_class_0].sample(frac=frac_class_0, random_state=42)
        # DataFrame.append() was removed in pandas 2.0; pd.concat() yields the
        # same row order as the former chained append() calls.
        train = pd.concat([train_0, train[is_class_1], train[is_class_2]])
        # Shuffle so the classes are no longer stored in consecutive chunks.
        train = train.sample(frac=1, random_state=42)

    if print_distribution:
        # The bars of the three sets are drawn on top of each other.
        for set_name, subset in (("train", train), ("test", test), ("valid", valid)):
            unique, counts = np.unique(subset[label_column], return_counts=True)
            plt.bar(unique, counts, label=set_name)
        plt.title('Class Frequency')
        plt.xlabel('Class')
        plt.ylabel('Frequency')
        plt.legend()
        plt.show()

    return train, valid, test

# function to load all feature sets from Google Drive
def load_from_source():
    """Load every generated feature set from the mounted Google Drive.

    The files are grouped by feature family. The returned nested list keeps
    the same structure and order as ``feature_filepaths`` below, i.e.
    ``result[family][variant]``:

    * ``result[0]``: fast text without / with stop-word removal
    * ``result[1]``: Doc2Vec pretrained / own-trained
    * ``result[2]``: the ``tf_561-woerter`` / ``tfidf_561-woerter`` BOW files

    Returns:
        A list of lists of DataFrames, one DataFrame per feature file.

    Raises:
        ValueError: If a configured path is neither a ``.csv`` nor a ``.pkl``
            file (previously such a path silently reused the DataFrame of the
            preceding file).
    """
    base_dir = "/content/drive/My Drive/Feature_generated_sets"
    # one inner list of file paths per feature family
    feature_filepaths = [
        ["/fast_text/fast_text_nonswr_features.pkl", "/fast_text/fast_text_swr_features.pkl"],
        ["/doc2vec/Pretrained_withScore.csv", "/doc2vec/Owntrained_withScore.csv"],
        ["/BOW/tf_561-woerter.pkl", "/BOW/tfidf_561-woerter.pkl"],
    ]

    dataframes = []
    for feature_type_filepaths in feature_filepaths:
        feature_type_dataframes = []
        for feature_filepath in feature_type_filepaths:
            full_path = base_dir + feature_filepath
            if feature_filepath.endswith(".csv"):
                df = pd.read_csv(full_path)
            elif feature_filepath.endswith(".pkl"):
                # NOTE: unpickling executes code from the file; only load
                # pickles that were written by this project.
                df = pd.read_pickle(full_path)
            else:
                raise ValueError("Unsupported feature file type: " + feature_filepath)
            # Remove the index column that to_csv()/to_pickle() may have
            # stored. The positional form drop(label, 1) is no longer accepted
            # by pandas 2.0, so the column is named explicitly.
            if 'Unnamed: 0' in df.columns:
                df = df.drop(columns='Unnamed: 0')
            feature_type_dataframes.append(df)
        dataframes.append(feature_type_dataframes)
    return dataframes

# function to split all dataframes, optionally truncating every split and passing the upsampling flag on to the split function
def split_dataframes(dataframes,test_boolean=False,test_size=100000000, upsampling = False):
    """Apply ``train_test_valid_split`` to every DataFrame of a nested list.

    Args:
        dataframes: List of lists of DataFrames (one inner list per feature
            family).
        test_boolean: If true, each of the three splits is cut down to its
            first ``test_size`` rows.
        test_size: Maximum number of rows kept per split when
            ``test_boolean`` is true.
        upsampling: Forwarded to ``train_test_valid_split``.

    Returns:
        A nested list with the same structure as ``dataframes`` in which every
        DataFrame is replaced by a dict with the keys ``"train"``, ``"valid"``
        and ``"test"``.
    """
    def _split_one(frame):
        # train_test_valid_split returns the parts in the order train, valid, test
        parts = train_test_valid_split(frame, upsampling=upsampling)
        return {
            name: (part[0:test_size] if test_boolean else part)
            for name, part in zip(("train", "valid", "test"), parts)
        }

    return [[_split_one(frame) for frame in family] for family in dataframes]

# function to transform already split data to feature and label numpy arrays
def trans_to_numpy_split(split_dataframes):
    """Convert the train/valid/test DataFrames into numpy feature/label arrays.

    The layout of a feature set is recognised by the number of columns of its
    ``"train"`` DataFrame:

    * 2 columns: features are the lists stored in the ``Review`` column,
    * 301 columns: features are all columns up to and including ``'299'``,
    * 562 columns: features are all columns up to and including ``'yet'``.

    The label is always taken from ``Reviewer_Score``. Feature sets with any
    other number of columns are left untouched.

    Args:
        split_dataframes: Nested list of dicts with the keys ``"train"``,
            ``"valid"`` and ``"test"``. The dicts are modified in place.

    Returns:
        The same (mutated) nested list; every converted split is a dict with
        the keys ``"features"`` and ``"label"``.
    """
    # name of the last feature column for the wide layouts
    last_feature_column = {301: '299', 562: 'yet'}

    def _to_arrays(frame, n_columns):
        if n_columns == 2:
            feature_array = np.array(frame["Review"].tolist())
            label_array = np.array(frame["Reviewer_Score"].tolist())
        else:
            feature_array = np.array(frame.loc[:, :last_feature_column[n_columns]].values)
            label_array = np.array(frame["Reviewer_Score"].values)
        return {"features": feature_array, "label": label_array}

    for family in split_dataframes:
        for splits in family:
            # the layout is decided once, before "train" gets replaced
            n_columns = len(splits["train"].columns)
            if n_columns != 2 and n_columns not in last_feature_column:
                continue
            for split_name in ("train", "valid", "test"):
                splits[split_name] = _to_arrays(splits[split_name], n_columns)
    return split_dataframes

# function to transform unsplit data to feature and label numpy arrays
def trans_to_numpy_unsplit(dataframes, test, test_size):
    """Convert every DataFrame of a nested list into feature/label arrays.

    The layout of a DataFrame is recognised by its number of columns:
    2 columns (features are the lists in ``Review``), 301 columns (features
    are the columns up to and including ``'299'``) or 562 columns (features
    are the columns up to and including ``'yet'``). The label always comes
    from ``Reviewer_Score``. A DataFrame with another column count is passed
    through unconverted (but still truncated when ``test`` is true).

    Args:
        dataframes: List of lists of DataFrames; the input is not modified.
        test: If true, only the first ``test_size`` rows of each DataFrame
            are used.
        test_size: Number of rows kept when ``test`` is true.

    Returns:
        A new nested list with the same structure whose converted entries are
        dicts with the keys ``"features"`` and ``"label"``.
    """
    def _convert(frame):
        if test:
            frame = frame[:test_size]
        n_columns = len(frame.columns)
        if n_columns == 2:
            return {
                "features": np.array(frame["Review"].tolist()),
                "label": np.array(frame["Reviewer_Score"].tolist()),
            }
        if n_columns in (301, 562):
            # both wide layouts only differ in the name of the last feature column
            last_feature_column = '299' if n_columns == 301 else 'yet'
            return {
                "features": np.array(frame.loc[:, :last_feature_column].values),
                "label": np.array(frame["Reviewer_Score"].values),
            }
        return frame

    return [[_convert(frame) for frame in family] for family in dataframes]

In [0]:
# Load all dataframes from Google Drive once. The cells below reuse this
# nested list, which is indexed as dataframes[feature family][variant].
dataframes = load_from_source()

## SVC
We cannot use the full feature set for the SVC, only a reduced set with at most 100,000 samples in the training set.

### Test of different Feature sets

In [0]:
# Test the different feature sets with an SVC with default hyperparameters,
# without upsampling, in a 4-fold cross-validation.
# Only the first 50000 rows of every feature set are used.
# Output: the four fold scores (macro F1) per feature set, followed by the
# nested list of the average macro F1 scores for each of the six feature sets
trans_data = trans_to_numpy_unsplit(dataframes, True, 50000)
rbf = svm.SVC()
s1 = []  # average scores, one inner list per feature family
for d1 in trans_data:
  s2 = []  # average scores of the variants of the current feature family
  for d2 in d1:
    scores = model_selection.cross_val_score(rbf, d2["features"], d2["label"], cv=4, n_jobs=-1, scoring="f1_macro")
    print(scores)
    s2.append(statistics.mean(scores))
  s1.append(s2)
print(s1)

[0.24536569 0.24536569 0.24536569 0.24536569]
[0.24536569 0.24536569 0.24536569 0.24536569]
[0.55218979 0.51703084 0.55200055 0.55623351]
[0.24862439 0.24928277 0.24741545 0.24741233]
[0.56658276 0.52766768 0.55206807 0.56140395]
[0.56488821 0.5305744  0.55327771 0.55720504]
[[0.24536568924839908, 0.24536568924839908], [0.5443636747638891, 0.2481837354326099], [0.5519306122663477, 0.5514863392099774]]


In [0]:
# Test the different feature sets with an SVC with default hyperparameters,
# trained on the rebalanced ("upsampling=True") train split and evaluated on
# the test split, which is not rebalanced.
# Each of the train/valid/test splits is cut down to its first 50000 rows.
# Output: one classification report (incl. f1 scores) on the test set for
# each of the six feature sets
dicts = trans_to_numpy_split(split_dataframes(dataframes, True, 50000, upsampling=True))
rbf = svm.SVC()
for d1 in dicts:
  for d2 in d1:
    # the same classifier object is fitted again for every feature set
    rbf.fit(d2["train"]["features"], d2["train"]["label"])
    prediction = rbf.predict(d2["test"]["features"])
    print(metrics.classification_report(d2["test"]["label"], prediction))

              precision    recall  f1-score   support

           0       0.75      0.51      0.61     28423
           1       0.32      0.21      0.25     13182
           2       0.27      0.72      0.39      8395

    accuracy                           0.47     50000
   macro avg       0.45      0.48      0.42     50000
weighted avg       0.56      0.47      0.48     50000

              precision    recall  f1-score   support

           0       0.75      0.52      0.61     28423
           1       0.32      0.22      0.26     13182
           2       0.27      0.69      0.39      8395

    accuracy                           0.47     50000
   macro avg       0.45      0.48      0.42     50000
weighted avg       0.56      0.47      0.48     50000

              precision    recall  f1-score   support

           0       0.84      0.65      0.73     28423
           1       0.40      0.51      0.45     13182
           2       0.52      0.68      0.59      8395

    accuracy        

### Hyperparameter optimization without upsampling for Doc2Vec pretrained and BOW TF

In [0]:
# First GridSearch with many parameters
# Each split is cut down to its first 5000 rows to keep the search feasible.
# Output: best parameters and a classification report on the test set
dicts = trans_to_numpy_split(split_dataframes(dataframes, True, 5000))
# dicts[2][0] is the first feature set of the third family (the
# "/BOW/tf_561-woerter.pkl" file in load_from_source).
# NOTE(review): valid_x / valid_y are assigned but not used in this cell.
train_x, train_y, valid_x, valid_y, test_x, test_y = dicts[2][0]["train"]["features"],dicts[2][0]["train"]["label"], dicts[2][0]["valid"]["features"],dicts[2][0]["valid"]["label"], dicts[2][0]["test"]["features"],dicts[2][0]["test"]["label"]
svc = svm.SVC()
# 4 * 2 * 4 * 3 * 2 = 192 parameter combinations. According to the
# scikit-learn SVC documentation "degree" is only used by the "poly" kernel,
# so the "rbf" combinations that differ only in "degree" are presumably redundant.
param_grid = {'C': [0.01, 0.1, 1, 10], 'kernel': ["rbf", "poly"], "degree": [1,2,3,5], "gamma": [0.1, 1, 10], "decision_function_shape": ["ovr", "ovo"]}
grid_svm = model_selection.GridSearchCV(svc,
                    param_grid= param_grid, 
                    scoring="f1_macro",
                    cv=4,   # 4-fold cross-validation on the train split
                    n_jobs=-1) 
print("Training now")
time1 = time.time()
grid_svm.fit(train_x, train_y)
# elapsed wall-clock time of the whole search in seconds
print(time.time()-time1)
# from https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html
print("Best parameters set found on development set:")
print()
print(grid_svm.best_params_)
print()
print("Detailed classification report:")
print()
y_true, y_pred = test_y, grid_svm.predict(test_x)
print(metrics.classification_report(y_true, y_pred))
print()

In [0]:
# Grid search with reduced parameters (rbf kernel and "ovr" only:
# 4 * 1 * 3 * 1 = 12 parameter combinations)
# Each split is cut down to its first 20000 rows.
# Output: best parameters and a classification report on the test set
dicts = trans_to_numpy_split(split_dataframes(dataframes, True, 20000))
# dicts[2][0] is the first feature set of the third family (the
# "/BOW/tf_561-woerter.pkl" file in load_from_source).
# NOTE(review): valid_x / valid_y are assigned but not used in this cell.
train_x, train_y, valid_x, valid_y, test_x, test_y = dicts[2][0]["train"]["features"],dicts[2][0]["train"]["label"], dicts[2][0]["valid"]["features"],dicts[2][0]["valid"]["label"], dicts[2][0]["test"]["features"],dicts[2][0]["test"]["label"]
svc = svm.SVC()
param_grid = {'C': [0.01, 0.1, 1, 10], 'kernel': ["rbf"], "gamma": [0.1, 1, 10], "decision_function_shape": ["ovr"]}
grid_svm = model_selection.GridSearchCV(svc,
                    param_grid= param_grid, 
                    scoring="f1_macro",
                    cv=4,   # 4-fold cross-validation on the train split
                    n_jobs=-1) 
print("Training now")
time1 = time.time()
grid_svm.fit(train_x, train_y)
# elapsed wall-clock time of the whole search in seconds
print(time.time()-time1)
# from https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html
print("Best parameters set found on development set:")
print()
print(grid_svm.best_params_)
print()
print("Detailed classification report:")
print()
y_true, y_pred = test_y, grid_svm.predict(test_x)
print(metrics.classification_report(y_true, y_pred))
print()

Training now




9335.739322423935
Best parameters set found on development set:

{'C': 10, 'decision_function_shape': 'ovr', 'gamma': 1, 'kernel': 'rbf'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.73      0.82      0.77     11400
           1       0.40      0.33      0.36      5234
           2       0.58      0.48      0.52      3366

    accuracy                           0.63     20000
   macro avg       0.57      0.54      0.55     20000
weighted avg       0.61      0.63      0.62     20000




In [0]:
# run doc2vec pretrained with best params {'C': 0.01, 'decision_function_shape': 'ovr', 'degree': 2, 'gamma': 1, 'kernel': 'poly'} on a larger dataset
# Each split is cut down to at most its first 100000 rows.
# Output: training time and prediction time in seconds, then the
# classification report (as a dict) of the test set classified with the optimized SVC
# NOTE(review): the recorded output of this cell also contains a printed
# prediction array and a test support of 50000, so it was presumably
# produced by a slightly different version of this cell -- confirm.
dicts = trans_to_numpy_split(split_dataframes(dataframes, True, 100000))
# dicts[1][0] is the first feature set of the second family (the
# "/doc2vec/Pretrained_withScore.csv" file in load_from_source).
# NOTE(review): valid_x / valid_y are assigned but not used in this cell.
train_x, train_y, valid_x, valid_y, test_x, test_y = dicts[1][0]["train"]["features"],dicts[1][0]["train"]["label"], dicts[1][0]["valid"]["features"],dicts[1][0]["valid"]["label"], dicts[1][0]["test"]["features"],dicts[1][0]["test"]["label"]
svc = svm.SVC(C=0.01, decision_function_shape = "ovr", degree=2, gamma=1, kernel="poly")
# t0 -> t1: fitting, t1 -> t2: predicting the test set
t0 = time.time()
svc.fit(train_x, train_y)
t1 = time.time()
prediction = svc.predict(test_x)
t2 = time.time()
time_train = t1-t0
time_predict = t2-t1
print(time_train)
print(time_predict)
# results
report = metrics.classification_report(test_y, prediction, output_dict=True)
print(report)

9195.91665816307
838.8839845657349
[0 0 0 ... 1 0 0]
{'0': {'precision': 0.7111163909595795, 'recall': 0.837261346947339, 'f1-score': 0.7690504103165298, 'support': 28598}, '1': {'precision': 0.38333333333333336, 'recall': 0.29460039883417705, 'f1-score': 0.3331598577500217, 'support': 13038}, '2': {'precision': 0.5909018861943256, 'recall': 0.44571975131516023, 'f1-score': 0.5081442104545764, 'support': 8364}, 'accuracy': 0.63026, 'macro avg': {'precision': 0.5617838701624128, 'recall': 0.5258604990322254, 'f1-score': 0.5367848261737093, 'support': 50000}, 'weighted avg': {'precision': 0.6055341984958279, 'recall': 0.63026, 'f1-score': 0.6117432007163796, 'support': 50000}}


In [0]:
# run tf  with best params {'C': 10, 'decision_function_shape': 'ovr', 'gamma': 1, 'kernel': 'rbf'} on a larger dataset
# Each split is cut down to its first 50000 rows.
# Output: training time and prediction time in seconds, the predicted labels,
# then the classification report (as a dict) of the test set classified with the optimized SVC
dicts = trans_to_numpy_split(split_dataframes(dataframes, True, 50000))
# dicts[2][0] is the first feature set of the third family (the
# "/BOW/tf_561-woerter.pkl" file in load_from_source).
# NOTE(review): valid_x / valid_y are assigned but not used in this cell.
train_x, train_y, valid_x, valid_y, test_x, test_y = dicts[2][0]["train"]["features"],dicts[2][0]["train"]["label"], dicts[2][0]["valid"]["features"],dicts[2][0]["valid"]["label"], dicts[2][0]["test"]["features"],dicts[2][0]["test"]["label"]
svc = svm.SVC(C=10, decision_function_shape = "ovr", gamma=1, kernel="rbf")
# t0 -> t1: fitting, t1 -> t2: predicting the test set
t0 = time.time()
svc.fit(train_x, train_y)
t1 = time.time()
prediction = svc.predict(test_x)
t2 = time.time()
time_train = t1-t0
time_predict = t2-t1
print(time_train)
print(time_predict)
# results
print(prediction)
report = metrics.classification_report(test_y, prediction, output_dict=True)
print(report)

16844.78852534294
1804.0371339321136
[0 0 0 ... 2 1 0]
{'0': {'precision': 0.7298809263368441, 'recall': 0.8166305336037485, 'f1-score': 0.7708226751382128, 'support': 28598}, '1': {'precision': 0.39752860411899316, 'recall': 0.3331032366927443, 'f1-score': 0.36247548303634775, 'support': 13038}, '2': {'precision': 0.5840632947160215, 'recall': 0.49426111908177905, 'f1-score': 0.5354228726848854, 'support': 8364}, 'accuracy': 0.63662, 'macro avg': {'precision': 0.5704909417239529, 'recall': 0.5479982964594239, 'f1-score': 0.5562403436198152, 'support': 50000}, 'weighted avg': {'precision': 0.618824361377786, 'recall': 0.63662, 'f1-score': 0.6249643823713379, 'support': 50000}}


## LinearSVC

### Test of different Feature sets
We test all feature sets with the LinearSVC, with and without upsampling.

In [0]:
# Test the different feature sets, not upsampled, with a LinearSVC with
# default hyperparameters in a 4-fold cross-validation.
# Only the first 200000 rows of every feature set are used.
# Output: the four fold scores (macro F1) per feature set, followed by the
# nested list of the average macro F1 scores for each of the six feature sets
trans_data = trans_to_numpy_unsplit(dataframes, True, 200000)
linear_classifier = svm.LinearSVC()
s1 = []  # average scores, one inner list per feature family
for d1 in trans_data:
  s2 = []  # average scores of the variants of the current feature family
  for d2 in d1:
    scores = model_selection.cross_val_score(linear_classifier, d2["features"], d2["label"], cv=4, n_jobs=-1, scoring="f1_macro")
    print(scores)
    s2.append(statistics.mean(scores))
  s1.append(s2)
print(s1)

[0.3125768  0.31807336 0.31512314 0.31833965]
[0.32414877 0.32448252 0.32557382 0.32631602]
[0.52938076 0.52751034 0.53868915 0.53312427]
[0.23897628 0.23897628 0.23898177 0.23898177]
[0.54947739 0.55358653 0.56052272 0.55743804]
[0.54788742 0.55181126 0.55606958 0.5556378 ]
[[0.3160282380401167, 0.32513028271353456], [0.5321761311907596, 0.23897902404196636], [0.5552561696142201, 0.5528515122517301]]


In [0]:
# Test the different feature sets with a LinearSVC with default
# hyperparameters, trained on the rebalanced ("upsampling=True") train split
# and evaluated on the test split, which is not rebalanced.
# The splits are not truncated here (test_boolean keeps its default False).
# Output: one classification report (incl. f1 scores) on the test set for
# each of the six feature sets
dicts = trans_to_numpy_split(split_dataframes(dataframes, upsampling=True))
svc = svm.LinearSVC()
for d1 in dicts:
  for d2 in d1:
    # the same classifier object is fitted again for every feature set
    svc.fit(d2["train"]["features"], d2["train"]["label"])
    prediction = svc.predict(d2["test"]["features"])
    print(metrics.classification_report(d2["test"]["label"], prediction))

              precision    recall  f1-score   support

           0       0.73      0.59      0.65     44096
           1       0.35      0.18      0.24     20237
           2       0.28      0.68      0.40     13028

    accuracy                           0.50     77361
   macro avg       0.45      0.48      0.43     77361
weighted avg       0.55      0.50      0.50     77361

              precision    recall  f1-score   support

           0       0.72      0.59      0.65     44096
           1       0.34      0.19      0.24     20237
           2       0.29      0.66      0.40     13028

    accuracy                           0.50     77361
   macro avg       0.45      0.48      0.43     77361
weighted avg       0.55      0.50      0.50     77361





              precision    recall  f1-score   support

           0       0.79      0.73      0.76     44096
           1       0.41      0.34      0.37     20237
           2       0.47      0.72      0.56     13028

    accuracy                           0.62     77361
   macro avg       0.56      0.60      0.57     77361
weighted avg       0.64      0.62      0.62     77361

              precision    recall  f1-score   support

           0       0.57      0.37      0.45     44096
           1       0.26      0.29      0.28     20237
           2       0.17      0.35      0.23     13028

    accuracy                           0.35     77361
   macro avg       0.34      0.34      0.32     77361
weighted avg       0.43      0.35      0.37     77361

              precision    recall  f1-score   support

           0       0.81      0.70      0.75     44096
           1       0.41      0.38      0.39     20237
           2       0.47      0.74      0.58     13028

    accuracy        