In [None]:
# Imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data processing

In [None]:
def getDataAndLabel(df, dataColumnName, labelColumnName):
  """Extract one text column and one label column as plain lists of strings.

  Args:
    df: DataFrame holding both columns.
    dataColumnName: name of the column with the input texts.
    labelColumnName: name of the column with the labels.

  Returns:
    A tuple ``(texts, labels)`` of two equally long lists in row order; every
    entry is passed through ``str()``.
  """
  raw_texts = df.loc[:, dataColumnName].to_numpy()
  raw_labels = df.loc[:, labelColumnName].to_numpy()

  # Cast every cell to str so non-string values (numbers, missing values)
  # cannot break the downstream text vectorizer.
  texts = [str(value) for value in raw_texts]
  labels = [str(value) for value in raw_labels]

  return texts, labels

In [None]:
def getDataClass(df, dataColumnName, labelColumnName):
  """Concatenate all texts of each label into one string per class.

  Args:
    df: DataFrame holding both columns.
    dataColumnName: name of the column with the input texts.
    labelColumnName: name of the column with the labels.

  Returns:
    A list of four strings in the fixed order ``'B'``, ``'D'``, ``'I'``,
    ``'P'``. Each string is the row-ordered concatenation of
    ``str(text) + '. '`` for the rows carrying that label; rows with any
    other label are ignored.
  """
  texts = df.loc[:, dataColumnName].to_numpy()
  labels = df.loc[:, labelColumnName].to_numpy()

  # Position in the result list is determined by this order.
  class_codes = ('B', 'D', 'I', 'P')

  return [
    ''.join(str(text) + '. ' for text, row_label in zip(texts, labels) if row_label == code)
    for code in class_codes
  ]

## Label functions

In [None]:
def predict(X_train, y_train, X_test, y_test, tfidf):
  """Fit TF-IDF + logistic regression on the training data and label X_test.

  Args:
    X_train: training texts.
    y_train: training labels, aligned with ``X_train``.
    X_test: texts to classify.
    y_test: not used by this function; present in the signature only.
    tfidf: vectorizer object; it is (re)fitted on ``X_train`` by this call.

  Returns:
    The labels predicted for ``X_test``, as returned by
    ``LogisticRegression.predict``.
  """
  # The vocabulary and weights are learned from the training texts only;
  # the test texts are merely projected into that space.
  train_features = tfidf.fit_transform(X_train)
  test_features = tfidf.transform(X_test)

  classifier = LogisticRegression(max_iter = 1000).fit(train_features, y_train)

  return classifier.predict(test_features)

## Evaluation on the test set

In [None]:
def evalOnDataset(df_train, df_test, iteration=None):
  """Train on df_train, evaluate on df_test, and export the predictions.

  Prints accuracy and macro-averaged precision, recall and F1, then stores
  the predictions in ``df_test['prediction']`` (the caller's frame is
  modified in place) and writes that frame to
  ``Brunotte_with_predictions_test<iteration>_logistic_regression.xlsx``.

  Args:
    df_train: DataFrame with the columns 'review' and 'kano_labels'.
    df_test: DataFrame with the columns 'review' and 'kano_labels'.
    iteration: value inserted into the output file name. When omitted, the
      notebook-level variable ``i`` is used if it exists (this is what the
      function read implicitly before); otherwise an empty string is used.
  """
  if iteration is None:
    # Backward compatibility: the file name used to be built from the
    # global loop variable `i`, which raised NameError outside that loop.
    iteration = globals().get('i', '')

  X_train, y_train = getDataAndLabel(df_train, 'review', 'kano_labels')
  X_test, y_test = getDataAndLabel(df_test, 'review', 'kano_labels')

  # `input` only accepts 'filename', 'file' or 'content'. The per-class text
  # list that was passed here before was never used as data (the vectorizer
  # is fitted on X_train inside predict) and is rejected by scikit-learn
  # versions that validate parameters, so the default is used instead.
  tfidf = TfidfVectorizer(stop_words = "english")

  y_pred = predict(X_train, y_train, X_test, y_test, tfidf)

  acc = accuracy_score(y_test, y_pred)
  prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average = 'macro')

  print("accuracy:\t", acc)
  print("precision:\t", prec)
  print("recall:\t", rec)
  print("f1 score:\t", f1)

  df_test['prediction'] = y_pred
  df_test.to_excel("Brunotte_with_predictions_test" + str(iteration) + "_logistic_regression.xlsx")

## Evaluation on the test set, divided by labels

In [None]:
def evalOnDatasetByLabel(df_train, df_test):
  """Train on df_train and report metrics separately per true label of df_test.

  For each label in "B", "P", "D", "I" the test rows carrying that label are
  scored on their own and accuracy plus macro-averaged precision, recall and
  F1 are printed. Labels without any test row are reported as skipped.

  Args:
    df_train: DataFrame with the columns 'review' and 'kano_labels'.
    df_test: DataFrame with the columns 'review' and 'kano_labels'.

  Returns:
    A dict mapping each evaluated label to a dict with the keys
    'accuracy', 'precision', 'recall' and 'f1'.
  """
  X_train, y_train = getDataAndLabel(df_train, 'review', 'kano_labels')
  X_test, y_test = getDataAndLabel(df_test, 'review', 'kano_labels')

  # `input` only accepts 'filename', 'file' or 'content'. The per-class text
  # list that was passed here before was never used as data and is rejected
  # by scikit-learn versions that validate parameters.
  tfidf = TfidfVectorizer(stop_words = "english")

  # The training data is the same for every label, so fit and predict once
  # on the full test set and slice the predictions per label afterwards.
  y_test = np.asarray(y_test)
  y_pred = np.asarray(predict(X_train, y_train, X_test, y_test, tfidf))

  results = {}

  for label in ["B", "P", "D", "I"]:
    print("------------------------------")
    print("evaluating label", label)
    print("------------------------------")

    selected = y_test == label
    if not selected.any():
      # Nothing to score; the metric functions cannot handle empty input.
      print("no test samples with this label, skipped")
      continue

    y_true_label = y_test[selected]
    y_pred_label = y_pred[selected]

    acc = accuracy_score(y_true_label, y_pred_label)
    # NOTE(review): every subset holds a single true label. A label that only
    # occurs among the predictions gets precision 0 and, via zero_division=1,
    # recall 1, so the macro averages below are dominated by those artefacts;
    # accuracy is the informative number here. Computation kept as it was.
    prec, rec, f1, _ = precision_recall_fscore_support(y_true_label, y_pred_label, average='macro', zero_division = 1)

    print("accuracy:\t", acc)
    print("precision:\t", prec)
    print("recall:\t", rec)
    print("f1 score:\t", f1)

    results[label] = {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}

  return results

## 10-fold cross-validation

In [None]:
def crossvalidation10fold(train_data):
  """Run a shuffled 10-fold cross-validation and print averaged metrics.

  In every fold a model is trained on the training split and scored on the
  validation split, once per true label ("B", "P", "D", "I") and once on the
  whole validation split. Accuracy and macro-averaged precision, recall and
  F1 are averaged over the folds and printed, rounded to three decimals.

  Args:
    train_data: DataFrame with the columns 'review' and 'kano_labels'.
  """
  n = 10
  kf = KFold(n_splits=n, random_state = 42, shuffle = True)

  # `input` only accepts 'filename', 'file' or 'content'. The per-class text
  # list that was passed here before was never used as data and is rejected
  # by scikit-learn versions that validate parameters.
  tfidf = TfidfVectorizer(stop_words = "english")

  label_codes = ["B", "P", "D", "I"]
  report_names = ["basic", "performance", "delighter", "irrelevant", "overall"]

  # One list of per-fold values for each label plus one for "overall" (last).
  resultsAcc = [[] for _ in report_names]
  resultsPrecision = [[] for _ in report_names]
  resultsRecall = [[] for _ in report_names]
  resultsf1 = [[] for _ in report_names]

  for train_index, val_index in kf.split(train_data):
    train_df = train_data.iloc[train_index]
    val_df = train_data.iloc[val_index]

    X_train, y_train = getDataAndLabel(train_df, 'review', 'kano_labels')
    X_val, y_val = getDataAndLabel(val_df, 'review', 'kano_labels')

    # The training split is identical for all subsets of this fold, so fit
    # and predict once and slice the predictions per label afterwards.
    y_val = np.asarray(y_val)
    y_pred = np.asarray(predict(X_train, y_train, X_val, y_val, tfidf))

    # Four per-label selections followed by the whole validation split.
    selections = [y_val == code for code in label_codes]
    selections.append(np.ones(len(y_val), dtype=bool))

    for slot, selected in enumerate(selections):
      if not selected.any():
        # This fold has no sample of the label; there is nothing to score.
        continue

      acc = accuracy_score(y_val[selected], y_pred[selected])
      # NOTE(review): on the single-label subsets, labels that occur only
      # among the predictions get recall 1 through zero_division=1, which
      # inflates the macro-averaged recall. Computation kept as it was.
      prec, rec, f1, _ = precision_recall_fscore_support(y_val[selected], y_pred[selected], average='macro', zero_division = 1)

      resultsAcc[slot].append(acc)
      resultsPrecision[slot].append(prec)
      resultsRecall[slot].append(rec)
      resultsf1[slot].append(f1)

  for slot, label in enumerate(report_names):
    print("------------\n", label, "\n------------")
    print("Average accuracy:\t", np.round(np.average(resultsAcc[slot]), 3))
    print("Average precision:\t", np.round(np.average(resultsPrecision[slot]), 3))
    print("Average recall:\t", np.round(np.average(resultsRecall[slot]), 3))
    print("Average f1 score:\t", np.round(np.average(resultsf1[slot]), 3))

## Import data and run the evaluations

In [None]:
# Driver loop: for each of the five dataset variants (suffix 1..5), load the
# Excel files and run all evaluations, printing a banner before each step.
# NOTE: the loop variable must stay named `i` -- evalOnDataset reads the
# global `i` to build the name of the Excel file it writes.
for i in range(1,6):
  # Import of the datasets
  training_dataset = pd.read_excel('DATASET_downsampled_test' + str(i) + '.xlsx')
  test_dataset = pd.read_excel('Trainingskorpus_Final.xlsx')
  combined_dataset = pd.read_excel('DATASET_Trainingskorpus_combined_test' + str(i) + '.xlsx')
  
  # The backslash continuations sit inside the string literals, so the
  # indentation of the following line becomes part of the printed text.
  print("======================================\
  ITERATION " + str(i) + "\
  ======================================\n")

  # Run training on the training set, evaluation on the test set
  print("--------------------------\
  TRAINING ON TRAINING SET, EVALUATION ON TEST SET\
  -------------------------")
  evalOnDataset(training_dataset, test_dataset)
  print("\n\n")

  # Run training on the training set, evaluation on the test set --- divided by labels
  print("--------------------------\
  TRAINING ON TRAINING SET, EVALUATION ON TEST SET --- BY LABEL\
  -------------------------")
  evalOnDatasetByLabel(training_dataset, test_dataset)
  print("\n\n")

  # Perform a 10-fold cross-validation on the training dataset
  # (called the "Murat Dataset" in the original comment)
  print("--------------------------\
  CROSS-VALIDATION ON TRAINING SET\
  -------------------------")
  crossvalidation10fold(training_dataset)
  print("\n\n")

  # Perform a 10-fold cross-validation on the combined dataset
  print("--------------------------\
  CROSS-VALIDATION ON COMBINED SET\
  -------------------------")
  crossvalidation10fold(combined_dataset)
  print("\n\n\n\n")
  print("\n\n\n\n")


--------------------------  TRAINING ON TRAINING SET, EVALUATION ON TEST SET  -------------------------
accuracy:	 0.5869297163995068
precision:	 0.3841541312517382
recall:	 0.45953502262859247
f1 score:	 0.38085538580931827



--------------------------  TRAINING ON TRAINING SET, EVALUATION ON TEST SET --- BY LABEL  -------------------------
------------------------------
evaluating label B
------------------------------
accuracy:	 0.661524500907441
precision:	 0.25
recall:	 0.9153811252268602
f1 score:	 0.19907154560349535
------------------------------
evaluating label P
------------------------------
accuracy:	 0.3924050632911392
precision:	 0.25
recall:	 0.8481012658227848
f1 score:	 0.14090909090909093
------------------------------
evaluating label D
------------------------------
accuracy:	 0.6842105263157895
precision:	 0.25
recall:	 0.9210526315789473
f1 score:	 0.20312500000000003
------------------------------
evaluating label I
------------------------------
accuracy:	 0.