In [27]:
import numpy as np
import pandas as pd

from src.data_preprocessing import clean_text, transform_tfidf
from src.model import gradient_descent, predict, pred_to_accuracy
from src.viz import graph_log_loss

In [28]:
# Read data
train_raw = pd.read_csv("data/train.csv")
test_raw = pd.read_csv("data/test.csv")
y_test_raw = pd.read_csv("data/submit.csv")

In [29]:
# Implement data cleaning and text preprocessing
test_raw = test_raw.join(y_test_raw["label"])
train_clean = clean_text(train_raw)
test_clean = clean_text(test_raw)
y_train = train_clean["label"].to_numpy()
y_test = test_clean["label"].to_numpy()

  col = col.str.replace('[^\w\s]', '')\


In [30]:
# tf-idf transformation
X_train, X_test, words = transform_tfidf(train_clean["text"], test_clean["text"])

X_train = X_train.todense()
X_test = X_test.todense()
X_train.shape

(18285, 16770)

In [32]:
n, p = X_train.shape

# Shuffle training data
indices = np.random.permutation(n)
X_train = X_train[indices,:]
y_train = y_train[indices]

# Set training parameters
folds = 3
n_alphas = 10
lambdas = [1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001]

In [33]:
# Get average train log loss for each lambda and alpha
avg_loss_by_penalty = np.empty((n_alphas + 1,len(lambdas)))
for i, lambda_ in enumerate(lambdas):
    log_loss = np.empty((n_alphas+ 1,folds))
    buckets = list(range(0, n + 1, n//folds))
    
    for k in range(folds):
        val1 = buckets[k]
        val2 = buckets[k+1]

        X_val = X_train[val1:val2,:]
        X_train = np.delete(X_train, np.s_[val1:val2], axis=0)

        y_dev = y_train[val1:val2]
        y_train2 = np.delete(y_train, np.s_[val1:val2], axis=0)

        log_loss[:,k] = gradient_descent(X_train, X_val, y_train2, y_dev,n_alphas, lambda_)

    avg_loss_by_penalty[:,i] = np.mean(log_loss, axis=1)

In [None]:
print(avg_loss_by_penalty)

[[0.6932472  0.69438084 0.23308476 0.01062063 0.0086386  0.00848593
  0.00847283 0.00847213]
 [0.69115632 0.69314166 0.26863041 0.01050025 0.00862905 0.00848544
  0.00847032 0.00847204]
 [0.68771334 0.6931415  0.31071463 0.01037934 0.00861917 0.00848641
  0.00847087 0.00847103]
 [0.69757249 0.69314398 0.36045149 0.0102626  0.00861259 0.0084839
  0.0084714  0.00847131]
 [0.68190465 0.6930182  0.41816245 0.0101429  0.0086046  0.00848308
  0.00847238 0.0084696 ]
 [0.70767408 0.69329173 0.48367046 0.01002669 0.0085945  0.00848319
  0.00847199 0.00847064]
 [0.71064744 0.69283309 0.55307056 0.00991185 0.0085874  0.00848014
  0.00847163 0.00847122]
 [0.67379116 0.69364524 0.61820572 0.0097985  0.00857885 0.00848041
  0.00846961 0.00847012]
 [0.71633571 0.69369833 0.66652161 0.00968694 0.00856984 0.00847971
  0.00847272 0.00847125]
 [0.66887354 0.69355773 0.68946844 0.00957635 0.00856146 0.00848032
  0.00847157 0.00847129]
 [0.69851061 0.69322271 0.69301996 0.0094691  0.0085541  0.0084786
  0.

In [None]:
def get_best_vals(avg_err_array, lambdas):
    ind_ridge = (np.where(avg_err_array[0] == avg_err_array[0].min())[0][0])
    ind_lasso = (np.where(avg_err_array[10] == avg_err_array[10].min())[0][0])

    enl_matrix = avg_err_array[1:10]
    alpha_idx, en_lambda_idx = np.where(enl_matrix == np.min(enl_matrix))
    alpha_idx += 1

    ridge_lambda = lambdas[ind_ridge]
    lasso_lambda = lambdas[ind_lasso]
    en_lambda = lambdas[en_lambda_idx[0]]
    best_alpha = alpha_idx[0] * 0.1

    return  ridge_lambda, lasso_lambda, en_lambda, best_alpha

In [None]:
graph_log_loss(avg_loss_by_penalty)

In [None]:
ridge_lambda, lasso_lambda, en_lambda, best_alpha = get_best_vals(avg_loss_by_penalty, lambdas)
print("Best lambda for ridge regularization: ", ridge_lambda)
print("Best lambda for lasso regularization: ", lasso_lambda)
print("Best lambda for elastic net regularization: ", en_lambda)
print("Best alpha for elastic net regularization: ", best_alpha)

In [None]:
# Train data predictions
train_pred_ridge = predict(X_train, y_train,  X_train, 0, ridge_lambda, 0.001, 10000)
train_pred_lasso = predict(X_train, y_train,  X_train, 1, lasso_lambda, 0.001, 10000)
train_pred_en = predict(X_train, y_train,  X_train, best_alpha, en_lambda, 0.001, 10000)

In [None]:
# Select threshold and calculate train accuracy
threshold = 0.9999
train_accuracy_r = pred_to_accuracy(train_pred_ridge, y_test, threshold)
train_accuracy_l = pred_to_accuracy(train_pred_lasso, y_test, threshold)
train_accuracy_en = pred_to_accuracy(train_pred_en, y_test, threshold)
print(train_accuracy_r)
print(train_accuracy_l)
print(train_accuracy_en)

In [None]:
# Test data predictions
pred_ridge = predict(X_train, y_train,  X_test, 0, ridge_lambda, 0.001, 10000)
pred_lasso = predict(X_train, y_train,  X_test, 1, lasso_lambda, 0.001, 10000)
pred_en = predict(X_train, y_train,  X_test, best_alpha, en_lambda, 0.001, 10000)

In [None]:
# Calculate test accuracy
accuracy_r = pred_to_accuracy(pred_ridge, y_test, threshold)
accuracy_l = pred_to_accuracy(pred_lasso, y_test, threshold)
accuracy_en = pred_to_accuracy(pred_en, y_test, threshold)
print(accuracy_r)
print(accuracy_l)
print(accuracy_en)