In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import sklearn

In [37]:
training = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
training = training.iloc[:, 1:] # Dropping 'Unnamed' column
test = test.iloc[:, 1:] # Dropping 'Unnamed' column

# logistic regression

## assumptions

### 1: linearly relationship between input and log-odds

### 2: collinearity

### 3: outliers

## Fit the model

In [38]:
def onehot_feature(pd_data, column_name):
    # Retrieve the unique values (the categories) and an index for each sample
    # specifying the sample category (values[value_idx] reconstruct the original array)
    col_values = pd_data[column_name].to_numpy().astype('<U')
    values, value_idx = np.unique(col_values, return_inverse=True)
    n_values = values.size
    # Create a temporary identity matrix to convert value_idx into one-hot features
    onehots = np.eye(n_values) #when you use an array to index another array in NumPy, it selects rows from the indexed array based on the values in the index array. e[a] selects rows from the identity matrix e based on the values in array a
    value_onehot = onehots[value_idx]
    # Remove the categorical feature
    pd_data = pd_data.drop(column_name, axis=1)
    # Add the new featues
    for i in range(n_values):
        pd_data["{}_{}".format(column_name, values[i])] = value_onehot[:, i]
        
    return pd_data

training = onehot_feature(training, 'DNAtype')

test = onehot_feature(test, 'DNAtype')

In [39]:
training = training.drop(["SpeciesID", "SpeciesName"], axis = 1)
test = test.drop(["SpeciesID", "SpeciesName"], axis = 1)

In [40]:
import re

# Divide il dataset in feature e target
X_train = training.drop(["Kingdom", "AGA"], axis = 1)  # Feature
y_train = training['Kingdom']  # Target

# Adding missing columns for DNA type
pattern = r'DNAtype_\d+'
set_colnames_training = set()
for col in training.columns.tolist():
    if re.match(pattern, col):
        set_colnames_training.add(col)
set_colnames_test = set()
for col in test.columns.tolist():
    if re.match(pattern, col):
        set_colnames_test.add(col)
missing_columns_in_test = list(set_colnames_training.difference(set_colnames_test.intersection(set_colnames_training)))
for col in missing_columns_in_test:
    test[col] = 0

X_test = test.drop(columns = ['Kingdom'])
y_test = test['Kingdom']

X_test = X_test[X.columns]

In [41]:
def accuracy(y_pred, y_true):
    return (y_pred == y_true).mean() #.sum()/y_pred.shape[0]

In [71]:
from sklearn.linear_model import LogisticRegression             # 1- model selection
model = LogisticRegression(solver="newton-cg", penalty= None)  # 2- hyperparams

# Addestra il modello
model.fit(X_train.values, y_train)



In [72]:
# Print the estimated coefficients
print("beta0 =", model.intercept_.squeeze())
print("beta1 =", model.coef_.squeeze())

beta0 = [-0.8712231  -0.03309422  1.25544877 -0.21893754 -0.56085932 -1.08554166
  2.17294705 -0.81686841 -0.72667704  0.21017838  0.67462709]
beta1 = [[ 3.75797337e-06 -2.41297523e-01  3.07312443e-02  4.01422050e-01
  -4.07942389e-01  2.30389729e-01  1.39036292e-02  2.61742700e-01
  -6.15425412e-01  7.25167508e-02 -2.17421897e-01  1.71942666e+00
  -1.00592561e-01  6.07253092e-02  1.51191689e-01  5.63368322e-01
  -2.87688077e-01 -3.55266728e-01 -2.70828114e-01  2.87319710e-01
  -1.42776411e-02 -2.69554548e-01 -1.77558513e-01 -1.27000682e-01
   5.00406110e-03 -1.71189322e-01  2.90676102e-03 -1.03204497e-01
   3.36269117e-01  1.66242783e-01 -3.62256535e-01 -3.38638583e-01
  -6.02416533e-02 -1.09921300e-01 -1.28524279e-01 -9.40490141e-02
  -2.25792126e-01 -9.55624832e-03 -9.63727609e-02  8.68235311e-02
   2.87637216e-01  3.97576346e-01 -6.24903013e-01 -5.09887393e-01
  -2.41884822e-01 -2.35419246e-01 -2.99208992e-01 -4.29045401e-01
  -3.86592890e-01 -2.33826828e-01  3.34392634e-01  2.8362

In [73]:
print("Train accuracy:", accuracy(y_train, model.predict(X_train.values)))
print("Test accuracy:", accuracy(y_test, model.predict(X_test.values)))

Train accuracy: 0.6389368643254654
Test accuracy: 0.6304681504221028


In [76]:
from scipy.stats import norm, zscore

def z_test(X, y, model, names, alpha=None):
    n_samples, n_features = X.shape
    betas = np.concatenate([model.intercept_, model.coef_.reshape(-1)])
    
    # Compute the prediction
    pred = model.predict_proba(X) # [N, 2]
    y = y.reshape(-1)    
    X = np.concatenate([np.ones([X.shape[0], 1]), X], axis=-1)
    n_samples, n_features = X.shape
    
    V = np.diagflat(pred[:,0] * (1- pred[:,0])) #pred[:,0] * (1- pred[:,0]) #diagglat provides two dimensional array provided the diagonal
    covLogit = np.linalg.inv(X.T @ V @ X) #np.linalg.inv provides the inverse of the matrix
    se_b = np.sqrt(np.diag(covLogit)) #computing the standard error, we have an array of all beta parametrs
    
    z_stat_b = (betas-0)/se_b

    # Compute the p-value (two-sided test)
    p_values = np.array([2 * norm.sf(np.abs(z_stat)) for z_stat in z_stat_b])#!differnece from the previous we are computing norm. for normal distribution 
    
    df = pd.DataFrame()
    df["Name"] = names
    df["Coefficients"] = betas
    df["Standard Errors"] = np.round(se_b, decimals=4)
    df["Z-stat"] = np.round(z_stat_b, decimals=1)
    df["p-value"] = p_values
    if alpha:
        rejectH0 = p_values < alpha
        df["reject H0"] = rejectH0    
    
    return df


X = X_train.to_numpy()
y = y_train.to_numpy()


z_test(X, y, model, ["Intercept", *X_train.columns.tolist()], alpha=0.0001)

MemoryError: Unable to allocate 829. MiB for an array with shape (10422, 10422) and data type float64

### penalty = l2 fa il ridge

In [45]:
from sklearn.linear_model import LogisticRegression             # 1- model selection
model = LogisticRegression(solver="newton-cg", penalty= "l2")  # 2- hyperparams

# Addestra il modello
model.fit(X_train.values, y_train)



In [46]:
# Print the estimated coefficients
print("beta0 =", model.intercept_.squeeze())
print("beta1 =", model.coef_.squeeze())

beta0 = [-0.91339052  0.08903074  0.96209825  0.04397373 -0.62819517 -1.53797411
  2.11276333 -0.62189921 -0.56543464  0.10512911  0.95389849]
beta1 = [[ 1.51589218e-06 -7.93206604e-02  3.03269544e-03  1.02116195e-01
  -1.28114812e-01  6.41250515e-02 -1.76023855e-03  6.69568025e-02
  -1.96860414e-01  3.16331195e-03 -7.65528124e-02  5.13110036e-01
  -3.44104565e-02  1.23712610e-02  4.20097194e-02  1.64078096e-01
  -9.10428320e-02 -1.14358915e-01 -9.61194567e-02  8.13185221e-02
  -1.69196240e-02 -8.43081351e-02 -5.71002655e-02 -4.30539382e-02
  -5.45916055e-03 -5.46039794e-02 -4.75333767e-03 -4.32435250e-02
   9.90012197e-02  4.79638586e-02 -1.15379893e-01 -1.05443644e-01
  -2.53993804e-02 -3.69253758e-02 -4.21033629e-02 -2.99083282e-02
  -7.36007468e-02 -8.84245967e-03 -3.74136935e-02  2.29924911e-02
   7.63661081e-02  1.20756762e-01 -2.00663576e-01 -1.56443843e-01
  -8.67661764e-02 -7.78263317e-02 -9.22765561e-02 -1.29504408e-01
  -1.23433162e-01 -7.45881430e-02  8.74962754e-02  9.2806

In [47]:
print("Train accuracy:", accuracy(y_train, model.predict(X_train.values)))
print("Test accuracy:", accuracy(y_test, model.predict(X_test.values)))

Train accuracy: 0.5659182498560736
Test accuracy: 0.5506523407521106


### lasso... doesn't work

In [66]:
C_vals = np.linspace(1e-4, 0.5, 100)
betas = []

from sklearn import preprocessing
X_std = preprocessing.scale(X_train)

for C in C_vals:
    model = LogisticRegression(solver='saga', penalty='l1', C=C)
    model.fit(X_std, y_train) 
    beta = model.coef_.reshape(-1) # colecting betas
    betas.append(beta)
betas = np.stack(betas, axis=1)  # [Nfeat, NCvals] how the beta changes with the C

for beta, name in zip(betas, X_train.columns.tolist()):
    plt.plot(C_vals, beta, label=name)
plt.legend()
plt.xlabel("C=1/lambda")
plt.ylabel("beta")



KeyboardInterrupt: 

# prove sul dataset con solo i codoni

In [54]:
training = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
training = training.iloc[:, 1:] # Dropping 'Unnamed' column
test = test.iloc[:, 1:] # Dropping 'Unnamed' column

In [55]:
training_prova = training.drop(["SpeciesID", "SpeciesName", "Ncodons", "DNAtype"], axis = 1)
test_prova = test.drop(["SpeciesID", "SpeciesName", "Ncodons", "DNAtype"], axis = 1)

In [56]:
from sklearn.metrics import accuracy_score
import re

# Divide il dataset in feature e target
X_train_prova = training_prova.drop(["Kingdom", "AGA"], axis = 1)  # Feature
y_train_prova = training_prova['Kingdom']  # Target

# Adding missing columns for DNA type
pattern = r'DNAtype_\d+'
set_colnames_training_prova = set()
for col in training_prova.columns.tolist():
    if re.match(pattern, col):
        set_colnames_training_prova.add(col)
set_colnames_test_prova = set()
for col in test_prova.columns.tolist():
    if re.match(pattern, col):
        set_colnames_test_prova.add(col)
missing_columns_in_test_prova = list(set_colnames_training_prova.difference(set_colnames_test_prova.intersection(set_colnames_training_prova)))
for col in missing_columns_in_test_prova:
    test_prova[col] = 0

X_test_prova = test_prova.drop(columns = ['Kingdom'])
y_test_prova = test_prova['Kingdom']

X_test_prova = X_test_prova[X_prova.columns]

In [59]:
from sklearn.linear_model import LogisticRegression             # 1- model selection
model = LogisticRegression(solver="newton-cg", penalty= None)  # 2- hyperparams

# Addestra il modello
model.fit(X_train_prova.values, y_train_prova)

In [60]:
# Print the estimated coefficients
print("beta0 =", model.intercept_.squeeze())
print("beta1 =", model.coef_.squeeze())

beta0 = [ -5.89922634 -28.74683965  27.5989626   -5.37201055  -5.02417613
  -4.28425665  15.36030445  -7.58095906 -12.01750417  75.12987413
 -49.16416864]
beta1 = [[-1.79762751e+01 -7.25194041e+01 -2.33844259e+00 -1.28262193e+01
   8.00617015e+01 -3.54919984e+00  4.06264521e+01 -8.99023568e+01
   4.04603184e+01  4.32512633e+00  6.18208414e+01  3.09806933e+01
   1.37132097e+01  2.39222689e+02  6.66021811e+01 -1.70095073e+01
  -2.43899806e+01 -2.81620047e+01  1.10011002e+02  4.37950100e+01
  -6.28902151e+01  4.40371497e+01  2.92977607e+01  8.71810894e+01
   4.65211496e+00  6.55521185e+00  7.54832817e+00 -1.66908170e+01
  -3.54984237e+01 -1.43038395e+02 -2.55795450e+01  6.56402809e+01
   4.43790332e+01  3.85333442e+01  4.23590755e-01  7.33613365e+01
  -6.58406662e+00 -1.43193424e+02  1.33007484e+02  1.12546016e+02
   1.10044392e+02  2.15347318e+01 -2.02285175e+01 -5.75485482e+01
  -4.17817331e+00 -3.09403217e+01 -1.47075498e+02 -1.20128163e+02
  -1.05609123e+02 -6.71431641e+00 -9.73912444

In [61]:
print("Train accuracy:", accuracy(y_train_prova, model.predict(X_train_prova.values)))
print("Test accuracy:", accuracy(y_test_prova, model.predict(X_test_prova.values)))

Train accuracy: 0.8530992132028401
Test accuracy: 0.843054489639294


In [62]:
from sklearn.linear_model import LogisticRegression             # 1- model selection
model = LogisticRegression(solver="newton-cg", penalty= "l2")  # 2- hyperparams

# Addestra il modello
model.fit(X_train_prova.values, y_train_prova)

In [63]:
# Print the estimated coefficients
print("beta0 =", model.intercept_.squeeze())
print("beta1 =", model.coef_.squeeze())

beta0 = [-1.288769    1.15115039  1.02967244 -0.07770685 -0.76470022 -3.20047689
  1.66828553 -0.95336319 -0.8390956   2.2519085   1.0230949 ]
beta1 = [[-2.30655321e-01 -1.57571157e-01 -1.90382991e-01 -3.56389012e-01
  -2.55787585e-02  5.95797784e-02 -6.55403085e-01 -2.82630243e-01
  -5.01930811e-01 -4.55909422e-01  1.42255143e+00  1.49586652e-01
   1.10739996e-01  4.17445765e-01  4.08897643e-01  8.36477470e-02
  -3.63992850e-01 -2.06678957e-01  2.58845229e-01  3.75659931e-01
  -3.39147995e-01 -2.67648561e-01 -3.22113130e-01  2.67709147e-01
  -2.31963229e-02  4.13166529e-02  1.95281485e-01  2.48496240e-01
   2.54449606e-01 -5.55537834e-01 -4.70300048e-01 -3.40956419e-01
   3.42485597e-02 -7.70962210e-02  6.48725796e-02 -3.39945313e-01
   7.83377031e-03 -4.21448977e-01  3.19402772e-01  2.00033499e-01
   5.36139528e-01 -8.05275496e-01 -1.37044518e-01 -2.78012006e-01
  -2.28288669e-01 -2.80038564e-01 -3.68443215e-01 -4.79712773e-01
  -2.98830270e-01  4.05579835e-01  9.14876436e-01 -4.3544

In [64]:
print("Train accuracy:", accuracy(y_train_prova, model.predict(X_train_prova.values)))
print("Test accuracy:", accuracy(y_test_prova, model.predict(X_test_prova.values)))

Train accuracy: 0.6433506044905009
Test accuracy: 0.6400613967766692


### lasso... doesn't work

In [67]:
C_vals = np.linspace(1e-4, 0.5, 100)
betas = []

from sklearn import preprocessing
X_std = preprocessing.scale(X_train_prova)

for C in C_vals:
    model = LogisticRegression(solver='saga', penalty='l1', C=C)
    model.fit(X_std, y_train_prova) 
    beta = model.coef_.reshape(-1) # colecting betas
    betas.append(beta)
betas = np.stack(betas, axis=1)  # [Nfeat, NCvals] how the beta changes with the C

for beta, name in zip(betas, X_train_prova.columns.tolist()):
    plt.plot(C_vals, beta, label=name)
plt.legend()
plt.xlabel("C=1/lambda")
plt.ylabel("beta")



KeyboardInterrupt: 