In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
# Ask TensorFlow which physical devices it can see on this machine.
# NOTE(review): none of the cells shown here use TensorFlow afterwards — confirm
# this check is still needed.
tf.config.list_physical_devices()

## Load the Data

In [None]:
# Read the raw table from disk.
df = pd.read_csv('data.csv')

# Drop the most extreme 'HIVincidence' observation: only rows strictly below the
# column maximum are kept (rows tied at the maximum, or with a missing value in
# that column, fail the comparison and are dropped as well).
max_incidence = df['HIVincidence'].max()
df = df.loc[df['HIVincidence'] < max_incidence]
df.head()

## Separate into X and y


In [None]:
# Split the target off the feature frame.
# `pop` removes 'HIVincidence' from `df` in place, so running this cell a second
# time finds no such column. In that case keep the `y` extracted on the first run
# rather than failing. An explicit column check is used instead of a bare
# `except: pass`, which would also hide unrelated errors and interrupts.
if 'HIVincidence' in df.columns:
    y = df.pop('HIVincidence')

# Everything left in `df` is treated as a feature; `X` is the plain NumPy array.
X = df.values

## How many observations have Zero incidence?

## ~75%

In [None]:
# Fraction of observations whose HIV incidence is exactly zero.
y.eq(0).mean()

## Scale all Features

In [None]:
from sklearn.preprocessing import StandardScaler as SS

# Standardise every feature column (zero mean, unit variance).
# NOTE(review): the scaler is fit on all rows before the train/test split below,
# so test rows contribute to the scaling statistics — consider fitting on the
# training split only.
X = SS().fit(X).transform(X)

## Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split as TTS

# Hold out part of the data for evaluation; the fixed seed keeps the split
# reproducible between runs.
(X_train,
 X_test,
 y_train,
 y_test) = TTS(X, y, random_state=2)

## Make a very simple model to predict `Zero` or `Non-Zero` HIV Incidence

In [None]:
from sklearn.linear_model import LogisticRegression as LR

# Binary target: True where the observed incidence is above zero.
has_incidence_train = y_train > 0
model = LR(max_iter=1000).fit(X_train, has_incidence_train)


## Score the Model. 

In [None]:
# Accuracy of the zero / non-zero classifier on each split, shown as (train, test).
train_accuracy = model.score(X_train, y_train > 0)
test_accuracy = model.score(X_test, y_test > 0)
train_accuracy, test_accuracy

## 90% Accuracy is a reasonable starting point for such a simple model.
 
## Now we'll look at only the portion of the data with non-zero HIV Incidence and make a regression model

In [None]:
# Keep only the observations with incidence above zero, applying the same
# boolean mask to the feature matrix and to the target.
nonzero_mask = y > 0
HIV_X = X[nonzero_mask]
HIV_y = y[nonzero_mask]

In [None]:
# (rows, features) of the non-zero-incidence subset.
HIV_X.shape

In [None]:
# Train/test split of the non-zero subset, seeded like the earlier split so the
# partition is reproducible.
HIV_split = TTS(HIV_X, HIV_y, random_state=2)
HIV_X_train, HIV_X_test, HIV_y_train, HIV_y_test = HIV_split

# First, let's look at a purely LASSO model.


## Fit 100 different LASSO Models with alpha in the range of $10^{-3}$ to $10$

In [None]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error as mse

# Per-alpha diagnostics, appended in the same order as `alphas`.
train_score, test_score = [], []   # RMSE on the train / test split
train_r2, test_r2 = [], []         # R-squared on the train / test split
coefs = []                         # fitted coefficient vector for each alpha

# 100 regularisation strengths, log-spaced from 1e-3 to 1e1.
alphas = np.logspace(-3, 1, 100)

for alpha in alphas:
    regression = Lasso(alpha=alpha, max_iter=10000)
    regression.fit(HIV_X_train, HIV_y_train)

    # Take the square root explicitly rather than passing `squared=False`:
    # that keyword was deprecated in scikit-learn 1.4 and removed afterwards,
    # whereas sqrt(MSE) gives the same RMSE on every release.
    train_score.append(np.sqrt(mse(HIV_y_train, regression.predict(HIV_X_train))))
    test_score.append(np.sqrt(mse(HIV_y_test, regression.predict(HIV_X_test))))

    # `score` on a regressor is the coefficient of determination (R-squared).
    train_r2.append(regression.score(HIV_X_train, HIV_y_train))
    test_r2.append(regression.score(HIV_X_test, HIV_y_test))

    coefs.append(regression.coef_)
    


## Plot the Train and Test RMSE Error.
## Identify the alpha value that yields the lowest Test RMSE

In [None]:
# Choose the alpha with the highest test-set R-squared. On a fixed test set the
# total sum of squares is constant, so this is also the alpha with the lowest
# test RMSE.
best_alpha = alphas[np.argmax(test_r2)]

# RMSE curves for both splits, with the chosen alpha marked by a vertical line.
plt.plot(alphas, train_score, label='train')
plt.plot(alphas, test_score, label='test')
plt.legend()
plt.axvline(best_alpha)
plt.xscale('log')
plt.title("LASSO")
plt.xlabel('Alpha')
plt.ylabel('RMSE')
plt.annotate(f'best alpha: {round(best_alpha,2)}',xy=(.005, 10));

In [None]:
# R-squared curves for both splits over the same alpha grid; the vertical line
# marks the alpha selected in the previous cell.
plt.plot(alphas, train_r2, label='train')
plt.plot(alphas, test_r2, label='test')
plt.legend()
plt.axvline(best_alpha)
plt.xscale('log')
plt.title("LASSO")
plt.xlabel('Alpha')
plt.ylabel('R-Squared')

In [None]:
# Display the selected regularisation strength at full precision.
best_alpha

## Plot the coefficients and a vertical line to indicate the best alpha value.

## Notice that at this alpha value, some of our coefficients are zero

In [None]:
# One line per feature: how each LASSO coefficient changes as alpha grows.
plt.plot(alphas, coefs)
plt.title('Coefficients')
plt.axvline(best_alpha)   # selected alpha
plt.xscale('log')
plt.xlabel('LASSO Alpha');

## Build our final model based on the alpha value that gives lowest test RMSE

In [None]:
# Refit a single LASSO model at the selected alpha; `fit` returns the estimator
# itself, so the final expression still displays the fitted model.
best_model = Lasso(alpha=best_alpha, max_iter=10000).fit(HIV_X_train, HIV_y_train)
best_model

## Use this best model to make predictions and compare to actuals

In [None]:
# Predicted vs. actual incidence for both splits.
train_predictions = best_model.predict(HIV_X_train)
test_predictions = best_model.predict(HIV_X_test)

plt.scatter(train_predictions, HIV_y_train, label='Train')
plt.scatter(test_predictions, HIV_y_test, label='Test')
plt.legend()
# Reference diagonal (prediction == actual) drawn over the fixed range 0-120.
plt.plot([0, 120], [0, 120])
plt.xlabel('Predictions')
plt.ylabel('Actuals');



### How many coefficients were deleted from the model?

In [None]:
# Number of features whose fitted coefficient is exactly zero.
(best_model.coef_==0).sum()

### Which columns were not deleted?

In [None]:
# `df` no longer contains the target (it was popped when X and y were separated),
# so its columns line up one-to-one with the coefficient vector.
kept_columns = df.columns[best_model.coef_ != 0]
print('\n'.join(kept_columns))

### Which columns were deleted?

In [None]:
# `df` no longer contains the target (it was popped when X and y were separated),
# so its columns line up one-to-one with the coefficient vector.
dropped_columns = df.columns[best_model.coef_ == 0]
print('\n'.join(dropped_columns))