In [16]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

### Dataset

In this homework, we will use the Car price dataset. Download it from [here](https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv).

Or you can do it with `wget`:

```bash
wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv
```

We'll keep working with the `MSRP` variable, and we"ll transform it to a classification task. 

### Features

For the rest of the homework, you"ll need to use only these columns:

* `Make`,
* `Model`,
* `Year`,
* `Engine HP`,
* `Engine Cylinders`,
* `Transmission Type`,
* `Vehicle Style`,
* `highway MPG`,
* `city mpg`,
* `MSRP`

### Data preparation

* Select only the features from above and transform their names using the next line:
  ```  data.columns = data.columns.str.replace(" ', '_').str.lower()  ```
* Fill in the missing values of the selected features with 0.
* Rename `MSRP` variable to `price`.

In [17]:
df_raw = pd.read_csv("./data/data.csv")
print(df_raw.head())
print(df_raw.shape)
print(df_raw.columns)
print(df_raw.isna().sum())

  Make       Model  Year             Engine Fuel Type  Engine HP  \
0  BMW  1 Series M  2011  premium unleaded (required)      335.0   
1  BMW    1 Series  2011  premium unleaded (required)      300.0   
2  BMW    1 Series  2011  premium unleaded (required)      300.0   
3  BMW    1 Series  2011  premium unleaded (required)      230.0   
4  BMW    1 Series  2011  premium unleaded (required)      230.0   

   Engine Cylinders Transmission Type     Driven_Wheels  Number of Doors  \
0               6.0            MANUAL  rear wheel drive              2.0   
1               6.0            MANUAL  rear wheel drive              2.0   
2               6.0            MANUAL  rear wheel drive              2.0   
3               6.0            MANUAL  rear wheel drive              2.0   
4               6.0            MANUAL  rear wheel drive              2.0   

                         Market Category Vehicle Size Vehicle Style  \
0  Factory Tuner,Luxury,High-Performance      Compact         C

In [18]:
df = df_raw.copy()
df.drop(["Engine Fuel Type", "Driven_Wheels", "Number of Doors", "Market Category", "Vehicle Size", "Popularity"], axis = 1, inplace = True)

df.columns = df.columns.str.replace(' ', '_').str.lower() # modified to accommodate df designation "df" instead of "data"

df = df.fillna(0)

df.rename(columns={"msrp": "price"}, inplace = True)

### Question 1

What is the most frequent observation (mode) for the column `transmission_type`?

- `AUTOMATIC`
- `MANUAL`
- `AUTOMATED_MANUAL`
- `DIRECT_DRIVE`

In [19]:
df.transmission_type.value_counts()

AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: transmission_type, dtype: int64

### Question 2

Create the [correlation matrix](https://www.google.com/search?q=correlation+matrix) for the numerical features of your dataset. 
In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

What are the two features that have the biggest correlation in this dataset?

- `engine_hp` and `year`
- `engine_hp` and `engine_cylinders`
- `highway_mpg` and `engine_cylinders`
- `highway_mpg` and `city_mpg`

In [20]:
df.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
price                  int64
dtype: object

In [21]:
numerical = ["year", "engine_hp", "engine_cylinders", "highway_mpg", "city_mpg", "price"]

for feature in numerical:
    print(df[numerical].corrwith(df[feature]).abs())
    print()

year                1.000000
engine_hp           0.338714
engine_cylinders    0.040708
highway_mpg         0.258240
city_mpg            0.198171
price               0.227590
dtype: float64

year                0.338714
engine_hp           1.000000
engine_cylinders    0.774851
highway_mpg         0.415707
city_mpg            0.424918
price               0.650095
dtype: float64

year                0.040708
engine_hp           0.774851
engine_cylinders    1.000000
highway_mpg         0.614541
city_mpg            0.587306
price               0.526274
dtype: float64

year                0.258240
engine_hp           0.415707
engine_cylinders    0.614541
highway_mpg         1.000000
city_mpg            0.886829
price               0.160043
dtype: float64

year                0.198171
engine_hp           0.424918
engine_cylinders    0.587306
highway_mpg         0.886829
city_mpg            1.000000
price               0.157676
dtype: float64

year                0.227590
engine_hp           0

### Make `price` binary

* Now we need to turn the `price` variable from numeric into a binary format.
* Let's create a variable `above_average` which is `1` if the `price` is above its mean value and `0` otherwise.

In [22]:
df["above_average"] = df["price"]
df["above_average"].where(df["above_average"] < df.above_average.mean(), 1, inplace=True)
df["above_average"].where(df["above_average"] <= 1, 0, inplace=True)

### Split the data

* Split your data in train/val/test sets with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the `train_test_split` function) and set the seed to `42`.
* Make sure that the target value (`above_average`) is not in your dataframe.

In [23]:
df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state = 42)
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state = 42)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

del df_train["above_average"]
del df_val["above_average"]
del df_test["above_average"]

# although not explicitly noted, it makes sense to also remove price!
del df_train["price"]
del df_val["price"]
del df_test["price"]

numerical = ["year", "engine_hp", "engine_cylinders", "highway_mpg", "city_mpg"]

### Question 3

* Calculate the mutual information score between `above_average` and other categorical variables in our dataset. 
  Use the training set only.
* Round the scores to 2 decimals using `round(score, 2)`.

Which of these variables has the lowest mutual information score?
  
- `make`
- `model`
- `transmission_type`
- `vehicle_style`

In [24]:
categorical = ["make", "model", "transmission_type", "vehicle_style"]

for variable in categorical:
    print(variable)
    print(mutual_info_score(y_train, df_train[variable]))

make
0.23976875439118348
model
0.46234389209653004
transmission_type
0.02095754189630187
vehicle_style
0.08414301956779438


### Question 4

* Now let's train a logistic regression.
* Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
* Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - `model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)`
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

- 0.60
- 0.72
- 0.84
- 0.95

In [25]:
# OHE

dv = DictVectorizer(sparse = False)

train_dict = df_train[categorical + numerical].to_dict(orient = "records")
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient = "records")
X_val = dv.transform(val_dict)

In [26]:
# Fit model

model = LogisticRegression(solver = "liblinear", C = 10, max_iter = 1000, random_state = 42)
model.fit(X_train, y_train)

In [27]:
# Predict accuracy

y_pred = model.predict_proba(X_val)[:, 1]
churn_decision = (y_pred >= 0.5)
accuracy = (y_val == churn_decision).mean()

print(accuracy)

0.9454469156525388


### Question 5 

* Let's find the least useful feature using the *feature elimination* technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature. 

Which of following feature has the smallest difference?

- `year`
- `engine_hp`
- `transmission_type`
- `city_mpg`

> **Note**: the difference doesn't have to be positive

In [28]:
df_small_train = df_train.copy()
df_small_val = df_val.copy()
categorical_small = categorical.copy()
numerical_small = numerical.copy()

for i in df_train.columns:
    print(i)
    del df_small_train[i]

    if i in categorical_small:
        categorical_small.remove(i)

    elif i in numerical_small:
        numerical_small.remove(i)

    train_small_dict = df_small_train[categorical_small + numerical_small].to_dict(orient = "records")
    X_small_train = dv.fit_transform(train_small_dict)

    val_small_dict = df_small_val[categorical_small + numerical_small].to_dict(orient = "records")
    X_small_val = dv.transform(val_small_dict)

    model.fit(X_small_train, y_train)

    y_pred = model.predict_proba(X_small_val)[:, 1]
    churn_decision = (y_pred >= 0.5)
    print((y_val == churn_decision).mean() - accuracy)

    # reset for next iteration
    df_small_train = df_train.copy()
    df_small_val = df_val.copy()
    categorical_small = categorical.copy()
    numerical_small = numerical.copy()

make
0.0012589173310952884
model
-0.026017624842635367
year
0.0025178346621905767
engine_hp
-0.020981955518254325
engine_cylinders
0.0012589173310952884
transmission_type
-0.0008392782207301552
vehicle_style
-0.0025178346621904657
highway_mpg
0.0012589173310952884
city_mpg
0.00041963911036513313


### Question 6

* For this question, we'll see how to use a linear regression model from Scikit-Learn.
* We'll need to use the original column `price`. Apply the logarithmic transformation to this column.
* Fit the Ridge regression model on the training data with a solver `'sag'`. Set the seed to `42`.
* This model also has a parameter `alpha`. Let's try the following values: `[0, 0.01, 0.1, 1, 10]`.
* Round your RMSE scores to 3 decimal digits.

Which of these alphas leads to the best RMSE on the validation set?

- 0
- 0.01
- 0.1
- 1
- 10

> **Note**: If there are multiple options, select the smallest `alpha`.

In [29]:
df = df_raw.copy()
df.drop(["Engine Fuel Type", "Driven_Wheels", "Number of Doors", "Market Category", "Vehicle Size", "Popularity"], axis = 1, inplace = True)

df.columns = df.columns.str.replace(' ', '_').str.lower()
df = df.fillna(0)
df.rename(columns={"msrp": "price"}, inplace = True)

df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state = 42)
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state = 42)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

y_train = df_train.price.values
y_val = df_val.price.values

del df_train["price"]
del df_val["price"]

numerical = ["year", "engine_hp", "engine_cylinders", "highway_mpg", "city_mpg"]
categorical = ["make", "model", "transmission_type", "vehicle_style"]

dv = DictVectorizer()

train_dict = df_train[categorical + numerical].to_dict(orient = "records")
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient = "records")
X_val = dv.transform(val_dict)

In [36]:
alphas = [0, 0.01, 0.1, 1, 10]

for i in alphas:
    ridge = Ridge(alpha = i, solver = "sag", random_state = 42)
    ridge.fit(X_train, y_train)
    y_pred = ridge.predict(X_val)

    rmse = np.sqrt(mean_squared_error(y_val, y_pred))

    print(i)
    print(round(rmse, 3))

0
32034.67
0.01
31320.825
0.1
31292.046
1
31803.416
10
41567.732


In [37]:
# these are wild RMSE values, no idea why I got them