In [74]:
import pandas as pd
import numpy as np

In [19]:
# Columns of the raw CSV that the rest of the notebook works with.
mycols = [
    "Make",
    "Model",
    "Year",
    "Engine HP",
    "Engine Cylinders",
    "Transmission Type",
    "Vehicle Style",
    "highway MPG",
    "city mpg",
    "MSRP",
]


In [20]:
# Load the raw dataset; the path is relative to the notebook's directory.
df = pd.read_csv(filepath_or_buffer="../data.csv")

In [21]:
df.columns

Index(['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP',
       'Engine Cylinders', 'Transmission Type', 'Driven_Wheels',
       'Number of Doors', 'Market Category', 'Vehicle Size', 'Vehicle Style',
       'highway MPG', 'city mpg', 'Popularity', 'MSRP'],
      dtype='object')

In [22]:
# Keep only the columns of interest. The explicit .copy() makes `df` an
# independent frame: later cells mutate it in place (fillna, rename, new
# columns), and doing that on a derived selection can raise
# SettingWithCopyWarning in classic (non copy-on-write) pandas.
df = df[mycols].copy()

In [23]:
# Normalise the headers to snake_case: lower-case, spaces -> underscores.
df.columns = [name.lower().replace(" ", "_") for name in df.columns]


In [24]:
df.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'msrp'],
      dtype='object')

In [29]:
df.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
msrp                 0
dtype: int64

In [26]:
# Replace missing values with 0. Rebinding instead of inplace=True avoids
# mutating a frame that earlier cells may still reference and keeps the cell
# safe to re-run.
# NOTE(review): by execution count this cell ran before the isnull() check
# shown above it, which is why that check reports zero missing values.
df = df.fillna(0)

In [30]:
# Rename the target column msrp -> price. Rebinding (no inplace=True) keeps
# the cell idempotent: a second run finds no "msrp" column and is a no-op.
df = df.rename(columns={"msrp": "price"})

In [34]:
df.transmission_type.value_counts()

transmission_type
AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: count, dtype: int64

In [63]:
df.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
price                  int64
above_average          int64
dtype: object

In [40]:
# Numeric columns used for the correlation matrix.
num_cols = "year engine_hp engine_cylinders highway_mpg city_mpg price".split()

In [41]:
df[num_cols].corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


In [54]:
df.price.mean()

40594.737032063116

In [43]:
# Boolean flag: is this car's price strictly above the mean price?
df["above_average"] = df["price"].gt(df["price"].mean())

In [45]:
# Store the flag as 0/1 integers instead of booleans.
df = df.astype({"above_average": int})

In [46]:
df["above_average"]

0        1
1        1
2        0
3        0
4        0
        ..
11909    1
11910    1
11911    1
11912    1
11913    0
Name: above_average, Length: 11914, dtype: int64

In [47]:
from sklearn.model_selection import train_test_split

In [50]:
# 60/20/20 split: hold out 20% as test, then take 25% of the remaining 80%
# as validation. The seed is fixed so the partition is reproducible.
df_fulltrain, df_test = train_test_split(df, random_state=42, test_size=0.2)
df_train, df_val = train_test_split(df_fulltrain, random_state=42, test_size=0.25)


In [56]:
df_train.price

3972      33599
1997      26245
5216     248000
2805      24990
11369     20475
          ...  
9232      37655
5710      25135
11306     28345
4414       2000
10286     40220
Name: price, Length: 7148, dtype: int64

In [51]:
len(df_test), len(df_val), len(df_train)

(2383, 2383, 7148)

In [52]:
# Pull the price column out of each partition as a NumPy array.
y_train, y_val, y_test = (
    part["price"].to_numpy() for part in (df_train, df_val, df_test)
)

In [57]:
y_train

array([ 33599,  26245, 248000, ...,  28345,   2000,  40220])

In [58]:
# Remove price from the feature frames (it was captured in y_* above).
df_train = df_train.drop(columns=["price"])
df_test = df_test.drop(columns=["price"])
df_val = df_val.drop(columns=["price"])

In [61]:
from sklearn.metrics import mutual_info_score

In [62]:
mutual_info_score(df_fulltrain.above_average, df_fulltrain.highway_mpg)

0.04379543487177338

In [65]:
df_fulltrain.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
price                  int64
above_average          int64
dtype: object

In [66]:
# Categorical (object-dtype) feature columns.
cat_columns = "make model transmission_type vehicle_style".split()

In [75]:
# Mutual information between each categorical column and the above_average
# flag, rounded to two decimals.
for c in cat_columns:
    mi = np.round(
        mutual_info_score(df_fulltrain["above_average"], df_fulltrain[c]),
        2,
    )
    print(c, mi)

make 0.24
model 0.46
transmission_type 0.02
vehicle_style 0.08


In [76]:
def mutual_info_above_avg_score(series):
    """Return the mutual information of `series` with `df_fulltrain.above_average`.

    The score comes from sklearn's `mutual_info_score` and is rounded to two
    decimals. `df_fulltrain` is read from the notebook's global namespace.
    """
    target = df_fulltrain["above_average"]
    score = mutual_info_score(series, target)
    return np.round(score, 2)

In [78]:
df_fulltrain[cat_columns].apply(mutual_info_above_avg_score)

make                 0.24
model                0.46
transmission_type    0.02
vehicle_style        0.08
dtype: float64

In [79]:
from sklearn.feature_extraction import DictVectorizer

In [80]:
df_train.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'above_average'],
      dtype='object')

In [83]:
# Classification targets: the 0/1 above_average flag of each partition.
y_train, y_test, y_val = (
    part["above_average"].to_numpy() for part in (df_train, df_test, df_val)
)


In [85]:
# Remove the label from the feature frames so it cannot be used as a feature.
df_train = df_train.drop(columns=["above_average"])
df_test = df_test.drop(columns=["above_average"])
df_val = df_val.drop(columns=["above_average"])


In [86]:
# One dict per training row, the input format DictVectorizer expects.
train_dicts = df_train.to_dict("records")

In [88]:
# Vectoriser that turns the row dicts into a dense (sparse=False) feature matrix.
dv = DictVectorizer(sparse=False)

In [89]:
# Fit the vectoriser on the training rows only and build the training matrix.
X_train = dv.fit_transform(train_dicts)

In [92]:
# One dict per validation row, mirroring how train_dicts is built.
val_dicts = df_val.to_dict("records")

In [93]:
# transform only (no fit): validation rows reuse the mapping already fitted in dv.
X_val = dv.transform(val_dicts)

In [95]:
from sklearn.linear_model import LogisticRegression

In [112]:
# Logistic regression classifier; seed fixed for reproducibility.
model = LogisticRegression(
    solver='liblinear',
    C=10,
    max_iter=1000,
    random_state=42,
)

In [113]:
# Train on the vectorised training rows; y_train holds the above_average labels here.
model.fit(X_train, y_train)

In [114]:
model.intercept_[0]

-0.08890611172877781

In [122]:
# Keep column 1 of predict_proba, i.e. the probability of the positive class.
y_pred = model.predict_proba(X_val)[:, 1]

In [123]:
# Hard decision at a 0.5 probability threshold.
ab_av = y_pred >= 0.5

In [124]:
y_val

array([0, 1, 0, ..., 0, 1, 1])

In [126]:
ab_av.astype(int)

array([0, 1, 0, ..., 0, 1, 1])

In [127]:
# Validation accuracy: share of rows where the thresholded prediction matches the label.
np.mean(y_val == ab_av)

0.9345362987830466

In [144]:
def _validation_accuracy(train_df, val_df):
    """Fit the logistic regression on `train_df` and return accuracy on `val_df`.

    Labels come from the notebook-level `y_train` / `y_val`. A fresh
    DictVectorizer and model are created on every call, so no fitted state is
    shared between runs and the notebook-level `X_train`, `X_val`, `y_pred`,
    `model` and `dv` are left untouched.
    """
    vectorizer = DictVectorizer(sparse=False)
    clf = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    features_train = vectorizer.fit_transform(train_df.to_dict(orient="records"))
    features_val = vectorizer.transform(val_df.to_dict(orient="records"))
    clf.fit(features_train, y_train)
    predicted_positive = clf.predict_proba(features_val)[:, 1] >= .5
    return (y_val == predicted_positive).mean()


# Baseline accuracy with every feature present. It is computed here rather
# than pasted in as a literal, so it cannot go stale when upstream cells change.
baseline = _validation_accuracy(df_train, df_val)

# Leave-one-feature-out: retrain without each column in turn and record the
# accuracy and its absolute distance from the baseline.
res = []
for c in df_train.columns:
    acc = _validation_accuracy(df_train.drop(columns=[c]), df_val.drop(columns=[c]))
    diff = np.absolute(acc - baseline)
    res.append([c, acc, diff])
res
    
    
    
    
    

[['make', 0.9458665547629039, 0.011330255979857373],
 ['model', 0.9240453210239195, 0.010490977759127107],
 ['year', 0.9467058329836341, 0.012169534200587528],
 ['engine_hp', 0.9236256819135543, 0.01091061686949224],
 ['engine_cylinders', 0.946286193873269, 0.011749895090222395],
 ['transmission_type', 0.9450272765421738, 0.010490977759127218],
 ['vehicle_style', 0.9408308854385229, 0.006294586655476331],
 ['highway_mpg', 0.9412505245488879, 0.006714225765841353],
 ['city_mpg', 0.946286193873269, 0.011749895090222395]]

In [145]:
# Order the rows by the absolute accuracy difference, smallest first.
res = sorted(res, key=lambda row: row[2])

In [146]:
res

[['vehicle_style', 0.9408308854385229, 0.006294586655476331],
 ['highway_mpg', 0.9412505245488879, 0.006714225765841353],
 ['model', 0.9240453210239195, 0.010490977759127107],
 ['transmission_type', 0.9450272765421738, 0.010490977759127218],
 ['engine_hp', 0.9236256819135543, 0.01091061686949224],
 ['make', 0.9458665547629039, 0.011330255979857373],
 ['engine_cylinders', 0.946286193873269, 0.011749895090222395],
 ['city_mpg', 0.946286193873269, 0.011749895090222395],
 ['year', 0.9467058329836341, 0.012169534200587528]]

In [154]:
# Switch the target to log(1 + price) for the regression part.
# NOTE(review): this overwrites `price`, so running the cell twice applies
# the log transform twice.
df = df.assign(price=np.log1p(df["price"]))

In [155]:
# Same 60/20/20 split and seed as before, redone on the log-price frame.
df_fulltrain, df_test = train_test_split(df, random_state=42, test_size=0.2)
df_train, df_val = train_test_split(df_fulltrain, random_state=42, test_size=0.25)


In [156]:
# Regression targets: the (log-transformed) price of each partition.
y_train, y_val, y_test = (
    part["price"].to_numpy() for part in (df_train, df_val, df_test)
)

In [164]:
# Drop both target-derived columns from the feature frames. `price` is the
# regression target (already captured in y_train / y_val / y_test) and
# `above_average` is computed from it, so leaving either in the features
# would leak the label into the model.
# errors="ignore" makes the cell safe to re-run after the columns are gone.
df_train = df_train.drop(columns=["price", "above_average"], errors="ignore")
df_val = df_val.drop(columns=["price", "above_average"], errors="ignore")
df_test = df_test.drop(columns=["price", "above_average"], errors="ignore")


In [165]:
df_train.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
dtype: object

In [189]:
# Re-encode the feature frames: fit the vectoriser on train, reuse it for validation.
train_dicts, val_dicts = (part.to_dict("records") for part in (df_train, df_val))
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [190]:
# Ridge regularisation strengths to compare.
alpha = [0, 0.01, 0.1, 1, 10]

In [191]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [192]:
# Check the type of the RMSE value. The root is taken with np.sqrt because
# the `squared=False` argument was deprecated in scikit-learn 1.4 and removed
# in 1.6.
# NOTE(review): y_pred here is whatever an earlier cell left behind, not a
# prediction from the Ridge model fitted below.
type(np.sqrt(mean_squared_error(y_val, y_pred)))

numpy.float64

In [194]:
# Fit one Ridge model per regularisation strength and report validation RMSE
# on the log-price target, rounded to five decimals.
for a in alpha:
    model = Ridge(solver='sag', random_state=42, alpha=a)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    # np.sqrt(MSE) replaces mean_squared_error(..., squared=False), which was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred)).round(5)
    print(a, rmse)



0 0.48679




0.01 0.48679




0.1 0.4868




1 0.48682
10 0.48703






0.4868168752609268