In [1]:
# import libraries
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
pd.options.display.float_format = '{:.4f}'.format
np.random.seed(42)

In [3]:
# Read the dataset
dataSet  = pd.read_csv("DataSet/app-store-apple-data-set-10k-apps/AppleStore.csv")

dataSet.head()

Unnamed: 0.1,Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
0,1,281656475,PAC-MAN Premium,100788224,USD,3.99,21292,26,4.0,4.5,6.3.5,4+,Games,38,5,10,1
1,2,281796108,Evernote - stay organized,158578688,USD,0.0,161065,26,4.0,3.5,8.2.2,4+,Productivity,37,5,23,1
2,3,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,USD,0.0,188583,2822,3.5,4.5,5.0.0,4+,Weather,37,5,3,1
3,4,282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,USD,0.0,262241,649,4.0,4.5,5.10.0,12+,Shopping,37,5,9,1
4,5,282935706,Bible,92774400,USD,0.0,985920,5320,4.5,5.0,7.5.1,4+,Reference,37,5,45,1


In [4]:
dataSet = pd.concat([dataSet, pd.get_dummies(dataSet['prime_genre'], prefix_sep="genre", dummy_na=True)], axis = 1)

In [5]:
dataSet = pd.concat([dataSet, pd.get_dummies(dataSet['cont_rating'], prefix_sep="cont_", dummy_na=True)], axis = 1)

In [6]:
dataSet.head()

Unnamed: 0.1,Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,...,Sports,Travel,Utilities,Weather,nan,12+,17+,4+,9+,nan.1
0,1,281656475,PAC-MAN Premium,100788224,USD,3.99,21292,26,4.0,4.5,...,0,0,0,0,0,0,0,1,0,0
1,2,281796108,Evernote - stay organized,158578688,USD,0.0,161065,26,4.0,3.5,...,0,0,0,0,0,0,0,1,0,0
2,3,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,USD,0.0,188583,2822,3.5,4.5,...,0,0,0,1,0,0,0,1,0,0
3,4,282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,USD,0.0,262241,649,4.0,4.5,...,0,0,0,0,0,1,0,0,0,0
4,5,282935706,Bible,92774400,USD,0.0,985920,5320,4.5,5.0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
subset = dataSet.sample(1000)

In [8]:
dataSet.columns

Index([       'Unnamed: 0',                'id',        'track_name',
              'size_bytes',          'currency',             'price',
        'rating_count_tot',  'rating_count_ver',       'user_rating',
         'user_rating_ver',               'ver',       'cont_rating',
             'prime_genre',   'sup_devices.num',   'ipadSc_urls.num',
                'lang.num',           'vpp_lic',              'Book',
                'Business',          'Catalogs',         'Education',
           'Entertainment',           'Finance',      'Food & Drink',
                   'Games',  'Health & Fitness',         'Lifestyle',
                 'Medical',             'Music',        'Navigation',
                    'News',     'Photo & Video',      'Productivity',
               'Reference',          'Shopping', 'Social Networking',
                  'Sports',            'Travel',         'Utilities',
                 'Weather',                 nan,               '12+',
                    

In [9]:
features =[ 'size_bytes','currency','price','rating_count_tot','rating_count_ver','user_rating','user_rating_ver',
           'sup_devices.num','ipadSc_urls.num','lang.num','vpp_lic','Book','Business','Catalogs','Education',
           'Entertainment','Finance','Food & Drink','Games','Health & Fitness','Lifestyle','Medical','Music',
           'Navigation','News','Photo & Video','Productivity','Reference','Shopping','Social Networking','Sports',
           'Travel','Utilities','Weather','12+','17+','4+','9+',]

In [10]:
X = dataSet.drop(labels=['Unnamed: 0','user_rating', 'prime_genre', 'id', 'track_name','currency', 'ver', 'cont_rating'], axis=1)
y = dataSet['user_rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [11]:
np.random.seed(42)
ss = StandardScaler()
ss.fit(X_train)
X_train_s = ss.transform(X_train)
X_test_s = ss.transform(X_test)

In [12]:
print(X_train.shape, y_train.shape)

(5397, 38) (5397,)


In [13]:
from sklearn.linear_model import ElasticNetCV, RidgeCV, LassoCV

In [14]:
ElasticNetCV(cv=3, random_state=42)

ElasticNetCV(alphas=None, copy_X=True, cv=3, eps=0.001, fit_intercept=True,
       l1_ratio=0.5, max_iter=1000, n_alphas=100, n_jobs=1,
       normalize=False, positive=False, precompute='auto', random_state=42,
       selection='cyclic', tol=0.0001, verbose=0)

In [15]:
params_grid={
        'l1_ratio':[0.25, 0.5, 1]
            }
gs = GridSearchCV(ElasticNetCV(cv=3, random_state=42), params_grid, verbose=2, cv=3)
ss = StandardScaler()
gs.fit(X_train_s, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] l1_ratio=0.25 ...................................................
[CV] .................................... l1_ratio=0.25, total=   0.1s
[CV] l1_ratio=0.25 ...................................................
[CV] .................................... l1_ratio=0.25, total=   0.0s
[CV] l1_ratio=0.25 ...................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV] .................................... l1_ratio=0.25, total=   0.0s
[CV] l1_ratio=0.5 ....................................................
[CV] ..................................... l1_ratio=0.5, total=   0.0s
[CV] l1_ratio=0.5 ....................................................
[CV] ..................................... l1_ratio=0.5, total=   0.0s
[CV] l1_ratio=0.5 ....................................................
[CV] ..................................... l1_ratio=0.5, total=   0.0s
[CV] l1_ratio=1 ......................................................
[CV] ....................................... l1_ratio=1, total=   0.0s
[CV] l1_ratio=1 ......................................................
[CV] ....................................... l1_ratio=1, total=   0.0s
[CV] l1_ratio=1 ......................................................
[CV] ....................................... l1_ratio=1, total=   0.0s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.5s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=ElasticNetCV(alphas=None, copy_X=True, cv=3, eps=0.001, fit_intercept=True,
       l1_ratio=0.5, max_iter=1000, n_alphas=100, n_jobs=1,
       normalize=False, positive=False, precompute='auto', random_state=42,
       selection='cyclic', tol=0.0001, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'l1_ratio': [0.25, 0.5, 1]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=2)

In [16]:
gs.score(X_test_s, y_test)

0.6029481377039894

In [17]:
gs.best_params_

{'l1_ratio': 1}

In [18]:
gs.best_score_

0.6053058234080807

In [19]:
gs.best_estimator_.coef_

array([-0.00000000e+00,  2.32019726e-02,  4.94709208e-03,  1.71803705e-03,
        1.12972370e+00, -1.72471801e-02,  8.98891450e-02,  3.65483993e-02,
        3.08291709e-02, -1.79187518e-02,  0.00000000e+00, -1.35907115e-02,
       -0.00000000e+00, -1.10651795e-02, -0.00000000e+00,  1.51608579e-02,
       -3.40168626e-02,  1.07830945e-02,  0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00,  1.19768097e-02,  2.02742696e-02,
        7.73919213e-03,  0.00000000e+00,  3.92983228e-02, -0.00000000e+00,
       -0.00000000e+00,  1.45777574e-02,  0.00000000e+00,  7.60857871e-05,
        0.00000000e+00,  6.70633432e-03, -0.00000000e+00, -0.00000000e+00,
        0.00000000e+00,  0.00000000e+00])

In [20]:
pd.DataFrame({'variable': features, f'_coefficient': gs.best_estimator_.coef_}).sort_values(by=f'_coefficient')

Unnamed: 0,variable,_coefficient
16,Finance,-0.034
9,lang.num,-0.0179
5,user_rating,-0.0172
11,Book,-0.0136
13,Catalogs,-0.0111
19,Health & Fitness,-0.0
20,Lifestyle,-0.0
21,Medical,-0.0
0,size_bytes,-0.0
27,Reference,-0.0


In [21]:
RidgeCV(cv=3)

RidgeCV(alphas=(0.1, 1.0, 10.0), cv=3, fit_intercept=True, gcv_mode=None,
    normalize=False, scoring=None, store_cv_values=False)

In [22]:
params_grid={
        'alphas':[(0.1, 1, 10)]
            }
np.random.seed(42)
gs = GridSearchCV(RidgeCV(), params_grid, verbose=2, cv=3)
gs.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] alphas=(0.1, 1, 10) .............................................
[CV] .............................. alphas=(0.1, 1, 10), total=   0.1s
[CV] alphas=(0.1, 1, 10) .............................................
[CV] .............................. alphas=(0.1, 1, 10), total=   0.0s
[CV] alphas=(0.1, 1, 10) .............................................
[CV] .............................. alphas=(0.1, 1, 10), total=   0.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RidgeCV(alphas=(0.1, 1.0, 10.0), cv=None, fit_intercept=True, gcv_mode=None,
    normalize=False, scoring=None, store_cv_values=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alphas': [(0.1, 1, 10)]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=2)

In [23]:
gs.best_estimator_

RidgeCV(alphas=(0.1, 1, 10), cv=None, fit_intercept=True, gcv_mode=None,
    normalize=False, scoring=None, store_cv_values=False)

In [24]:
gs.best_score_

-735632.3703271607

In [25]:
gs.best_estimator_.coef_

array([ 1.19394849e-06,  5.51815575e-03,  1.72450409e-07,  2.90411438e-06,
        6.30492146e-01, -6.92280592e-03,  5.29935348e-02,  5.15369508e-03,
        3.92152263e-01, -2.47349169e-01,  3.87086642e-02, -3.04192460e-01,
       -5.33646725e-02, -1.20343014e-01, -8.43750984e-02,  2.37387929e-01,
       -1.28928574e-01,  1.06358161e-01,  5.48223337e-02, -1.50983570e-01,
       -7.12718531e-02, -1.07337289e-01,  1.71926454e-01,  1.06632275e-01,
        7.51603156e-02,  2.06541870e-02,  3.39287091e-01, -8.02697680e-02,
       -6.41985758e-02,  1.97633886e-01, -8.66742587e-03,  7.27101721e-02,
        0.00000000e+00,  4.71705849e-02, -2.75470370e-02, -2.32325364e-02,
        3.60898854e-03,  0.00000000e+00])

In [26]:
LassoCV()

LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)

In [27]:
params_grid={
        'alphas':[(0.1, 1, 10)]
            }
gs = GridSearchCV(LassoCV(), params_grid, verbose=2, cv=3)
gs.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] alphas=(0.1, 1, 10) .............................................
[CV] .............................. alphas=(0.1, 1, 10), total=   0.0s
[CV] alphas=(0.1, 1, 10) .............................................
[CV] .............................. alphas=(0.1, 1, 10), total=   0.0s
[CV] alphas=(0.1, 1, 10) .............................................
[CV] .............................. alphas=(0.1, 1, 10), total=   0.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alphas': [(0.1, 1, 10)]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=2)

In [28]:
gs.best_estimator_

LassoCV(alphas=(0.1, 1, 10), copy_X=True, cv=None, eps=0.001,
    fit_intercept=True, max_iter=1000, n_alphas=100, n_jobs=1,
    normalize=False, positive=False, precompute='auto', random_state=None,
    selection='cyclic', tol=0.0001, verbose=False)

In [29]:
gs.best_score_

0.6022999478089462

In [30]:
gs.best_estimator_.coef_

array([-2.74834453e-11,  3.99943835e-03,  2.08418061e-07,  3.78582470e-06,
        6.05101762e-01, -4.34293592e-03,  2.33829199e-02,  6.73890472e-03,
        0.00000000e+00, -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        0.00000000e+00, -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00])

In [31]:
pd.DataFrame({'variable': features, f'_coefficient': gs.best_estimator_.coef_}).sort_values(by=f'_coefficient')

Unnamed: 0,variable,_coefficient
5,user_rating,-0.0043
0,size_bytes,-0.0
21,Medical,-0.0
22,Music,0.0
23,Navigation,0.0
24,News,0.0
25,Photo & Video,0.0
26,Productivity,0.0
27,Reference,-0.0
28,Shopping,-0.0


### Now with Pipelines

In [32]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('svr', SVR())
])

In [33]:
params_grid={
    'ss__with_mean':[True, False],
    'ss__with_std':[True, False],
    'svr__C': [0.1, 1, 5], 
    'svr__kernel':['rbf', 'linear', 'poly']
    }

In [34]:
gs = GridSearchCV(pipe, params_grid, verbose=2)
gs.fit(X_train_s, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] ss__with_mean=True, ss__with_std=True, svr__C=0.1, svr__kernel=rbf 
[CV]  ss__with_mean=True, ss__with_std=True, svr__C=0.1, svr__kernel=rbf, total=   0.9s
[CV] ss__with_mean=True, ss__with_std=True, svr__C=0.1, svr__kernel=rbf 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV]  ss__with_mean=True, ss__with_std=True, svr__C=0.1, svr__kernel=rbf, total=   0.9s
[CV] ss__with_mean=True, ss__with_std=True, svr__C=0.1, svr__kernel=rbf 
[CV]  ss__with_mean=True, ss__with_std=True, svr__C=0.1, svr__kernel=rbf, total=   0.9s
[CV] ss__with_mean=True, ss__with_std=True, svr__C=0.1, svr__kernel=linear 
[CV]  ss__with_mean=True, ss__with_std=True, svr__C=0.1, svr__kernel=linear, total=   0.8s
[CV] ss__with_mean=True, ss__with_std=True, svr__C=0.1, svr__kernel=linear 
[CV]  ss__with_mean=True, ss__with_std=True, svr__C=0.1, svr__kernel=linear, total=   0.9s
[CV] ss__with_mean=True, ss__with_std=True, svr__C=0.1, svr__kernel=linear 
[CV]  ss__with_mean=True, ss__with_std=True, svr__C=0.1, svr__kernel=linear, total=   0.9s
[CV] ss__with_mean=True, ss__with_std=True, svr__C=0.1, svr__kernel=poly 
[CV]  ss__with_mean=True, ss__with_std=True, svr__C=0.1, svr__kernel=poly, total=   0.8s
[CV] ss__with_mean=True, ss__with_std=True, svr__C=0.1, svr__kernel=poly 
[CV]  ss__wit

[CV]  ss__with_mean=True, ss__with_std=False, svr__C=5, svr__kernel=poly, total=   1.7s
[CV] ss__with_mean=True, ss__with_std=False, svr__C=5, svr__kernel=poly 
[CV]  ss__with_mean=True, ss__with_std=False, svr__C=5, svr__kernel=poly, total=   1.3s
[CV] ss__with_mean=False, ss__with_std=True, svr__C=0.1, svr__kernel=rbf 
[CV]  ss__with_mean=False, ss__with_std=True, svr__C=0.1, svr__kernel=rbf, total=   0.9s
[CV] ss__with_mean=False, ss__with_std=True, svr__C=0.1, svr__kernel=rbf 
[CV]  ss__with_mean=False, ss__with_std=True, svr__C=0.1, svr__kernel=rbf, total=   0.8s
[CV] ss__with_mean=False, ss__with_std=True, svr__C=0.1, svr__kernel=rbf 
[CV]  ss__with_mean=False, ss__with_std=True, svr__C=0.1, svr__kernel=rbf, total=   0.8s
[CV] ss__with_mean=False, ss__with_std=True, svr__C=0.1, svr__kernel=linear 
[CV]  ss__with_mean=False, ss__with_std=True, svr__C=0.1, svr__kernel=linear, total=   0.8s
[CV] ss__with_mean=False, ss__with_std=True, svr__C=0.1, svr__kernel=linear 
[CV]  ss__with_m

[CV]  ss__with_mean=False, ss__with_std=False, svr__C=5, svr__kernel=linear, total=  11.2s
[CV] ss__with_mean=False, ss__with_std=False, svr__C=5, svr__kernel=linear 
[CV]  ss__with_mean=False, ss__with_std=False, svr__C=5, svr__kernel=linear, total=   8.2s
[CV] ss__with_mean=False, ss__with_std=False, svr__C=5, svr__kernel=poly 
[CV]  ss__with_mean=False, ss__with_std=False, svr__C=5, svr__kernel=poly, total=   1.2s
[CV] ss__with_mean=False, ss__with_std=False, svr__C=5, svr__kernel=poly 
[CV]  ss__with_mean=False, ss__with_std=False, svr__C=5, svr__kernel=poly, total=   1.3s
[CV] ss__with_mean=False, ss__with_std=False, svr__C=5, svr__kernel=poly 
[CV]  ss__with_mean=False, ss__with_std=False, svr__C=5, svr__kernel=poly, total=   1.3s


[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:  4.7min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svr', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'ss__with_mean': [True, False], 'ss__with_std': [True, False], 'svr__C': [0.1, 1, 5], 'svr__kernel': ['rbf', 'linear', 'poly']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [35]:
gs.best_estimator_

Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=False)), ('svr', SVR(C=1, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))])

In [36]:
gs.best_score_

0.5731354044819255

In [37]:
gs.best_params_

{'ss__with_mean': True,
 'ss__with_std': False,
 'svr__C': 1,
 'svr__kernel': 'rbf'}