# Naive Models

Here we will train some naive models just based on our numerical features to predict popularity. 

In [30]:
import pandas as pd
import numpy as np
import math
from sklearn import linear_model
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

## Import data & set up design matrix and prediction

In [2]:
data = pd.read_csv('./data/data.csv')
data.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.995,['Carl Woitschach'],0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.779,1928
1,0.994,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928
2,0.604,['Seweryn Goszczyński'],0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.929,107.177,0.88,1928
3,0.995,['Francisco Canaro'],0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.72,1928
4,0.99,"['Frédéric Chopin', 'Vladimir Horowitz']",0.21,687733,0.204,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908,11,0.098,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928


In [3]:
n = len(data)
feats_num = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'key', 'liveness', 
             'loudness', 'speechiness', 'tempo', 'valence', 'year']
X = data[feats_num]
X['offset'] = np.ones((n,))
y = data['popularity']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


### Split data into training and test sets

In [4]:
rand_ids = np.random.randint(0, n, size=n)
train = rand_ids[0:math.floor(0.8*n)].tolist()
test = rand_ids[math.floor(0.8*n):n].tolist()

Xtrain = X.iloc[train]
ytrain = y.iloc[train]
Xtest = X.iloc[test]
ytest = y.iloc[test]

In [5]:
print(np.shape(Xtrain))
print(np.shape(ytrain))
print(np.shape(Xtest))
print(np.shape(ytest))

(135927, 12)
(135927,)
(33982, 12)
(33982,)


### 1. Linear Model

#### Fit the model

In [6]:
reg = linear_model.LinearRegression()
reg.fit(Xtrain, ytrain)
w = reg.coef_
print(w)

[-4.07490371e+00  4.96093184e+00 -8.62366178e-01 -2.51895210e+00
 -1.03599979e-02 -2.61948484e+00  2.37131165e-02 -7.54455270e+00
  5.86303728e-03 -4.41864839e-01  6.82520572e-01  0.00000000e+00]


In [7]:
reg.score(Xtrain, ytrain)

0.7816548103009746

In [8]:
reg.score(Xtest, ytest)

0.783739925438374

In [9]:
y_pred_train = reg.predict(Xtrain)

In [10]:
mean_squared_error(ytrain, y_pred_train)

101.25321383744433

#### Evaluate on test set

In [11]:
y_pred_test = reg.predict(Xtest)

In [12]:
mean_squared_error(ytest, y_pred_test)

100.08462704024267

In [13]:
mean_absolute_error(ytest, y_pred_test)

7.663311481848592

### 2. Ridge Regression

#### Fit the model

In [14]:
reg_ridge = linear_model.Ridge(alpha=.5)
reg_ridge.fit(Xtrain, ytrain)

Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [15]:
w_ridge = reg_ridge.coef_
print(w_ridge)

[-4.07452161e+00  4.95947779e+00 -8.62312792e-01 -2.51879507e+00
 -1.03609055e-02 -2.61951712e+00  2.37277027e-02 -7.54272276e+00
  5.86202498e-03 -4.41299042e-01  6.82526377e-01  0.00000000e+00]


In [16]:
y_pred_train = reg_ridge.predict(Xtrain)

In [17]:
mean_squared_error(ytrain, y_pred_train)

101.25321392268627

In [18]:
mean_absolute_error(ytrain, y_pred_train)

7.714841578866958

#### Evaluate on test set

In [24]:
y_pred_test = reg_ridge.predict(Xtest)

In [25]:
mean_squared_error(ytest, y_pred_test)

102.13359857486013

In [26]:
mean_absolute_error(ytest, y_pred_test)

7.720139923478649

### 3. Polynomial Regression

#### Fit the model


In [27]:
poly = PolynomialFeatures(degree=2)
Xtrain = poly.fit_transform(Xtrain)
reg = linear_model.LinearRegression()
reg.fit(Xtrain, ytrain)
w_poly = reg.coef_

In [28]:
y_pred_train = reg.predict(Xtrain)

In [29]:
mean_squared_error(ytrain, y_pred_train)

96.46763782219608

In [30]:
mean_absolute_error(ytrain, y_pred_train)

7.461951169738375

#### Evaluate on test set

In [31]:
Xtest = poly.fit_transform(Xtest)
y_pred_test = reg.predict(Xtest)

In [32]:
mean_squared_error(ytest, y_pred_test)

98.01798185034083

In [33]:
mean_absolute_error(ytest, y_pred_test)

7.496326191994422

### 4. SVM Regression

#### Fit the Model

In [None]:
reg = svm.SVR()
reg.fit(Xtrain, ytrain)

In [None]:
y_pred_train = reg.predict(Xtrain)

In [None]:
mean_squared_error(ytrain, y_pred_train)

In [None]:
mean_absolute_error(ytrain, y_pred_train)

#### Evaluate Test Set

In [None]:
y_pred_test = reg.predict(Xtest)

In [None]:
mean_squared_error(ytest, y_pred_test)

In [None]:
mean_absolute_error(ytest, y_pred_test)

### 5. Nearest Neighbors Regression

#### Fit the Model

In [22]:
n = 5
nnreg = KNeighborsRegressor(n_neighbors = n)
nnreg.fit(Xtrain, ytrain)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [25]:
y_pred_train = nnreg.predict(Xtrain)
mean_squared_error(ytrain, y_pred_train)

53.39956506065756

In [26]:
mean_absolute_error(ytrain, y_pred_train)

5.024165912585432

#### Evaluate on Test Set

In [27]:
y_pred_test = nnreg.predict(Xtest)

In [28]:
mean_squared_error(ytest, y_pred_test)

84.81800247189689

In [29]:
mean_absolute_error(ytest, y_pred_test)

6.468859984697781

### 6. Decision Tree

#### Fit the Model

In [31]:
decision_tree_reg = DecisionTreeRegressor()
decision_tree_reg.fit(Xtrain, ytrain)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [32]:
y_pred_train = decision_tree_reg.predict(Xtrain)
mean_squared_error(ytrain, y_pred_train)

0.6994933379856905

In [37]:
mean_absolute_error(ytrain, y_pred_train)

0.073603688534497

#### Evaluate on Test Set

In [34]:
y_pred_test = decision_tree_reg.predict(Xtest)

In [35]:
mean_squared_error(ytest, y_pred_test)

77.08097059270808

In [36]:
mean_absolute_error(ytest, y_pred_test)

4.1289333148361465

### 7. Lasso

#### Fit the Model

In [39]:
lasso_reg = linear_model.Lasso(alpha=0.1)
lasso_reg.fit(Xtrain, ytrain)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [40]:
y_pred_train = lasso_reg.predict(Xtrain)
mean_squared_error(ytrain, y_pred_train)

102.86343379152532

In [41]:
mean_absolute_error(ytrain, y_pred_train)

7.767250178491312

#### Evaluate the Test Set

In [42]:
y_pred_test = lasso_reg.predict(Xtest)

In [43]:
mean_squared_error(ytest, y_pred_test)

101.72604380250877

In [44]:
mean_absolute_error(ytest, y_pred_test)

7.718754170150074