# Naive Models

Here we will train some naive models just based on our numerical features to predict popularity. 

In [41]:
import pandas as pd
import numpy as np
import math
from sklearn import linear_model
from sklearn import svm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

## Import data & set up design matrix and prediction

In [42]:
data = pd.read_csv('./data/data.csv')

In [43]:
n = len(data)
feats_num = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'key', 'liveness', 
             'loudness', 'speechiness', 'tempo', 'valence', 'year']
X = data[feats_num]
X['offset'] = np.ones((n,))
y = data['popularity']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


### Split data into training and test sets

In [44]:
rand_ids = np.random.randint(0, n, size=n)
train = rand_ids[0:math.floor(0.8*n)].tolist()
test = rand_ids[math.floor(0.8*n):n].tolist()

Xtrain = X.iloc[train]
ytrain = y.iloc[train]
Xtest = X.iloc[test]
ytest = y.iloc[test]

In [45]:
print(np.shape(Xtrain))
print(np.shape(ytrain))
print(np.shape(Xtest))
print(np.shape(ytest))

(135927, 12)
(135927,)
(33982, 12)
(33982,)


### 1. Linear Model

#### Fit the model

In [46]:
reg = linear_model.LinearRegression()
reg.fit(Xtrain, ytrain)
w = reg.coef_
print(w)

[-4.25372670e+00  5.07653569e+00 -1.46243016e+00 -2.25721589e+00
 -4.48624587e-03 -2.34208933e+00  4.27769619e-02 -7.52099877e+00
  5.87237092e-03 -3.59179277e-01  6.85028581e-01  0.00000000e+00]


In [47]:
reg.score(Xtrain, ytrain)

0.7845481153764502

In [48]:
reg.score(Xtest, ytest)

0.7840478991642359

In [49]:
y_pred_train = reg.predict(Xtrain)

In [50]:
mean_squared_error(ytrain, y_pred_train)

100.23614126206915

#### Evaluate on test set

In [51]:
y_pred_test = reg.predict(Xtest)

In [52]:
mean_squared_error(ytest, y_pred_test)

101.20851302104528

In [53]:
mean_absolute_error(ytest, y_pred_test)

7.709405653966397

### 2. Ridge Regression

#### Fit the model

In [54]:
reg_ridge = linear_model.Ridge(alpha=.5)
reg_ridge.fit(Xtrain, ytrain)

Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [55]:
w_ridge = reg_ridge.coef_
print(w_ridge)

[-4.25328840e+00  5.07507243e+00 -1.46220398e+00 -2.25709036e+00
 -4.48727101e-03 -2.34217306e+00  4.27873675e-02 -7.51914657e+00
  5.87135252e-03 -3.58638023e-01  6.85034261e-01  0.00000000e+00]


In [56]:
y_pred_train = reg_ridge.predict(Xtrain)

In [57]:
mean_squared_error(ytrain, y_pred_train)

100.23614134970833

In [58]:
mean_absolute_error(ytrain, y_pred_train)

7.698135087598441

#### Evaluate on test set

In [59]:
y_pred_test = reg_ridge.predict(Xtest)

In [60]:
mean_squared_error(ytest, y_pred_test)

101.20851083176558

In [61]:
mean_absolute_error(ytest, y_pred_test)

7.709402317982085

### 3. Polynomial Regression

#### Fit the model


In [32]:
poly = PolynomialFeatures(degree=2)
Xtrain = poly.fit_transform(Xtrain)
reg = linear_model.LinearRegression()
reg.fit(Xtrain, ytrain)
w_poly = reg.coef_

In [34]:
y_pred_train = reg.predict(Xtrain)

In [35]:
mean_squared_error(ytrain, y_pred_train)

92.93964359493044

In [36]:
mean_absolute_error(ytrain, y_pred_train)

7.239199833086659

#### Evaluate on test set

In [38]:
Xtest = poly.fit_transform(Xtest)
y_pred_test = reg.predict(Xtest)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 4278 is different from 91)

In [None]:
mean_squared_error(ytest, y_pred_test)

In [None]:
mean_absolute_error(ytest, y_pred_test)

### 4. SVM Regression

In [62]:
reg = svm.SVR()
reg.fit(Xtrain, ytrain)

AttributeError: coef_ is only available when using a linear kernel

In [None]:
y_pred_train = reg.predict(Xtrain)

In [None]:
mean_squared_error(ytrain, y_pred_train)

In [None]:
mean_absolute_error(ytrain, y_pred_train)