In [91]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
import sklearn
from sklearn import linear_model, preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from itertools import combinations

%matplotlib inline
sns.set_style('white')

In [92]:
df = pd.read_csv('responses.csv')

In [93]:
df.head()

Unnamed: 0,Music,Slow songs or fast songs,Dance,Folk,Country,Classical music,Musical,Pop,Rock,Metal or Hardrock,...,Age,Height,Weight,Number of siblings,Gender,Left - right handed,Education,Only child,Village - town,House - block of flats
0,5.0,3.0,2.0,1.0,2.0,2.0,1.0,5.0,5.0,1.0,...,20.0,163.0,48.0,1.0,female,right handed,college/bachelor degree,no,village,block of flats
1,4.0,4.0,2.0,1.0,1.0,1.0,2.0,3.0,5.0,4.0,...,19.0,163.0,58.0,2.0,female,right handed,college/bachelor degree,no,city,block of flats
2,5.0,5.0,2.0,2.0,3.0,4.0,5.0,3.0,5.0,3.0,...,20.0,176.0,67.0,2.0,female,right handed,secondary school,no,city,block of flats
3,5.0,3.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,...,22.0,172.0,59.0,1.0,female,right handed,college/bachelor degree,yes,city,house/bungalow
4,5.0,3.0,4.0,3.0,2.0,4.0,3.0,5.0,3.0,1.0,...,20.0,170.0,59.0,1.0,female,right handed,secondary school,no,village,house/bungalow


In [94]:
df.columns

Index(['Music', 'Slow songs or fast songs', 'Dance', 'Folk', 'Country',
       'Classical music', 'Musical', 'Pop', 'Rock', 'Metal or Hardrock',
       ...
       'Age', 'Height', 'Weight', 'Number of siblings', 'Gender',
       'Left - right handed', 'Education', 'Only child', 'Village - town',
       'House - block of flats'],
      dtype='object', length=150)

Our dataset is a survey of young people (ages 15-30) wherein they give Likert scale respones on over 100 categories for their activites from music genre preferences, movies, academic coursework, hobbies, fears, moral stances, social habits, life outlooks, and basic demographic information (height, weight, age, number of siblings, etc.). 

The Kaggle source can be found [here](https://www.kaggle.com/miroslavsabo/young-people-survey). 

This dataset contains 150 features before expansion, most of it integers. Predicting any of the column responses would be as fair a model as any other for the most part. I am going to start by building a model to predict the responses to column 'God' which is a 1-5 scale response to the statement "I believe in God". 

We'll need dummies for Smoking, Alcohol, Punctuality, Lying, Internet Usage, Gender, Left - Right Handed, Education, Only Child, Village - town, & House - block of flats. 

In [95]:
df = pd.get_dummies(data=df, columns=['Smoking', 'Alcohol', 'Punctuality', 'Lying', 'Internet usage', 'Gender', 'Left - right handed', 'Education', 'Only child', 'Village - town', 'House - block of flats'])

In [96]:
df.head()

Unnamed: 0,Music,Slow songs or fast songs,Dance,Folk,Country,Classical music,Musical,Pop,Rock,Metal or Hardrock,...,Education_doctorate degree,Education_masters degree,Education_primary school,Education_secondary school,Only child_no,Only child_yes,Village - town_city,Village - town_village,House - block of flats_block of flats,House - block of flats_house/bungalow
0,5.0,3.0,2.0,1.0,2.0,2.0,1.0,5.0,5.0,1.0,...,0,0,0,0,1,0,0,1,1,0
1,4.0,4.0,2.0,1.0,1.0,1.0,2.0,3.0,5.0,4.0,...,0,0,0,0,1,0,1,0,1,0
2,5.0,5.0,2.0,2.0,3.0,4.0,5.0,3.0,5.0,3.0,...,0,0,0,1,1,0,1,0,1,0
3,5.0,3.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,...,0,0,0,0,0,1,1,0,0,1
4,5.0,3.0,4.0,3.0,2.0,4.0,3.0,5.0,3.0,1.0,...,0,0,0,1,1,0,0,1,0,1


In [97]:
df.isnull().sum().sort_values(ascending=False).head()

Weight           20
Height           20
Passive sport    15
Chemistry        10
Geography         9
dtype: int64

There are between 20 and 74 rows with some missing values. There are 1,010 total rows. I will drop the rows with missing values rather than impute because of the ratio. 

In [98]:
df = df.dropna()

In [99]:
df.isnull().sum().sort_values(ascending=False).head()

House - block of flats_house/bungalow    0
Passive sport                            0
Adrenaline sports                        0
Fun with friends                         0
Theatre                                  0
dtype: int64

In [100]:
df = df.apply(pd.to_numeric).astype('int64')

In [101]:
trainsize = int(df.shape[0] / 2)
df_test = df.iloc[trainsize:, :].copy()
df_train = df.iloc[:trainsize, :].copy()

Y_train = df_train['God'].values.reshape(-1, 1)
X_train = df_train.loc[:, ~(df_train.columns).isin(['God'])]

VANILLA REGRESSION

In [102]:
regr1 = linear_model.LinearRegression()
regr1.fit(X_train, Y_train)
print('\nR-squared simple model:')
print(regr1.score(X_train, Y_train))


R-squared simple model:
0.527817280806929


In [103]:
X = df.drop('God', 1)
y = df['God']

In [104]:
def add_interactions(X):
    # Get feature names
    combos = list(combinations(list(X.columns), 2))
    colnames = list(X.columns) + ['_'.join(X) for X in combos]
    
    # Find interactions
    poly = PolynomialFeatures(interaction_only=True, include_bias=False)
    X = poly.fit_transform(X)
    X = pd.DataFrame(X)
    X.columns = colnames
    
    # Remove interaction terms with all 0 values            
    noint_indicies = [i for i, X in enumerate(list((X == 0).all())) if X]
    X = X.drop(X.columns[noint_indicies], axis=1)
    
    return X

In [105]:
X = add_interactions(X)
print(X.head(5))

   Music  Slow songs or fast songs  Dance  Folk  Country  Classical music  \
0    5.0                       3.0    2.0   1.0      2.0              2.0   
1    4.0                       4.0    2.0   1.0      1.0              1.0   
2    5.0                       5.0    2.0   2.0      3.0              4.0   
3    5.0                       3.0    4.0   3.0      2.0              4.0   
4    5.0                       3.0    2.0   3.0      2.0              3.0   

   Musical  Pop  Rock  Metal or Hardrock  \
0      1.0  5.0   5.0                1.0   
1      2.0  3.0   5.0                4.0   
2      5.0  3.0   5.0                3.0   
3      3.0  5.0   3.0                1.0   
4      3.0  2.0   5.0                5.0   

                               ...                               \
0                              ...                                
1                              ...                                
2                              ...                                
3   

In [106]:
Y_train = y.values.reshape(-1, 1)
X_train = X.loc[:, ~(df_train.columns).isin(['God'])]

In [107]:
regr1 = linear_model.LinearRegression()
regr1.fit(X_train, Y_train)
print('\nR-squared simple model:')
print(regr1.score(X_train, Y_train))


R-squared simple model:
0.49945697025701313


Converting 173 features to 14,646 features seems to have no real impact on our vanilla regression. Let's add an L1 regularization (Ridge)

In [108]:
ridgeregr = linear_model.Ridge(alpha=10, fit_intercept=False) 
ridgeregr.fit(X_train, Y_train)
print(ridgeregr.score(X_train, Y_train))

0.596894539833106


59.7% is the R2 result of our first pass with the Ridge regression. This is set for an alpha of 10. Let's try changing our alpha to see how that improves (or degrades) the results we've built. 

In [109]:
#RIDGE REGRESSIONS 
print('\nR-squared complex model, range of alpha: 10, 5, 0.5, 0.05, 0.005')
print('alpha=10')
ridgeregr = linear_model.Ridge(alpha=10, fit_intercept=False) 
ridgeregr.fit(X_train, Y_train)
print(ridgeregr.score(X_train, Y_train))

print('\n')
print('alpha=5')
ridgeregr = linear_model.Ridge(alpha=5, fit_intercept=False) 
ridgeregr.fit(X_train, Y_train)
print(ridgeregr.score(X_train, Y_train))

print('\n')
print('alpha=0.5')
ridgeregr = linear_model.Ridge(alpha=.5, fit_intercept=False) 
ridgeregr.fit(X_train, Y_train)
print(ridgeregr.score(X_train, Y_train))

print('\n')
print('alpha=0.05')
ridgeregr = linear_model.Ridge(alpha=.05, fit_intercept=False) 
ridgeregr.fit(X_train, Y_train)
print(ridgeregr.score(X_train, Y_train))

print('\n')
print('alpha=0.005')
ridgeregr = linear_model.Ridge(alpha=.005, fit_intercept=False) 
ridgeregr.fit(X_train, Y_train)
print(ridgeregr.score(X_train, Y_train))


R-squared complex model, range of alpha: 10, 5, 0.5, 0.05, 0.005
alpha=10
0.596894539833106


alpha=5
0.5984347317160439


alpha=0.5
0.6037880068884529


alpha=0.05
0.6057255913861206


alpha=0.005
0.6058469848211471


A expected, the alpha was a factor in increasing our R2, but only to a small extent, and with diminishing returns once we crissed from 0.5 to 0.05. The result for 0.005 was better, but only to a miniscule degree. Let's see how a LASSO regression deals with all of these features across a number of alphas. 

In [110]:
#LASSO REGRESSIONS
print('\nR-squared complex model, range of alpha: .001 to 1')
print('alpha=0.001')
lass = linear_model.Lasso(alpha=.001)
lassfit = lass.fit(X_train, Y_train)
print(lass.score(X_train, Y_train))

print('\n')
print('alpha=0.01')
lass = linear_model.Lasso(alpha=.01)
lassfit = lass.fit(X_train, Y_train)
print(lass.score(X_train, Y_train))

print('\n')
print('alpha=0.05')
lass = linear_model.Lasso(alpha=.05)
lassfit = lass.fit(X_train, Y_train)
print(lass.score(X_train, Y_train))

print('\n')
print('alpha=0.35')
lass = linear_model.Lasso(alpha=.35)
lassfit = lass.fit(X_train, Y_train)
print(lass.score(X_train, Y_train))

print('\n')
print('alpha=0.75')
lass = linear_model.Lasso(alpha=.75)
lassfit = lass.fit(X_train, Y_train)
print(lass.score(X_train, Y_train))

print('\n')
print('alpha=1.0')
lass = linear_model.Lasso(alpha=1)
lassfit = lass.fit(X_train, Y_train)
print(lass.score(X_train, Y_train))


R-squared complex model, range of alpha: .001 to 1
alpha=0.001
0.6008489309066706


alpha=0.01
0.5830523658140399


alpha=0.05
0.4999051972729942


alpha=0.35
0.3611544662528425


alpha=0.75
0.18318478785066472


alpha=1.0
0.01673214548365054


With the penalty increased with the LASSO we see the model become predictably less efficient. However, I was surprised to see that the R2 decrease noticeably from 0.001 to .01. I'd expected it to improve slightly before the steep descent of lambda increases began. 

Out of sheer curiosity, let's see the result of our Ridge and LASSO regressions without the two-way interactions, dealing instead with the raw 174 feature dataset. I'll reduce the number of alpha levels tested to just a couple since we know how the higher levels affect our R2. 

In [111]:
trainsize = int(df.shape[0] / 2)
df_test = df.iloc[trainsize:, :].copy()
df_train = df.iloc[:trainsize, :].copy()

Y_train = df_train['God'].values.reshape(-1, 1)
X_train = df_train.loc[:, ~(df_train.columns).isin(['God'])]

In [112]:
print('\nR-squared simple model, range of alpha: 0.05, 0.005')
print('alpha=0.05')
ridgeregr = linear_model.Ridge(alpha=.05, fit_intercept=False) 
ridgeregr.fit(X_train, Y_train)
print(ridgeregr.score(X_train, Y_train))

print('\n')
print('alpha=0.005')
ridgeregr = linear_model.Ridge(alpha=.005, fit_intercept=False) 
ridgeregr.fit(X_train, Y_train)
print(ridgeregr.score(X_train, Y_train))



R-squared simple model, range of alpha: 0.05, 0.005
alpha=0.05
0.7350408502223027


alpha=0.005
0.7353398995397398


In [113]:
#LASSO REGRESSIONS
print('\nR-squared complex model, range of alpha: .001 to .01')
print('alpha=0.001')
lass = linear_model.Lasso(alpha=.001)
lassfit = lass.fit(X_train, Y_train)
print(lass.score(X_train, Y_train))

print('\n')
print('alpha=0.01')
lass = linear_model.Lasso(alpha=.01)
lassfit = lass.fit(X_train, Y_train)
print(lass.score(X_train, Y_train))


R-squared complex model, range of alpha: .001 to .01
alpha=0.001
0.7317102464762757


alpha=0.01
0.7052960786167402


Look at that improvement! We had barely crossed over 60% R2 with our complex model of over 1,400 features. The original data showed a significant improvement, proving once again that feature interactions are not always useful, even in datasets with a large number of features (a large number at least at this point in the training). 

Our Ridge Regression is the best fit for creating predictions for how young people will respond to the question of their belief in God for this dataset, better than the LASSO and much better than the vanilla regression. And the as before, the lambda at those low levels present very small changes. 