# Validation

In [11]:
# Dependencies
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import numpy as np
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import math
import seaborn as sns
import pickle

In [33]:
# Load in validation datasets
features_df = pd.read_pickle('dataset/validate_feat_cleaned.pickle')
target_df = pd.read_pickle('dataset/validate_target_cleaned.pickle')

# NOTE(review): the time-adjusted frames are read from `train_*` files, while
# the frames above come from `validate_*` files. If these are the training
# split, every "time adjusted" score in this notebook is an in-sample score
# rather than a validation score -- confirm which files are intended.
features_time = pd.read_pickle("dataset/time_scored/train_feat_cleaned.pickle")
target_time = pd.read_pickle("dataset/time_scored/train_target_cleaned.pickle")

# Predictions made from a feature frame are scored against the matching target
# frame, so fail early if the two halves of a dataset disagree in length.
assert len(features_df) == len(target_df), "validation features/targets differ in length"
assert len(features_time) == len(target_time), "time-scored features/targets differ in length"

In [7]:
# Preview the first five rows of the validation feature frame
features_df.head(n=5)

Unnamed: 0,image_width,data_memorability,user_follows,user_posted_photos,age,glasses,face_emotion_ANGRY,face_emotion_CALM,face_emotion_CONFUSED,face_emotion_DISGUSTED,...,emotion_label_joy,emotion_label_loathing,emotion_label_pensiveness,emotion_label_rage,emotion_label_sadness,emotion_label_serenity,emotion_label_surprise,emotion_label_terror,emotion_label_trust,emotion_label_vigilance
105,624.747082,0.84461,2.278754,2.887617,30.726783,0.129525,0.110315,0.077511,0.15492,0.047318,...,0.238268,0.0,0.001297,0.005188,0.139127,0.01163,0.009295,0.008128,0.012019,0.0
108,636.402614,0.820265,2.575188,3.357363,30.498956,0.22803,0.087107,0.115625,0.129973,0.049372,...,0.162898,0.0,0.000784,0.007495,0.140937,0.022309,0.017756,0.014074,0.019172,0.0
142,631.886364,0.867818,2.635484,2.25042,25.924428,0.051515,0.101129,0.109481,0.064867,0.035978,...,0.203409,0.0,0.005682,0.009848,0.152841,0.014205,0.010795,0.0125,0.007955,0.0
55,640.0,0.72283,0.954243,1.342423,0.0,0.0,0.0,0.0,0.0,0.0,...,0.009091,0.0,0.0,0.118182,0.036364,0.0,0.0,0.009091,0.1,0.0
94,626.64,0.816961,2.575188,2.012837,31.896101,0.163028,0.107735,0.099974,0.103615,0.039589,...,0.238,0.0,0.0,0.02,0.081667,0.013667,0.002,0.018,0.012,0.0


In [8]:
# Preview the first five rows of the validation target frame
target_df.head(n=5)

Unnamed: 0,id,gender,born,education,employed,income,A_2,N_1,P_1,E_1,...,P,E,R,M,A,PERMA,N_EMO,P_EMO,imagecount,private_account
105,4edc6503b585e1d36579c480075abd32,Female,1992,High school graduate,Employed for wages,"$60,000 to $69,999",8,5,8,9,...,8.0,8.666667,8.333333,9.0,6.666667,8.25,4.0,8.0,771.0,public
108,bb1cdf4cd58673f83b1c8f0c91183c1a,Female,1995,College graduate,Employed for wages,"$20,000 to $29,999",7,6,8,7,...,8.0,5.666667,7.333333,6.333333,7.333333,6.8125,5.333333,8.0,1530.0,public
142,558e99e2f98d4317d1d683f9eb8c8a1b,Female,1997,High school graduate,A student,"$10,000 to $19,999",3,8,9,5,...,8.0,7.333333,7.666667,5.333333,4.333333,6.5,6.0,8.0,176.0,public
55,c8586be94f22f2be245b6e4598068780,Male,1988,High school graduate,Employed for wages,"$20,000 to $29,999",6,3,5,6,...,5.333333,6.666667,6.0,6.666667,7.0,6.375,3.0,5.333333,22.0,public
94,5232e1d4bc3dfd509ff8525edb81b6fe,Female,1959,College graduate,Employed for wages,"$50,000 to $59,999",8,8,6,8,...,5.666667,7.333333,8.0,8.0,7.0,7.0625,7.0,5.666667,100.0,public


In [9]:
# Show (rows, columns) of the feature frame followed by the target frame
print(*(frame.shape for frame in (features_df, target_df)))

(17, 43) (17, 45)


## Everyone can import their model below and test it against the validation set.

### Normal linear regression

In [30]:
# Basic linear model with all features

# Import model. The `with` block closes the file as soon as the model has been
# read; a bare `open(...)` handed to pickle.load is never closed explicitly.
# NOTE: pickle.load can execute arbitrary code -- only load trusted model files.
with open("models/reg_all.pickle", 'rb') as model_file:
    reg_all = pickle.load(model_file)

# Predict PERMA for every row of the validation features: y_pred
y_pred = reg_all.predict(features_df)

# Compute and print R^2 and RMSE against the true PERMA column
print("R^2: {}".format(reg_all.score(features_df, target_df["PERMA"])))
rmse = np.sqrt(mean_squared_error(target_df["PERMA"], y_pred))
print("Root Mean Squared Error: {}".format(rmse))



R^2: -1.6454776686421075
Root Mean Squared Error: 2.0895103886458686


In [31]:
# Evaluate best single model, restricted to its stored feature subset

# `best_feat` is used below as a column selector on the feature frame and
# `reg_best` is the model evaluated on those columns. Files are opened in
# `with` blocks so the handles are closed once the objects are loaded.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("models/best_feat.pickle", 'rb') as feat_file:
    best_feat = pickle.load(feat_file)
with open("models/reg_best.pickle", 'rb') as model_file:
    reg_best = pickle.load(model_file)

# Predict PERMA from the selected columns only: y_pred
y_pred = reg_best.predict(features_df[best_feat])

# Compute and print R^2 and RMSE against the true PERMA column
print("R^2: {}".format(reg_best.score(features_df[best_feat], target_df["PERMA"])))
rmse = np.sqrt(mean_squared_error(target_df["PERMA"], y_pred))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -0.8143670025613194
Root Mean Squared Error: 1.7304340581047373


In [39]:
# Evaluate compound model: predict each element separately, then average the
# per-element predictions into a single PERMA prediction.

# Load the per-element feature selections, the per-element models and the
# collection of elements to iterate over. `with` blocks close each file once
# it has been read.
# NOTE(review): the file names read as if swapped relative to the variable
# names (`perma_el` -> models, `compound` -> elements); the bindings are kept
# exactly as they were -- confirm the files hold what the names suggest.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("models/comp_features.pickle", 'rb') as pickle_file:
    best_features = pickle.load(pickle_file)
with open("models/perma_el.pickle", 'rb') as pickle_file:
    models = pickle.load(pickle_file)
with open("models/compound.pickle", 'rb') as pickle_file:
    perma_elements = pickle.load(pickle_file)
predicted_data = pd.DataFrame()

for element in perma_elements:

    # Extract the best features for the element (second entry of its record)
    features = list(best_features[element][1])

    # Look up the model stored for this element
    lrc = models[element]

    # Predict this element from its own feature subset: y_pred
    y_pred = lrc.predict(features_df[features])

    predicted_data[element] = y_pred

# Row-wise mean over the element columns gives the compound PERMA prediction
predicted_data["PERMA"] = predicted_data.mean(axis=1)


# Compute and print R^2 and RMSE
print("R^2: {}".format(r2_score(target_df["PERMA"], predicted_data["PERMA"].values)))
rmse = np.sqrt(mean_squared_error(target_df["PERMA"], predicted_data["PERMA"].values))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -1.4633812280717304
Root Mean Squared Error: 2.0163146174501714


### Time adjusted linear regression

In [34]:
# Basic time-adjusted linear model with all features

# Import model (this rebinds `reg_all` to the time-adjusted model). The `with`
# block closes the file as soon as the model has been read; a bare `open(...)`
# handed to pickle.load is never closed explicitly.
# NOTE: pickle.load can execute arbitrary code -- only load trusted model files.
with open("models/reg_all-time.pickle", 'rb') as model_file:
    reg_all = pickle.load(model_file)

# Predict PERMA for every row of the time-adjusted features: y_pred
y_pred = reg_all.predict(features_time)

# Compute and print R^2 and RMSE against the time-adjusted PERMA column
print("R^2: {}".format(reg_all.score(features_time, target_time["PERMA"])))
rmse = np.sqrt(mean_squared_error(target_time["PERMA"], y_pred))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -1.2941154854377017
Root Mean Squared Error: 2.1492340221272404


In [35]:
# Evaluate time-adjusted best single model, restricted to its stored feature subset

# `best_feat` is used below as a column selector on the feature frame and
# `reg_best` is the model evaluated on those columns (both names are rebound
# here to the time-adjusted versions). Files are opened in `with` blocks so
# the handles are closed once the objects are loaded.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("models/best_feat-time.pickle", 'rb') as feat_file:
    best_feat = pickle.load(feat_file)
with open("models/reg_best-time.pickle", 'rb') as model_file:
    reg_best = pickle.load(model_file)

# Predict PERMA from the selected columns only: y_pred
y_pred = reg_best.predict(features_time[best_feat])

# Compute and print R^2 and RMSE against the time-adjusted PERMA column
print("R^2: {}".format(reg_best.score(features_time[best_feat], target_time["PERMA"])))
rmse = np.sqrt(mean_squared_error(target_time["PERMA"], y_pred))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.3116269204564518
Root Mean Squared Error: 1.1773023407629892


In [38]:
# Evaluate time-adjusted compound model: predict each element separately, then
# average the per-element predictions into a single PERMA prediction.

# Load the per-element feature selections, the per-element models and the
# collection of elements to iterate over. `with` blocks close each file once
# it has been read.
# NOTE(review): the file names read as if swapped relative to the variable
# names (`perma_el-time` -> models, `compound-time` -> elements); the bindings
# are kept exactly as they were -- confirm the files hold what the names suggest.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("models/comp_features-time.pickle", 'rb') as pickle_file:
    best_features_time = pickle.load(pickle_file)
with open("models/perma_el-time.pickle", 'rb') as pickle_file:
    models = pickle.load(pickle_file)
with open("models/compound-time.pickle", 'rb') as pickle_file:
    perma_elements = pickle.load(pickle_file)
predicted_data = pd.DataFrame()

for element in perma_elements:

    # Extract the best features for the element (second entry of its record)
    features = list(best_features_time[element][1])

    # Look up the model stored for this element
    lrc = models[element]

    # Predict this element from its own feature subset: y_pred
    y_pred = lrc.predict(features_time[features])

    predicted_data[element] = y_pred

# Row-wise mean over the element columns gives the compound PERMA prediction
predicted_data["PERMA"] = predicted_data.mean(axis=1)


# Compute and print R^2 and RMSE
print("R^2: {}".format(r2_score(target_time["PERMA"], predicted_data["PERMA"].values)))
rmse = np.sqrt(mean_squared_error(target_time["PERMA"], predicted_data["PERMA"].values))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.3265008829702597
Root Mean Squared Error: 1.1645136509912646


### SVM model

In [42]:
# Normal

# Load every input this cell needs instead of relying on names left behind by
# earlier cells: read top to bottom, the most recent assignment of
# `perma_elements` comes from compound-time.pickle, which is not the
# non-time-adjusted list. The element list gets its own name here so the
# shared `perma_elements` variable is left untouched.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("models/comp_features.pickle", 'rb') as pickle_file:
    best_features = pickle.load(pickle_file)
with open("models/compound.pickle", 'rb') as pickle_file:
    perma_elements_normal = pickle.load(pickle_file)
with open("models/models_svm.pickle", 'rb') as pickle_file:
    models_svm = pickle.load(pickle_file)
predicted_data = pd.DataFrame()

for element in perma_elements_normal:

    # Extract the best features for the element (second entry of its record)
    features = list(best_features[element][1])

    # Look up the SVM model stored for this element
    clfc = models_svm[element]

    # Predict this element from its own feature subset: y_pred
    y_pred = clfc.predict(features_df[features])

    predicted_data[element] = y_pred

# Row-wise mean over the element columns gives the compound PERMA prediction
predicted_data["PERMA"] = predicted_data.mean(axis=1)

# Compute and print R^2 and RMSE
print("R^2: {}".format(r2_score(target_df["PERMA"], predicted_data["PERMA"].values)))
rmse = np.sqrt(mean_squared_error(target_df["PERMA"], predicted_data["PERMA"].values))
print("Root Mean Squared Error: {}".format(rmse))

R^2: -0.3785297294610632
Root Mean Squared Error: 1.5083445313925805


In [43]:
# Time adjusted

# Load every input this cell needs instead of relying on `perma_elements` and
# `best_features_time` being left behind by earlier cells; which list
# `perma_elements` holds depends on the order the cells were run in. The
# element list gets its own name here so the shared variable is left untouched.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("models/comp_features-time.pickle", 'rb') as pickle_file:
    best_features_time = pickle.load(pickle_file)
with open("models/compound-time.pickle", 'rb') as pickle_file:
    perma_elements_time = pickle.load(pickle_file)
with open("models/models_svm-time.pickle", 'rb') as pickle_file:
    models_svm_time = pickle.load(pickle_file)
predicted_data = pd.DataFrame()

for element in perma_elements_time:

    # Extract the best features for the element (second entry of its record)
    features = list(best_features_time[element][1])

    # Look up the time-adjusted SVM model stored for this element
    clfc = models_svm_time[element]

    # Predict this element from its own feature subset: y_pred
    y_pred = clfc.predict(features_time[features])

    predicted_data[element] = y_pred

# Row-wise mean over the element columns gives the compound PERMA prediction
predicted_data["PERMA"] = predicted_data.mean(axis=1)

# Compute and print R^2 and RMSE
print("R^2: {}".format(r2_score(target_time["PERMA"], predicted_data["PERMA"].values)))
rmse = np.sqrt(mean_squared_error(target_time["PERMA"], predicted_data["PERMA"].values))
print("Root Mean Squared Error: {}".format(rmse))

R^2: 0.08324680025811337
Root Mean Squared Error: 1.358633396606013
