# More cleaning

In [1]:
# basic coding/ML tools
import numpy as np, pandas as pd, os, sys, scipy, xlrd, urllib, itertools, re, warnings
from jupyter_contrib_nbextensions.application import main
from ipywidgets import interact
from pandas.api.types import CategoricalDtype
from scipy import stats
from scipy.stats import ttest_ind

# ML tools
import sklearn, statsmodels.api as sm, statsmodels.formula.api as smf, pingouin as pg
from statsmodels.stats.anova import AnovaRM
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import preprocessing, metrics
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyRegressor
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_squared_log_error, make_scorer, confusion_matrix, plot_confusion_matrix, classification_report, multilabel_confusion_matrix

# visualization tools
import matplotlib.pyplot as plt, seaborn as sns, IPython.display
from statsmodels.graphics.factorplots import interaction_plot
from IPython.display import Image, HTML
%matplotlib inline
# default figure size (width, height) for every matplotlib figure in this notebook
plt.rcParams['figure.figsize'] = (14,4)

# set directory
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this
# folder exists; every relative file name used below is resolved against it.
os.chdir("/Users/home/Desktop/Research/Tylenol/Analysis")

# suppress ALL warnings for the rest of the session (action='ignore' hides them
# entirely; it does not show each warning once)
warnings.filterwarnings(action='ignore')
# never truncate columns/rows when a DataFrame is displayed
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Read in Data
Read in the data.

In [2]:
# Load both long-format rating tables (file names are relative to the working directory)
inducedLong = pd.read_csv('Fixed Induced.csv')
perceivedLong = pd.read_csv('Fixed Perceived.csv')

# Report the raw (rows, columns) of each table before any cleaning
print("original inducedLong shape:", inducedLong.shape)
print("original perceivedLong shape:", perceivedLong.shape)

original inducedLong shape: (4806, 33)
original perceivedLong shape: (9842, 33)


### Missing Data
We will delete observations with no ID or without a drug/placebo indication. We cannot use these observations for analysis.

In [3]:
# Rows of the induced table that have at least one missing value
rows_with_missing = inducedLong.isnull().any(axis=1)
null = inducedLong[rows_with_missing]
print("null shape:", null.shape)

# Inspection expressions: neither is the last statement of the cell, so nothing is rendered
null.head(5).T
inducedLong.isnull().sum().to_frame()

# Remove the 'Other' column, then discard rows that lack a participant ID
# or any drug/placebo information
required = ['FixedID', 'DrugCode', 'DrugPlacebo']
inducedLong = (
    inducedLong
    .drop(columns=['Other'])
    .dropna(subset=required, how='any')
)

print("new inducedLong shape:", inducedLong.shape)

null shape: (4557, 33)
new inducedLong shape: (4554, 32)


In [4]:
# Rows of the perceived table that have at least one missing value
rows_with_missing = perceivedLong.isnull().any(axis=1)
null = perceivedLong[rows_with_missing]
print("null shape:", null.shape)

# Per-column missing counts (not rendered: not the last expression of the cell)
perceivedLong.isnull().sum().to_frame()

# Remove the 'Other' column, then discard rows that lack a participant ID
# or any drug/placebo information
required = ['FixedID', 'DrugCode', 'DrugPlacebo']
perceivedLong = (
    perceivedLong
    .drop(columns=['Other'])
    .dropna(subset=required, how='any')
)

print("new perceivedLong shape:", perceivedLong.shape)

null shape: (9287, 33)
new perceivedLong shape: (9250, 32)


### Check for Duplicates
Duplicates would indicate an error with the participant ID number fed into the surveys on Qualtrics. These would lead to biased data, so we get rid of them. 

In [5]:
# Number of distinct induced stimuli = the most rows one participant may legitimately have
n_stimuli = len(inducedLong['Stimulus'].unique())
print("Number of Induced Stimuli: ", n_stimuli, "\n")

# Largest participant ID present in the data
maxid = int(inducedLong['FixedID'].max())

# One row per possible participant ID (1..maxid) holding that ID's row count
participants = pd.DataFrame({'Potential': range(1, maxid + 1)}).set_index('Potential')
participants['Induced'] = inducedLong.groupby('FixedID')['DrugCode'].count()

# IDs with more rows than there are stimuli must contain duplicated entries
too_many = participants['Induced'] > n_stimuli
iError = participants.index[too_many.to_numpy()].tolist()
print("induced errors in stimuli: ", iError, "\n")

# Remove every row belonging to an offending ID
    # it must be due to some kind of coding mistake or mistake with Qualtrics entries
bad_rows = inducedLong.index[inducedLong['FixedID'].isin(iError)]
inducedLong = inducedLong.drop(index=bad_rows)

print("fixed inducedLong shape:", inducedLong.shape, "\n")

# Re-run the count on the cleaned table; the list printed below should be empty
participants['InducedFixed'] = inducedLong.groupby('FixedID')['DrugCode'].count()
still_too_many = participants['InducedFixed'] > len(inducedLong['Stimulus'].unique())
iErrorCheck = participants.index[still_too_many.to_numpy()].tolist()
print("induced errors in stimuli: ", iErrorCheck, "\n")

Number of Induced Stimuli:  18 

induced errors in stimuli:  [55, 91, 92, 117, 133, 172] 

fixed inducedLong shape: (4338, 32) 

induced errors in stimuli:  [] 



In [6]:
# Number of distinct perceived stimuli = the most rows one participant may legitimately have
n_stimuli = len(perceivedLong['Stimulus'].unique())
print("Number of Perceived Stimuli: ", n_stimuli, "\n")

# Largest participant ID present in the data
maxid = int(perceivedLong['FixedID'].max())

# One row per possible participant ID (1..maxid) holding that ID's row count
participants = pd.DataFrame({'Potential': range(1, maxid + 1)}).set_index('Potential')
participants['Perceived'] = perceivedLong.groupby('FixedID')['DrugCode'].count()

# IDs with more rows than there are stimuli must contain duplicated entries
too_many = participants['Perceived'] > n_stimuli
pError = participants.index[too_many.to_numpy()].tolist()
print("perceived errors in stimuli: ", pError, "\n")

# Remove every row belonging to an offending ID
    # it must be due to some kind of coding mistake or mistake with Qualtrics entries
bad_rows = perceivedLong.index[perceivedLong['FixedID'].isin(pError)]
perceivedLong = perceivedLong.drop(index=bad_rows)

print("fixed perceivedLong shape:", perceivedLong.shape, "\n")

# Re-run the count on the cleaned table; the list printed below should be empty
participants['PerceivedFixed'] = perceivedLong.groupby('FixedID')['DrugCode'].count()
still_too_many = participants['PerceivedFixed'] > len(perceivedLong['Stimulus'].unique())
pErrorCheck = participants.index[still_too_many.to_numpy()].tolist()
print("perceived errors in stimuli: ", pErrorCheck)

Number of Perceived Stimuli:  37 

perceived errors in stimuli:  [55, 91, 92, 117, 275] 

fixed perceivedLong shape: (8880, 32) 

perceived errors in stimuli:  []


### Add summary columns
We want to differentiate blunting effects between ***Positive*** and ***Negative*** stimuli, across ***Music***, ***Speech***, and ***Natural Sounds***, and by ***Arousal and Valence*** scores.

We will add a new column to summarize each of these features based on emotional theory and *a priori* stimulus analysis.

In [7]:
# Each summary column is derived from keywords in the stimulus name.
# Rules map label -> regex; they are tried in order and the first match wins.
stimulus = inducedLong['Stimulus']

# PosNeg: Positive or Negative Emotions (anything unmatched is 'Neutral')
valence_rules = {
    'Negative': 'Negative|Sad|Fear',
    'Positive': 'Positive|Happy|Tender',
}
inducedLong['PosNeg'] = np.select(
    [stimulus.str.contains(pattern, regex=True, na=False) for pattern in valence_rules.values()],
    list(valence_rules),
    default='Neutral')

# SoundType: Music, Speech, Natural Sounds (anything unmatched is 'Natural Sounds')
    # note though: no speech in induced
sound_rules = {
    'Music': 'Music',
    'Speech': 'Speech',
}
inducedLong['SoundType'] = np.select(
    [stimulus.str.contains(pattern, regex=True, na=False) for pattern in sound_rules.values()],
    list(sound_rules),
    default='Natural Sounds')

# Russell: Circumplex model quadrant (anything unmatched is 'Neutral')
russell_rules = {
    'Neg-Valence Low-Arousal': 'Negative-Valence Low|Sad',
    'Neg-Valence High-Arousal': 'Negative-Valence High|Fear',
    'Pos-Valence Low-Arousal': 'Positive-Valence Low|Tender',
    'Pos-Valence High-Arousal': 'Positive-Valence High|Happy',
}
inducedLong['Russell'] = np.select(
    [stimulus.str.contains(pattern, regex=True, na=False) for pattern in russell_rules.values()],
    list(russell_rules),
    default='Neutral')

# Crosstabs of stimulus against each new column (only the last one is rendered)
pd.crosstab(index=inducedLong['Stimulus'], columns=inducedLong['PosNeg'])
pd.crosstab(index=inducedLong['Stimulus'], columns=inducedLong['Russell'])
pd.crosstab(index=inducedLong['Stimulus'], columns=inducedLong['SoundType'])

SoundType,Music,Natural Sounds
Stimulus,Unnamed: 1_level_1,Unnamed: 2_level_1
Fear Music 1,241,0
Fear Music 2,241,0
Happy Music 1,241,0
Happy Music 2,241,0
Negative-Valence High-Arousal Human,0,241
Negative-Valence High-Arousal Non-human,0,241
Negative-Valence Low-Arousal Human,0,241
Negative-Valence Low-Arousal Non-human,0,241
Neutral Human,0,241
Neutral Non-human,0,241


In [8]:
# Each summary column is derived from keywords in the stimulus name.
# Rules map label -> regex; they are tried in order and the first match wins.
stimulus = perceivedLong['Stimulus']

# PosNeg: Positive or Negative Emotions (anything unmatched is 'Neutral')
valence_rules = {
    'Negative': 'Negative|Sad|Fear',
    'Positive': 'Positive|Happy|Tender',
}
perceivedLong['PosNeg'] = np.select(
    [stimulus.str.contains(pattern, regex=True, na=False) for pattern in valence_rules.values()],
    list(valence_rules),
    default='Neutral')

# SoundType: Music, Speech, Natural Sounds (anything unmatched is 'Natural Sounds')
sound_rules = {
    'Music': 'Music',
    'Speech': 'Speech',
}
perceivedLong['SoundType'] = np.select(
    [stimulus.str.contains(pattern, regex=True, na=False) for pattern in sound_rules.values()],
    list(sound_rules),
    default='Natural Sounds')

# Russell: Circumplex model quadrant (anything unmatched is 'Neutral')
russell_rules = {
    'Neg-Valence Low-Arousal': 'Negative-Valence Low|Sad',
    'Neg-Valence High-Arousal': 'Negative-Valence High|Fear',
    'Pos-Valence Low-Arousal': 'Positive-Valence Low|Tender',
    'Pos-Valence High-Arousal': 'Positive-Valence High|Happy',
}
perceivedLong['Russell'] = np.select(
    [stimulus.str.contains(pattern, regex=True, na=False) for pattern in russell_rules.values()],
    list(russell_rules),
    default='Neutral')

# Crosstabs of stimulus against each new column (only the last one is rendered)
pd.crosstab(index=perceivedLong['Stimulus'], columns=perceivedLong['PosNeg'])
pd.crosstab(index=perceivedLong['Stimulus'], columns=perceivedLong['Russell'])
pd.crosstab(index=perceivedLong['Stimulus'], columns=perceivedLong['SoundType'])

SoundType,Music,Natural Sounds,Speech
Stimulus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fear Music 1,240,0,0
Fear Music 2,240,0,0
Fear Music 3,240,0,0
Fear Speech 1,0,0,240
Fear Speech 2,0,0,240
Fear Speech 3,0,0,240
Happy Music 1,240,0,0
Happy Music 2,240,0,0
Happy Music 3,240,0,0
Happy Speech 1,0,0,240


### Write out cleaned dfs

In [9]:
# Renumber rows 0..n-1, then save the cleaned induced table without the index column
inducedLong = inducedLong.reset_index(drop=True)
inducedLong.to_csv('cleaned_inducedLong.csv', index=False, header=True)

In [10]:
# Renumber rows 0..n-1, then save the cleaned perceived table without the index column
perceivedLong = perceivedLong.reset_index(drop=True)
perceivedLong.to_csv('cleaned_perceivedLong.csv', index=False, header=True)

# Pos/Neg/Arousal DFs

In [11]:
# Load the modelling table used by every cell below.
# NOTE(review): 'model.csv' is not written anywhere in this notebook -- presumably it
# comes from a separate modelling step; confirm it exists in the working directory.
model = pd.read_csv('model.csv')

In [12]:
# Names of the columns from 'Anger' through 'Neutral' (also reused by the next two cells)
emotions = model.loc[:, 'Anger':'Neutral'].columns.tolist()

# Keep every column except the other two rating columns, the one-hot
# 'Stimulus...' columns, and the emotion columns listed above
other_ratings = ('Negative', 'Arousal')
colList = [
    name for name in model.columns
    if name not in other_ratings
    and not name.startswith('Stimulus')
    and name not in emotions
]

# 'Positive' becomes the generic outcome column 'Ratings'
positive = (
    model[colList]
    .rename(columns={"Positive": "Ratings"})
    .reset_index(drop=True)
)
positive.to_csv('positive.csv', index=False, header=True)

In [13]:
# Keep every column except the other two rating columns, the one-hot
# 'Stimulus...' columns, and the emotion columns (`emotions` comes from the previous cell)
other_ratings = ('Positive', 'Arousal')
colList = [
    name for name in model.columns
    if name not in other_ratings
    and not name.startswith('Stimulus')
    and name not in emotions
]

# 'Negative' becomes the generic outcome column 'Ratings'
negative = (
    model[colList]
    .rename(columns={"Negative": "Ratings"})
    .reset_index(drop=True)
)
negative.to_csv('negative.csv', index=False, header=True)

In [14]:
# Keep every column except the other two rating columns, the one-hot
# 'Stimulus...' columns, and the emotion columns (`emotions` comes from an earlier cell)
other_ratings = ('Positive', 'Negative')
colList = [
    name for name in model.columns
    if name not in other_ratings
    and not name.startswith('Stimulus')
    and name not in emotions
]

# 'Arousal' becomes the generic outcome column 'Ratings'; rows with any
# missing value are discarded before the index is renumbered
arousal = (
    model[colList]
    .rename(columns={"Arousal": "Ratings"})
    .dropna()
    .reset_index(drop=True)
)
arousal.to_csv('arousal.csv', index=False, header=True)

In [15]:
# Tag each table with the rating it carries so the rows stay distinguishable once stacked
for frame, label in ((positive, "Positive"), (negative, "Negative"), (arousal, "Arousal")):
    frame['RatingType'] = label

# Shapes of the three tables (RatingType column included)
print("positive shape:", positive.shape)
print("negative shape:", negative.shape)
print("arousal shape:", arousal.shape, "\n")

# (an optional pairwise check that the three tables share column names was disabled here)

# Stack the three tables row-wise, one-hot encode RatingType, renumber rows, and save
alldata = pd.get_dummies(
    pd.concat([positive, negative, arousal]),
    columns=['RatingType'],
).reset_index(drop=True)
alldata.to_csv('alldata.csv', index=False, header=True)

positive shape: (13218, 78)
negative shape: (13218, 78)
arousal shape: (8880, 78) 



### Wide DF

In [16]:
# Work on an explicit copy of `model`: later cells add and remove columns on `new`,
# and a frame obtained by indexing (`model[colList]`) makes those writes a
# chained-assignment hazard that the global warnings filter would hide.
new = model.copy()

# (row position, column position) of every cell whose value equals 1.
# This matches ANY column holding a 1, not only one-hot indicator columns;
# later cells narrow `vals` down by column-name prefix.
row, col = np.where(new.values == 1)

# translate column positions into column names
colname = new.columns[col]

# long table of hits: one record per (row position, column name) pair
vals = pd.DataFrame({'row': row, 'cols': np.asarray(colname)})

In [17]:
# Hits whose column name starts with 'Stimulus', keyed by the row they occurred in
stim = vals[vals['cols'].str.startswith('Stimulus')].set_index('row')

# Label column: the matching indicator's name with everything up to 'Stimulus_' removed
new['Stimulus'] = stim['cols'].replace(r'^.*Stimulus_', '', regex=True)

# delete the Stimulus one-hot encoded cols
new = new.drop(columns=[name for name in new.columns if "Stimulus_" in name])

# reset index
new = new.reset_index(drop=True)

In [18]:
# Hits whose column name starts with 'PosNeg', keyed by the row they occurred in
posneg = vals[vals['cols'].str.startswith('PosNeg')].set_index('row')

# Label column: the matching indicator's name with everything up to 'PosNeg_' removed
new['PosNeg'] = posneg['cols'].replace(r'^.*PosNeg_', '', regex=True)

# delete the PosNeg one-hot encoded cols
new = new.drop(columns=[name for name in new.columns if "PosNeg_" in name])

# reset index
new = new.reset_index(drop=True)

In [19]:
# Hits whose column name starts with 'HighLow', keyed by the row they occurred in
highlow = vals[vals['cols'].str.startswith('HighLow')].set_index('row')

# Label column: the matching indicator's name with everything up to 'HighLow_' removed
new['HighLow'] = highlow['cols'].replace(r'^.*HighLow_', '', regex=True)

# delete the HighLow one-hot encoded cols
new = new.drop(columns=[name for name in new.columns if "HighLow_" in name])

# reset index
new = new.reset_index(drop=True)

In [20]:
# Hits whose column name starts with 'SoundType', keyed by the row they occurred in
soundtype = vals[vals['cols'].str.startswith('SoundType')].set_index('row')

# Label column: the matching indicator's name with everything up to 'SoundType_' removed
new['SoundType'] = soundtype['cols'].replace(r'^.*SoundType_', '', regex=True)

# delete the SoundType one-hot encoded cols
new = new.drop(columns=[name for name in new.columns if "SoundType_" in name])

# reset index
new = new.reset_index(drop=True)

In [21]:
# Hits whose column name starts with 'PoliticalParty', keyed by the row they occurred in
pp = vals[vals['cols'].str.startswith('PoliticalParty')].set_index('row')

# Label column: the matching indicator's name with everything up to 'PoliticalParty_' removed
new['PoliticalParty'] = pp['cols'].replace(r'^.*PoliticalParty_', '', regex=True)

# delete the PoliticalParty one-hot encoded cols
new = new.drop(columns=[name for name in new.columns if "PoliticalParty_" in name])

# reset index
new = new.reset_index(drop=True)

In [22]:
# Hits whose column name starts with 'PreferredMeds', keyed by the row they occurred in
meds = vals[vals['cols'].str.startswith('PreferredMeds')].set_index('row')

# Label column: the matching indicator's name with everything up to 'PreferredMeds_' removed
new['PreferredMeds'] = meds['cols'].replace(r'^.*PreferredMeds_', '', regex=True)

# delete the PreferredMeds one-hot encoded cols
new = new.drop(columns=[name for name in new.columns if "PreferredMeds_" in name])

# reset index
new = new.reset_index(drop=True)

In [23]:
# Hits whose column name starts with 'Gender', keyed by the row they occurred in
gender = vals[vals['cols'].str.startswith('Gender')].set_index('row')

# Label column: the matching indicator's name with everything up to 'Gender_' removed
new['Gender'] = gender['cols'].replace(r'^.*Gender_', '', regex=True)

# delete the Gender one-hot encoded cols
new = new.drop(columns=[name for name in new.columns if "Gender_" in name])

# reset index
new = new.reset_index(drop=True)

In [24]:
# Reorder columns purely by position: columns 0-1 stay first, columns 82-85 move
# directly after them, then columns 2-81, then columns 86-88.
# Only positions 0-88 are selected, so any column beyond position 88 would be dropped.
# NOTE(review): these hard-coded positions depend on the exact column layout produced
# by the cells above -- confirm they still point at the intended columns if that changes.
new = new.iloc[:, np.r_[0,1,82:86,2:82,86:89]]

In [25]:
# Per-participant questionnaire table.
# The columns 'MedsEffectiveness'..'Gender' are taken from `new`, and FixedID (as a
# number) and DrugPlacebo are appended via .assign(), which builds a new frame instead
# of writing into a slice of `new` (the original did the latter, a chained-assignment
# hazard hidden by the global warnings filter).
wideQ = (
    new.loc[:, 'MedsEffectiveness':'Gender']
    .assign(FixedID=pd.to_numeric(new['FixedID']), DrugPlacebo=new['DrugPlacebo'])
    # move the two appended columns (positions 55 and 56) to the front;
    # this relies on the slice above holding exactly 55 columns
    .iloc[:, np.r_[55, 56, 0:55]]
    # collapse identical rows (presumably leaving one row per participant -- verify)
    .drop_duplicates()
    .sort_values(by=['FixedID'])
    .reset_index(drop=True)
)


In [26]:
# Long table of perceived-emotion ratings: one row per participant x stimulus x rating type.
# Rows with Locus == 0 are labelled "Perceived".
perceived_rows = new[new['Locus'] == 0]

perceived_parts = []
for rating_type in ['Positive', 'Negative', 'Arousal', 'Familiarity']:
    part = (
        perceived_rows[['FixedID', 'DrugPlacebo', 'Stimulus', rating_type]]
        .assign(
            FixedID=lambda frame: pd.to_numeric(frame['FixedID']),
            Locus="Perceived",
            RatingType=rating_type,
        )
        # every rating type shares the single value column 'Rating'
        .rename(columns={rating_type: "Rating"})
    )
    perceived_parts.append(part)

# rbind
wideP = pd.concat(perceived_parts).sort_values(by=['FixedID'])
wideP = wideP.reset_index(drop=True)


In [27]:
# Long table of induced-emotion ratings: one row per participant x stimulus x rating type.
# Rows with Locus == 1 are labelled "Induced"; no Arousal rating is included for this block.
induced_rows = new[new['Locus'] == 1]

induced_parts = []
for rating_type in ['Positive', 'Negative', 'Familiarity']:
    part = (
        induced_rows[['FixedID', 'DrugPlacebo', 'Stimulus', rating_type]]
        .assign(
            FixedID=lambda frame: pd.to_numeric(frame['FixedID']),
            Locus="Induced",
            RatingType=rating_type,
        )
        # every rating type shares the single value column 'Rating'
        .rename(columns={rating_type: "Rating"})
    )
    induced_parts.append(part)

# rbind
wideI = pd.concat(induced_parts).sort_values(by=['FixedID'])
wideI = wideI.reset_index(drop=True)



In [29]:
# rbind: stack the perceived and induced long tables, ordered by participant
widePI = pd.concat([wideP, wideI])
widePI = widePI.sort_values(by=['FixedID'])
widePI.reset_index(inplace = True, drop = True) 

# unstack: pivot long -> wide one key at a time (Locus, then RatingType, then Stimulus).
# After each unstack the two-part column labels are flattened by joining them with "_",
# and to_records() turns the remaining index levels back into ordinary columns.
widePI = widePI.set_index(['FixedID', 'DrugPlacebo', 'Stimulus', 'RatingType', 'Locus']).unstack(level = 4)
widePI.columns = pd.Index(widePI.columns).str.join("_")
widePI = pd.DataFrame(widePI.to_records())

widePI = widePI.set_index(['FixedID', 'DrugPlacebo', 'Stimulus', 'RatingType']).unstack(level = 3)
widePI.columns = pd.Index(widePI.columns).str.join("_")
widePI = pd.DataFrame(widePI.to_records())

widePI = widePI.set_index(['FixedID', 'DrugPlacebo', 'Stimulus']).unstack(level = 2)
widePI.columns = pd.Index(widePI.columns).str.join("_")
widePI = pd.DataFrame(widePI.to_records())

# remove the literal "Rating_" prefix from columns.
# str.lstrip('Rating_') is NOT a prefix remover: it strips any leading run of the
# characters R, a, t, i, n, g and _, so it would also eat the start of a name that
# happens to begin with those letters. An anchored regex removes exactly the prefix.
widePI.columns = widePI.columns.str.replace(r'^Rating_', '', regex=True)

# examine the distinct per-column missing-value counts (not rendered mid-cell)
nans = pd.DataFrame(widePI.isnull().sum())
nans[0].unique()

# keep only columns with at least len(widePI)-5 non-missing values, i.e. drop every
# column that has more than 5 missing entries (stimulus/rating combinations that
# weren't part of the study come out entirely missing)
widePI = widePI.dropna(thresh=len(widePI)-5, axis=1)

# look at rest of nans
null = widePI[widePI.isnull().any(axis=1)]
print("null shape:", null.shape)
    # nans all come from 7 participants: 
    # 4 did the induced, but not perceived block and 3 did the perceived, but not induced block
    
# impute missing values with median for that column (every column except FixedID).
# The original test `i not in ('FixedID')` was a substring check against the string
# 'FixedID', not tuple membership; compare the name directly instead.
colList = [i for i in widePI.columns if i != 'FixedID']
widePI[colList] = widePI[colList].apply(lambda x: x.fillna(x.median()))

# check
null = pd.DataFrame(widePI.isnull().sum() > 0)
print("null values: ", null[null[0] == True])
    # no missing values now
print("widePI shape:", widePI.shape)

# reset indices
widePI.reset_index(inplace = True, drop = True) 

null shape: (7, 204)
null values:  Empty DataFrame
Columns: [0]
Index: []
widePI shape: (244, 204)


In [30]:
# Both tables are expected to have one row per participant, so their row counts should agree
same_row_count = wideQ.shape[0] == widePI.shape[0]
print("shapes equal? ", same_row_count)

# Join questionnaire and rating columns on participant ID and condition
# (default inner join: rows without a partner in the other table are dropped)
wide = wideQ.merge(widePI, on=['FixedID', 'DrugPlacebo'])
print("wide shape: ", wide.shape)

shapes equal?  True
wide shape:  (244, 259)


In [31]:
# Save the merged wide table (questionnaire + rating columns) without the index column
wide.to_csv('wide_data_final.csv', index=False)

### Emotions DF

In [33]:
# Build the emotions table: start from `new`, leave out identifier/demographic columns,
# then remove the 'MedsEffectiveness'..'YearUniversity' and 'Race_Asian'..'Race_White' blocks.
# NOTE: `emotions` was a list of column names in an earlier cell; from here on it is a DataFrame.
emotionsList = [i for i in list(new.columns) if i not in ('FixedID', 'EarlyFamilyEnvironment', 'Gender', 'PoliticalParty', 'PreferredMeds')]
emotions = new[emotionsList]
emotions = emotions.drop(columns = list(emotions.loc[:, 'MedsEffectiveness':'YearUniversity'].columns))
emotions = emotions.drop(columns = list(emotions.loc[:, 'Race_Asian':'Race_White'].columns))

# rename for convenience: replace the 0/1 codes with readable labels.
# The original used `emotions[col].replace(..., inplace=True)`, which mutates a column
# selected from the frame; under pandas copy-on-write that no longer updates `emotions`.
# Assigning the replaced columns back is explicit and works on every pandas version.
emotions = emotions.assign(
    DrugPlacebo=emotions['DrugPlacebo'].replace({0: "Placebo", 1: "Drug"}),
    Locus=emotions['Locus'].replace({0: "Perceived", 1: "Induced"}),
)

emotions.to_csv('emotions.csv', index=False)

### Scratchpad

In [None]:
# # OLS regression with lasso regularization
# # There's no way to look at standard errors, p-values, R^2, etc. because the theory for these values is still being developed among statisticians.

# lassomodel = sm.OLS(y, X)
# lassoresult = lassomodel.fit_regularized('elastic_net', L1_wt = 1.0) # L1_wt = 0.0 -- ridge (to 1.0 -- lasso)

In [None]:
# # Mixed Model
# # FixedID (the participant ID numbers) will be used as the grouping variable (the random effect)

# hierarchicalmodel = sm.MixedLM(y, X, positive['FixedID'])
# hierarchicalresult = hierarchicalmodel.fit()
# print(hierarchicalresult.summary())

#### Conclusions:
* **The measured variables explain 54% of the variance in positive emotion ratings (adjusted R^2 = 0.54)**
* **The ingestion of acetaminophen blunted ratings of positive emotions compared to the ingestion of a placebo.**

|	Name	|	Coefficient	|	p	|	Interpretation	|
|	------	|	------	|	------	|	------	|
| 	***DrugPlacebo***	| 	***-0.2338***	| 	***0***	| 	*Those who **took acetaminophen** rated positive emotions as **less intense** than those who took the placebo.* ***Namely, acetaminophen blunted the emotional responses.***	| 
| 	PosNeg_Positive	| 	2.9338	| 	0	| 	**Positively-valenced stimuli** resulted in comparatively **high** positive emotion ratings.	| 
| 	PosNeg_Negative	| 	-1.8991	| 	0	| 	**Negatively-valenced stimuli** resulted in comparatively **low** positive emotion ratings.	| 
| 	HighLow_High	| 	0.8216	| 	0	| 	Stimuli that exhibit **high arousal** resulted in comparatively **more intense** positive emotion ratings.	| 
| 	SoundType_Music	| 	0.7015	| 	0	| 	**Musical stimuli** resulted in comparatively **higher ratings of** positive emotion.	| 
| 	Familiarity	| 	0.4672	| 	0	| 	Those who are more **familiar** with the (musical) stimuli rated positive emotions as **more intense.**	| 
| 	A	| 	0.2752	| 	0	| 	Those who are more **agreeable** rated emotions as relatively **more positive** than those who score less high on this personality dimension.	| 
| 	PD	| 	0.2485	| 	0	| 	Those who score higher on the **Personal Distress component of empathy** rated emotions as relatively **more positive** than those who score less high on this trait.	| 
| 	FS	| 	-0.1584	| 	0	| 	Those who score higher on the **Fantasy component of empathy** rated emotions as relatively **less positive** than those who score less high on this trait.	| 
| 	Locus	| 	-0.1575	| 	0.001	| 	Compared to perceived emotion ratings, **induced emotion ratings** were **less positive** (Locus is coded 0 = Perceived, 1 = Induced, and the coefficient is negative).	| 
| 	N	| 	-0.1498	| 	0	| 	Those who are more **neurotic** rated emotions as relatively **less positive** than those who score less high on this personality dimension.	| 
| 	Intense	| 	-0.1285	| 	0	| 	Those who **prefer "Intense" music** (Rock, Punk, Alternative, Heavy Metal) rated emotions as relatively **less positive** (broadly, in response to all stimuli) than those who do not like this genre of music.	| 
| 	E	| 	0.1282	| 	0	| 	Those who are more **extraverted** rated emotions as relatively **more positive** than those who score less high on this personality dimension.	| 
| 	Sophisticated	| 	0.1144	| 	0	| 	Those who **prefer "Sophisticated" music** (Blues, Jazz, Bluegrass, Folk, Classical, Gospel, Opera) rated emotions as relatively **more positive** (broadly, in response to all stimuli) than those who do not like this genre of music.	| 
| 	Nostalgia	| 	-0.0659	| 	0.001	| 	Those who scored high on **nostalgia** gave relatively **less intense** positive ratings (although this effect is small).	|

