In [3]:
import pandas as pd;
import plotly.plotly as plt;
import plotly.figure_factory as pltff;
import plotly.graph_objs as pltgo;
import numpy as np;
import seaborn as sns;



################
# DATA
################
df = pd.read_csv("https://raw.githubusercontent.com/maudnals/ess-data/master/ESS1-8e01-2.csv")
dfs_by_country = {
#     cleanup happiness since we'll ue it extensively. Values > 10 are used for N/A ans such.
    "DE": df[df.cntry == "DE"][df.happy > 10],
    "FR": df[df.cntry == "FR"][df.happy > 10],
    "FI": df[df.cntry == "FI"][df.happy > 10],
};



################
# UTILS.PLOT
################
color_DE = '#F1C122';
color_FI = '#1F94E5';
color_FR = '#C0189C';
marker_DE = dict(
                size = 10,
                color = color_DE,
                line = dict(
                    width = 2,
                )
            );
marker_FI = dict(
                size = 10,
                color = color_FI,
                line = dict(
                    width = 2,
                )
            );
marker_FR = dict(
                size = 10,
                color = color_FR,
                line = dict(
                    width = 2,
                )
            );



################
# UTILS.DATA
################
def mean_for_ess_round(essround, cntry, col_name):
    df = dfs_by_country[cntry];
    return df[df.essround == essround][col_name].mean();

def mean_for_month(month, cntry, col_name):
    df = dfs_by_country[cntry];
    return df[df.inwmm == month][col_name].mean();

def std_for_ess_round(essround, cntry, col_name):
    df = dfs_by_country[cntry];
    return df[df.essround == essround][col_name].std();

def mean_happiness_for_ess_round(essround, cntry):
    return mean_for_ess_round(essround, cntry, "happy");

def std_happiness_for_ess_round(essround, cntry):
    return std_for_ess_round(essround, cntry, "happy");




Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.



In [10]:
################
# HAPPINESS ACROSS TIMES FOR ALL COUNTRIES
################

rounds = list(range(1,9));
# happinesses = list(map(mean_happiness_for_ess_round, rounds));

# better, more python: list comprehension (source https://stackoverflow.com/questions/10973766/understanding-the-map-function)
happinesses_DE = [mean_happiness_for_ess_round(r, "DE") for r in rounds]
happinesses_FI = [mean_happiness_for_ess_round(r, "FI") for r in rounds]
happinesses_FR = [mean_happiness_for_ess_round(r, "FR") for r in rounds]

# Plot happinesses = f(ess round);
trace_DE = pltgo.Scatter(x = rounds, 
            y = happinesses_DE, 
            name = 'Germany',
            mode = 'lines', 
            marker = marker_DE);
trace_FR = pltgo.Scatter(x = rounds, 
            y = happinesses_FR, 
            name = 'France',
            mode = 'lines', 
            marker = marker_FR);
trace_FI = pltgo.Scatter(x = rounds, 
            y = happinesses_FI, 
            name = 'Finland',
            mode = 'lines', 
            marker = marker_FI);


    
fig = pltgo.Figure(
    data=[trace_DE, trace_FI, trace_FR], 
    layout=pltgo.Layout(
        paper_bgcolor='#0A010F', 
        plot_bgcolor="#0A010F", 
        title="Happiness = f(ESS Round)", 
        font = {"family": "Courier New", "color": "white"},
        xaxis = {"color": "grey", "title": "ESS Round # (every other year since 2002)", "gridcolor": "#2B2B2B"},
        yaxis = {"color": "grey", "title": "Happiness Level", "gridcolor": "#2B2B2B"} 
    ));

plt.iplot(fig)


Consider using IPython.display.IFrame instead



In [29]:
################
# STANDARD DEVIATION + VIOLIN
################

# list comprehension
# tuples: simpler than dicos!
mean_std_happiness_DE = [(mean_happiness_for_ess_round(r, "DE"), std_happiness_for_ess_round(r, "DE")) for r in rounds];
mean_std_happiness_FI = [(mean_happiness_for_ess_round(r, "FI"), std_happiness_for_ess_round(r, "FI")) for r in rounds];
mean_std_happiness_FR = [(mean_happiness_for_ess_round(r, "FR"), std_happiness_for_ess_round(r, "FR")) for r in rounds];

def mean_happiness(tupl):
    return tupl[0]

mean_std_happiness_DE.sort(key=mean_happiness)
mean_std_happiness_FI.sort(key=mean_happiness)
mean_std_happiness_FR.sort(key=mean_happiness)

trace_mstd_DE = pltgo.Scatter(x = [i[0] for i in mean_std_happiness_DE], 
            y = [i[1] for i in mean_std_happiness_DE],  
            mode = 'lines', 
            marker = marker_DE);
trace_mstd_FI = pltgo.Scatter(x = [i[0] for i in mean_std_happiness_FI], 
            y = [i[1] for i in mean_std_happiness_FI],  
            mode = 'lines', 
            marker = marker_FI);
trace_mstd_FR = pltgo.Scatter(x = [i[0] for i in mean_std_happiness_FR], 
            y = [i[1] for i in mean_std_happiness_FR],  
            mode = 'lines', 
            marker = marker_FR);


fig_violin_DE = pltgo.Figure(
    data=[{"type":"violin", "y": dfs_by_country["DE"][dfs_by_country["DE"].essround == 9]["happy"]}], 
    layout=pltgo.Layout(
        paper_bgcolor='#0A010F', 
        plot_bgcolor="#0A010F", 
        title="Violin DE", 
        font = {"family": "Courier New", "color": "white"},
    )
);

plt.iplot(fig_violin)



# fig = pltff.create_violin(y, title='Violin Plot', colors='#604d9e', paper_bgcolor='#0A010F', 
#         plot_bgcolor="#0A010F", 
#         font = {"family": "Courier New", "color": "white"})
# plt.iplot(fig, filename='violin-visual')



In [35]:
fig_violin_DE = pltgo.Figure(
    data=[{"type":"violin", "y": dfs_by_country["DE"][dfs_by_country["DE"].essround == 8]["happy"]}], 
    layout=pltgo.Layout(
        paper_bgcolor='#0A010F', 
        plot_bgcolor="#0A010F", 
        title="Violin DE", 
        font = {"family": "Courier New", "color": "white"},
    )
);

plt.iplot(fig_violin)

PlotlyRequestError: Hi there, you've reached the threshold of 100 combined image exports and chart saves per 24h period. If you need to raise your daily limit, please consider upgrading to a paid plan
(see: https://plot.ly/products/cloud ).

In [46]:
################
# COUNTRIES MOST IMPACTED BY SEASONAL VARIATIONS: N/U (not usable)
################

# v = [mean_for_month(m, "DE", "happy") for m in list(range(1, 12))]
# w = [mean_for_month(m, "FR", "happy") for m in list(range(1, 12))]
# v = [mean_for_month(m, "FI", "happy") for m in list(range(1, 12))]

# Data missing
# [7.255780346820809,
#  7.594713656387666,
#  6.916666666666667,
#  nan,
#  nan,
#  nan,
#  nan,
#  nan,
#  7.2,
#  7.424242424242424,
#  7.32289156626506]

In [1]:
################
# HAPPINESS BY AGE
################

happiness_by_age_DE = dfs_by_country["DE"].groupby(["agea"], as_index=False).mean();
happiness_by_age_FI = dfs_by_country["FI"].groupby(["agea"], as_index=False).mean();
happiness_by_age_FR = dfs_by_country["FR"].groupby(["agea"], as_index=False).mean();

happiness_by_age_DE.agea

tr_h_age_DE = pltgo.Scatter(x = happiness_by_age_DE.agea, 
            y = happiness_by_age_DE.happy, 
            name = 'Germany',
            mode = 'lines', 
            marker = marker_DE);

tr_h_age_FI = pltgo.Scatter(x = happiness_by_age_FI.agea, 
            y = happiness_by_age_FI.happy, 
            name = 'Finland',
            mode = 'lines', 
            marker = marker_FI);

tr_h_age_FR = pltgo.Scatter(x = happiness_by_age_FR.agea, 
            y = happiness_by_age_FR.happy, 
            name = 'France',
            mode = 'lines', 
            marker = marker_FR);

plt.iplot([tr_h_age_DE, tr_h_age_FI, tr_h_age_FR])

NameError: name 'dfs_by_country' is not defined

In [224]:
################
# HAPPINESS BY HOUSEHOLD SIZE
################

happiness_by_household_DE = dfs_by_country["DE"].groupby(["hhmmb"], as_index=False).mean();
happiness_by_household_FI = dfs_by_country["FI"].groupby(["hhmmb"], as_index=False).mean();
happiness_by_household_FR = dfs_by_country["FR"].groupby(["hhmmb"], as_index=False).mean();

tr_h_house_DE = pltgo.Scatter(x = happiness_by_household_DE.hhmmb, 
            y = happiness_by_household_DE.happy, 
            name = 'Germany',
            mode = 'lines', 
            marker = marker_DE);

tr_h_house_FI = pltgo.Scatter(x = happiness_by_household_FI.hhmmb, 
            y = happiness_by_household_FI.happy, 
            name = 'Finland',
            mode = 'lines', 
            marker = marker_FI);

tr_h_house_FR = pltgo.Scatter(x = happiness_by_household_FR.hhmmb, 
            y = happiness_by_household_FR.happy, 
            name = 'France',
            mode = 'lines', 
            marker = marker_FR);


plt.iplot([tr_h_house_DE, tr_h_house_FI, tr_h_house_FR])



In [63]:
################
# RESPONDANTS AGE DISTRIBUTION
################
# grouped_by_age_DE = dfs_by_country["DE"].groupby(["agea"], as_index=False);

# .size();
# happiness_by_household_FI = dfs_by_country["FI"].groupby(["agea"], as_index=False).mean();
# happiness_by_household_FR = dfs_by_country["FR"].groupby(["agea"], as_index=False).mean();

tr_ages_DE = pltgo.Histogram(
    x = dfs_by_country["DE"]['agea'].tolist(),
    name='DE',
    marker=dict(color = color_DE),
    opacity=0.75
)


tr_ages_FI = pltgo.Histogram(
    x = dfs_by_country["FI"]['agea'].tolist(),
    name='FI',
    marker=dict(color = color_FI),
    opacity=0.75
)


tr_ages_FR = pltgo.Histogram(
    x = dfs_by_country["FR"]['agea'].tolist(),
    name='FR',
    marker=dict(color = color_FR),
    opacity=0.75
)

plt.iplot([tr_ages_DE, tr_ages_FI, tr_ages_FR])


In [197]:
################
# HAPPINESS BY HUMAN VALUE
################

human_values = ["imprich", "ipshabt", "iprspot", "ipsuces", 
                "iphlppl", "ipudrst", "iplylfr", "impenv", "ipeqopt", 
                "impsafe", "ipfrule", "ipmodst" , "ipstrgv", "ipbhprp", "imptrad",
                "impdiff", "ipcrtiv", "ipgdtim","impfree", "ipadvnt", "impfun"
                ];

means_happiness_by_human_value = [dfs_by_country["DE"][dfs_by_country["DE"][i] < 3]["happy"].mean() for i in human_values];
means_happiness_by_human_value_df = pd.DataFrame(means_happiness_by_human_value);
# min-max normalization
means_happiness_by_human_value_normalized_df=(means_happiness_by_human_value_df-means_happiness_by_human_value_df.min())/(means_happiness_by_human_value_df.max()-means_happiness_by_human_value_df.min())

color_se = "red";
color_st = "green";
color_c = "blue";
color_o = "yellow";

colors_by_human_values = {
    "imprich": color_se, "ipshabt": color_se, "iprspot": color_se, "ipsuces": color_se, 
    "iphlppl": color_st, "ipudrst": color_st, "iplylfr": color_st, "impenv": color_st, "ipeqopt": color_st, 
    "impsafe": color_c, "ipfrule": color_c, "ipmodst": color_c , "ipstrgv": color_c, "ipbhprp": color_c, "imptrad": color_c,
     "impdiff": color_o, "ipcrtiv": color_o, "ipgdtim": color_o,"impfree": color_o, "ipadvnt": color_o, "impfun": color_o
}

means_happiness_by_human_value_normalized_df
plt.iplot([
            pltgo.Bar(
                x=human_values, 
                y=means_happiness_by_human_value_normalized_df[0].tolist(), 
                marker=dict(color=[colors_by_human_values[i] for i in human_values]))
        ]);


# takaway: you're most likely to be happy if you consider it very important
# you're least likely to be happy if you consider other people important
# happiness > 7
# average yellow valuation < 3
# happiness level = f(second most important group)

# let's look at the happy people who consider yellow group very important. 
# What is their second values? What do they value less?
# Which are the happiest
#  ++++++ TODO

In [220]:
################
# 2D HIST PLOTS: POPULATION DISTRIBUTION
################

data_histo_2d_happiness_age = [
    pltgo.Histogram2d(x=df_DE[df_DE["agea"] < 103][["agea"]]["agea"].tolist(), y=df_DE[["happy"]]["happy"].tolist(),
        colorscale=[[0, 'rgb(12,51,131)'], [0.25, 'rgb(10,136,186)'], [0.5, 'rgb(242,211,56)'], [0.75, 'rgb(242,143,56)'], [1, 'rgb(217,30,30)']]
    )
]
plt.iplot(data_histo_2d_happiness_age)

data_histo_2d_happiness_hhmmb = [
    pltgo.Histogram2d(x=df_DE[df_DE["hhmmb"] < 12][["hhmmb"]]["hhmmb"].tolist(), y=df_DE[["happy"]]["happy"].tolist(),
        colorscale=[[0, 'rgb(12,51,131)'], [0.25, 'rgb(10,136,186)'], [0.5, 'rgb(242,211,56)'], [0.75, 'rgb(242,143,56)'], [1, 'rgb(217,30,30)']]
    )
]

plt.iplot(data_histo_2d_happiness_hhmmb)






In [4]:
################
# CORRELATION: HAPPINESS VS OTHER FEATURES
################


corr_happiness_others = dfs_by_country["DE"][["agea", "happy", "sclmeet", "health", 
                             "hlthhmp", "hhmmb", "gndr", 
                             "partner", "chldhhe", 
                             "edulvla", "eduyrs", "wkhtot"]].corr();
corr_happiness_others.style.background_gradient()

Unnamed: 0,agea,happy,sclmeet,health,hlthhmp,hhmmb,gndr,partner,chldhhe,edulvla,eduyrs,wkhtot
agea,1.0,0.0727557,-0.0637979,0.215246,0.0376862,-0.0626363,0.119653,0.0910806,0.362676,0.415491,0.422205,0.155835
happy,0.0727557,1.0,0.0161843,-0.132306,-0.165053,-0.0651531,-0.104348,-0.0101929,-0.0826181,0.0680646,0.208943,0.247223
sclmeet,-0.0637979,0.0161843,1.0,0.346475,0.490439,0.394681,-0.178938,0.466212,-0.00519208,-0.0387233,-0.0873788,0.179148
health,0.215246,-0.132306,0.346475,1.0,0.322816,0.395769,-0.098202,0.241456,-0.07424,-0.030447,-0.128491,-0.219033
hlthhmp,0.0376862,-0.165053,0.490439,0.322816,1.0,0.627939,-0.219469,0.598444,-0.113807,0.0244814,-0.0538317,0.222877
hhmmb,-0.0626363,-0.0651531,0.394681,0.395769,0.627939,1.0,-0.144946,0.588543,-0.0285502,-0.0209122,-0.0663936,0.0607902
gndr,0.119653,-0.104348,-0.178938,-0.098202,-0.219469,-0.144946,1.0,-0.0793455,0.232133,0.10709,0.0921518,0.0190307
partner,0.0910806,-0.0101929,0.466212,0.241456,0.598444,0.588543,-0.0793455,1.0,0.0148036,-0.0887801,0.0715956,0.331215
chldhhe,0.362676,-0.0826181,-0.00519208,-0.07424,-0.113807,-0.0285502,0.232133,0.0148036,1.0,0.11033,0.0625817,0.0604511
edulvla,0.415491,0.0680646,-0.0387233,-0.030447,0.0244814,-0.0209122,0.10709,-0.0887801,0.11033,1.0,0.317931,0.105378


In [6]:
corr_happiness_values = dfs_by_country["DE"][["happy", "imprich", "ipshabt", "iprspot", "ipsuces", 
                "iphlppl", "ipudrst", "iplylfr", "impenv", "ipeqopt", 
                "impsafe", "ipfrule", "ipmodst" , "ipstrgv", "ipbhprp", "imptrad",
                "impdiff", "ipcrtiv", "ipgdtim","impfree", "ipadvnt", "impfun"
                ]].corr();
corr_happiness_values.style.background_gradient()

Unnamed: 0,happy,imprich,ipshabt,iprspot,ipsuces,iphlppl,ipudrst,iplylfr,impenv,ipeqopt,impsafe,ipfrule,ipmodst,ipstrgv,ipbhprp,imptrad,impdiff,ipcrtiv,ipgdtim,impfree,ipadvnt,impfun
happy,1.0,0.0235441,0.087142,-0.0102636,2.55931e-17,0.112282,0.112122,0.0879946,0.0957826,0.238328,-0.0104331,0.0496218,0.0891657,0.0843593,0.0772375,-0.0905794,0.121961,0.216586,0.0342529,0.0389298,-0.0671787,-0.0964416
imprich,0.0235441,1.0,0.342502,0.393944,0.312688,0.205066,0.213045,0.475294,0.179094,0.342284,0.348498,0.223379,0.202585,0.458969,0.270409,0.193754,0.351933,0.397906,0.413307,0.394909,0.457891,0.315684
ipshabt,0.087142,0.342502,1.0,0.459939,0.645474,0.612403,0.581349,0.443308,0.403076,0.551126,0.405315,0.471008,0.468471,0.41994,0.596671,0.252759,0.484567,0.587444,0.410099,0.490265,0.415091,0.482866
iprspot,-0.0102636,0.393944,0.459939,1.0,0.434468,0.635308,0.531292,0.596523,0.507101,0.631525,0.411618,0.489333,0.489915,0.502613,0.639418,0.324126,0.463306,0.47432,0.497905,0.394821,0.437311,0.434537
ipsuces,2.55931e-17,0.312688,0.645474,0.434468,1.0,0.588248,0.58462,0.547138,0.414415,0.473992,0.381954,0.436153,0.506531,0.451596,0.453241,0.38388,0.465665,0.447946,0.427148,0.48882,0.402092,0.500971
iphlppl,0.112282,0.205066,0.612403,0.635308,0.588248,1.0,0.805061,0.58842,0.714416,0.72679,0.418039,0.576266,0.623426,0.466986,0.757717,0.52664,0.59683,0.609413,0.488988,0.555576,0.372298,0.58511
ipudrst,0.112122,0.213045,0.581349,0.531292,0.58462,0.805061,1.0,0.572387,0.651848,0.732495,0.410541,0.491276,0.629689,0.378486,0.645568,0.489062,0.526471,0.659534,0.442557,0.707057,0.368052,0.627227
iplylfr,0.0879946,0.475294,0.443308,0.596523,0.547138,0.58842,0.572387,1.0,0.435309,0.590494,0.430568,0.336945,0.406399,0.549314,0.522423,0.37477,0.500375,0.455813,0.52617,0.546466,0.529559,0.429808
impenv,0.0957826,0.179094,0.403076,0.507101,0.414415,0.714416,0.651848,0.435309,1.0,0.657378,0.326441,0.440119,0.553395,0.420168,0.555344,0.621775,0.546122,0.49068,0.35886,0.505518,0.20251,0.427276
ipeqopt,0.238328,0.342284,0.551126,0.631525,0.473992,0.72679,0.732495,0.590494,0.657378,1.0,0.484721,0.438129,0.647383,0.527203,0.671924,0.431454,0.664403,0.565779,0.458661,0.480276,0.391003,0.376186


In [1]:
dfs_by_country["DE"][["happy", "imprich", "ipshabt", "iprspot", "ipsuces", 
                "iphlppl", "ipudrst", "iplylfr", "impenv", "ipeqopt", 
                "impsafe", "ipfrule", "ipmodst" , "ipstrgv", "ipbhprp", "imptrad",
                "impdiff", "ipcrtiv", "ipgdtim","impfree", "ipadvnt", "impfun"
                ]].corr()

NameError: name 'dfs_by_country' is not defined

In [None]:
dfs_by_country["DE"]

In [22]:
model = RandomForestRegressor(n_estimators=10, max_features=2)
# model.fit(features, labels)

NameError: name 'RandomForestRegressor' is not defined

In [None]:
# STANDARD DEVIATION, STATISTICAL SIGNIFICANCE
# HOUSEHOLD SIZE AND AGE AND HAPPINESS (heatmap)
# VALUES AND HAPPINESS
# VALUES ACROSS AGES
# VALUES: SAY AND DO
# MACHINE LEARNING