# Workbook - data processing 

## Retrieving Data

H0: There is no difference in Big Five test scores between the general population and celebrities.

H1: There is a significant difference in Big Five test scores between the general population and celebrities.

EXT1    P   I am the life of the party.
EXT2    N   I don't talk a lot.
EXT3    P   I feel comfortable around people.
EXT4    N   I keep in the background.
EXT5    P   I start conversations.
EXT6    N   I have little to say.
EXT7    P   I talk to a lot of different people at parties.
EXT8    N   I don't like to draw attention to myself.
EXT9    P   I don't mind being the center of attention.
EXT10   N   I am quiet around strangers.
EST1    P   I get stressed out easily.
EST2    N   I am relaxed most of the time.
EST3    P   I worry about things.
EST4    N   I seldom feel blue.
EST5    P   I am easily disturbed.
EST6    P   I get upset easily.
EST7    P   I change my mood a lot.
EST8    P   I have frequent mood swings.
EST9    P   I get irritated easily.
EST10   P   I often feel blue.
AGR1    N   I feel little concern for others.
AGR2    P   I am interested in people.
AGR3    N   I insult people.
AGR4    P   I sympathize with others' feelings.
AGR5    N   I am not interested in other people's problems.
AGR6    P   I have a soft heart.
AGR7    N   I am not really interested in others.
AGR8    P   I take time out for others.
AGR9    P   I feel others' emotions.
AGR10   P   I make people feel at ease.
CSN1    P   I am always prepared.
CSN2    N   I leave my belongings around.
CSN3    P   I pay attention to details.
CSN4    N   I make a mess of things.
CSN5    P   I get chores done right away.
CSN6    N   I often forget to put things back in their proper place.
CSN7    P   I like order.
CSN8    N   I shirk my duties.
CSN9    P   I follow a schedule.
CSN10   P   I am exacting in my work.
OPN1    P   I have a rich vocabulary.
OPN2    N   I have difficulty understanding abstract ideas.
OPN3    P   I have a vivid imagination.
OPN4    N   I am not interested in abstract ideas.
OPN5    P   I have excellent ideas.
OPN6    N   I do not have a good imagination.
OPN7    P   I am quick to understand things.
OPN8    P   I use difficult words.
OPN9    P   I spend time reflecting on things.
OPN10   P   I am full of ideas.

In [None]:
#Import libraries
import pandas as pd
import numpy as np
from scipy.stats import ttest_rel
from scipy.stats import ttest_1samp
from scipy.stats import ttest_ind
import os
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import geopandas as gpd
import pycountry

In [None]:
# General population Big 5 results.
# The file is read as tab-separated text from a path relative to the
# notebook's working directory.
general = pd.read_csv('../data/datageneral.csv', sep='\t')

In [None]:
# Exploring data - general: preview the first rows.
general.head()

In [None]:
# Exploring data - general: column dtypes and non-null counts, used to spot
# NaN / missing values.
general.info()

In [None]:
# Cleaning - general population.
# 5 categories: op; co; ex; ag; ne
# Answer scale: 1 = disagree; 3 = neutral; 5 = agree
# A 0 is treated as a missing answer: it is converted to NaN and every row
# containing at least one NaN is dropped.
# NOTE(review): replace(0, ...) is applied to every column, not only the
# question columns - confirm no other column uses 0 as a legitimate value.
start_rows = len(general)
general = general.replace(0, np.nan).dropna(axis=0).reset_index(drop=True)
remove_rows = start_rows - len(general)
print('Removed', remove_rows, 'rows that had incomplete pieces of data.')
# The value is a percentage, so the message now carries the '%' sign.
print('This was', (remove_rows / start_rows * 100), '% of the total data.')
print('Number of countries:', general.country.nunique())

In [None]:
# Respondent counts per country, keyed by ISO alpha-3 code, plus their rank.

# Lookup table: ISO alpha-2 code -> ISO alpha-3 code.
country_dict = {}
for entry in pycountry.countries:
    country_dict[entry.alpha_2] = entry.alpha_3

# One-row frame: a column per country, holding its number of respondents.
country_counts = general.country.value_counts()
countries = pd.DataFrame(country_counts).T
countries = countries.drop('NONE', axis=1)  # 'NONE' = no country recorded
countries = countries.rename(columns=country_dict, index={'country': 'count'})

# Long format (one row per country) with the rank of each count.
countries_rank = countries.T.rename_axis('iso_a3').reset_index()
countries_rank['rank'] = countries_rank['count'].rank()
countries_rank.T

In [None]:
# World map of the countries that completed the assessment, coloured by the
# rank of their respondent count.
sns.set_style("white")

# NOTE(review): `gpd.datasets` is deprecated in recent GeoPandas releases -
# confirm it still resolves in the target environment.
file = gpd.datasets.get_path('naturalearth_lowres')
world = gpd.read_file(file)
# Left-join the per-country ranks onto the world shapes; every NaN left by
# the join (e.g. countries without respondents) is replaced with 0.
world = pd.merge(world, right=countries_rank, how='left', on='iso_a3').fillna(0)
fig, ax = plt.subplots(figsize=(20,10))
ax.set_xticks([])
ax.set_yticks([])
ax.set_title('Countries who completed the assessment (by rank)', size=16)
# The row with index label 159 is dropped before plotting.
# NOTE(review): presumably Antarctica - verify; a hard-coded label is fragile.
# The trailing sns.set() switches back to the default seaborn theme.
world.drop(159).plot(column='rank', cmap='Blues', linewidth=0.8, ax=ax, edgecolor='0.2'); sns.set()
plt.box(on=None)

# Other colormap names noted as alternatives to 'Blues':
# 'twilight',
# 'viridis', 'viridis_r', 'vlag', 'vlag_r', 'winter', 'winter_r'
# 'cividis', 'cividis_r', 'cool', 'cool_r', 'coolwarm', 'coolwarm_r', 'copper', 'copper_r', 'cubehelix',

In [None]:
# Scoring key (from former research).
# Every trait has items numbered 1..10. The numbers listed below are the
# negatively keyed items, which subtract from the trait; all remaining items
# are positively keyed and add to it.
negative_items = {
    'EXT': (2, 4, 6, 8, 10),   # Extroversion
    'EST': (2, 4),             # Neuroticism
    'AGR': (1, 3, 5, 7),       # Agreeableness
    'CSN': (2, 4, 6, 8),       # Conscientiousness
    'OPN': (2, 4, 6),          # Openness
}

pos_questions = []  # positive questions adding to the trait
neg_questions = []  # negative (negating) questions subtracting from the trait
for prefix, reversed_numbers in negative_items.items():
    for number in range(1, 11):
        target = neg_questions if number in reversed_numbers else pos_questions
        target.append(f'{prefix}{number}')

# Re-centre the 1..5 answers on 0; negatively keyed items get the sign flipped.
forward_key = {1: -2, 2: -1, 3: 0, 4: 1, 5: 2}
reverse_key = {answer: -score for answer, score in forward_key.items()}
general[pos_questions] = general[pos_questions].replace(forward_key)
general[neg_questions] = general[neg_questions].replace(reverse_key)

# Keep only the question columns, ordered alphabetically by name.
cols = pos_questions + neg_questions
general = general[sorted(cols)]
general.head()

In [None]:
# Aggregate the item scores into one total per trait.
traits = ['EXT', 'EST', 'AGR', 'CSN', 'OPN']
#trait_labels = ['Extroversion', 'Neuroticism', 'Agreeableness', 'Conscientiousness', 'Openness']
for trait in traits:
    # Match only the item columns "<trait><number>". A plain substring test
    # would also match the aggregate column created below, so re-running the
    # cell would add the previous total into the new one. Columns with a
    # suffix such as '_E' are excluded by the same digit check.
    trait_cols = sorted(
        col for col in general.columns
        if col.startswith(trait) and col[len(trait):].isdigit()
    )
    general[trait] = general[trait_cols].sum(axis=1)
general[traits].head(10)

# TODO(review): original note - "add neuroticism --> if added shows 0 overall;
# issues here --> add manually". Confirm whether the EST total is correct.

In [None]:
# Per-country bar chart (work in progress).
# NOTE(review): this cell looks adapted from another notebook and does not
# match the data built in the cells before it:
#   - the column used earlier is spelled 'country', and `general` was later
#     reduced to the question columns only, so 'Country' is not available;
#   - no 'Description' column is created anywhere in the visible cells;
#   - `.iplot` is not a pandas method (presumably from cufflinks, which is not
#     imported in the visible cells);
#   - the title refers to revenue and products, not to personality scores.
bycountry= general[general['Country']][['AGR1']]
bycountry.pivot_table(index='Country', columns='Description', aggfunc='sum').iplot(kind='bar',
              title='Revenue per Country per Products')

In [None]:
# Celebrities Big 5 results.
# Read with the default separator, from a path relative to the notebook's
# working directory.
celebrities = pd.read_csv('../data/datatwitter.csv')

In [None]:
# Explore the celebrity data: column dtypes and non-null counts.
celebrities.info()

In [None]:
# Cleaning - celebrities.
# A 0 is treated as a missing value: it is converted to NaN and every row
# containing at least one NaN is dropped.
# NOTE(review): confirm that 0 is never a legitimate score in this data set.
# The row counts are taken from `celebrities` itself; measuring `general`
# here would always report 0 removed rows.
start_rows2 = len(celebrities)
celebrities = celebrities.replace(0, np.nan).dropna(axis=0).reset_index(drop=True)
remove_rows2 = start_rows2 - len(celebrities)
print('Removed', remove_rows2, 'rows that had incomplete pieces of data.')
print('This was', (remove_rows2 / start_rows2 * 100), '% of the total data.')

In [None]:
# General population: keep only the five aggregated trait totals and give
# them a *_GEN suffix so they can be told apart from the celebrity columns.
# Traits: Extroversion, Neuroticism, Agreeableness, Conscientiousness, Openness
general_to_common = {
    'EXT': 'EXT_GEN',
    'EST': 'NEU_GEN',
    'AGR': 'AGR_GEN',
    'CSN': 'CSN_GEN',
    'OPN': 'OPN_GEN',
}
generaltrait = general[traits].rename(columns=general_to_common)
generaltrait.head()

In [None]:
# Work on a copy: a plain assignment would only alias `generaltrait`, so the
# helper column added below would also appear in `generaltrait` and end up in
# every later mean / std / correlation computed on it.
generaltrait1 = generaltrait.copy()
generaltrait1['index_col'] = generaltrait1.index
# Mean and standard deviation of the extroversion total per index value.
# A list keeps the order of the output columns deterministic.
# NOTE(review): each index value identifies a single row, so every group has
# one element - confirm whether an overall mean/std was intended instead.
generaltrait1.pivot_table(index=['index_col'], values=['EXT_GEN'], aggfunc=['mean', 'std'])

In [None]:
# Celebrities: rename the two-letter trait columns to the same prefixes used
# for the general population, with a *_CEL suffix.
celebrity_to_common = {
    'op': 'OPN_CEL',
    'ex': 'EXT_CEL',
    'ne': 'NEU_CEL',
    'ag': 'AGR_CEL',
    'co': 'CSN_CEL',
}
celebrities = celebrities.rename(columns=celebrity_to_common)
celebrities.head()

In [None]:
# Overall trait visualization, general vs celebrities: Extroversion.
# Side-by-side variant, kept for reference:
#fig, ax = plt.subplots(1, 2)
#sns.distplot(generaltrait['EXT_GEN'], ax=ax[0])
#sns.distplot(celebrities['EXT_CEL'], ax=ax[1])
#fig.show()
# Distribution of the general-population extroversion totals.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases -
# consider histplot/displot when upgrading.
sns.distplot(generaltrait['EXT_GEN'])

In [None]:
# Overall trait visualization, general vs celebrities: Extroversion.
# Distribution of the celebrity extroversion scores.
sns.distplot(celebrities['EXT_CEL'])

In [None]:
# Overall trait visualization, general vs celebrities: Neuroticism.
# Distribution of the general-population neuroticism totals.
sns.distplot(generaltrait['NEU_GEN'])

In [None]:
# Distribution of the celebrity neuroticism scores.
sns.distplot(celebrities['NEU_CEL'])

In [None]:
# Overall trait visualization, general vs celebrities: Agreeableness.
# Distribution of the general-population agreeableness totals.
sns.distplot(generaltrait['AGR_GEN'])

In [None]:
# Distribution of the celebrity agreeableness scores.
sns.distplot(celebrities['AGR_CEL'])

In [None]:
# Overall trait visualization, general vs celebrities: Conscientiousness.
# Distribution of the general-population conscientiousness totals.
sns.distplot(generaltrait['CSN_GEN'])
# Overlaying the celebrity distribution on the same axes was tried and left out:
#sns.distplot(celebrities['CSN_CEL'])

In [None]:
# Distribution of the celebrity conscientiousness scores.
sns.distplot(celebrities['CSN_CEL'])

In [None]:
# Overall trait visualization, general vs celebrities: Openness.
# Distribution of the general-population openness totals.
sns.distplot(generaltrait['OPN_GEN'])

In [None]:
# Distribution of the celebrity openness scores.
sns.distplot(celebrities['OPN_CEL'])

In [None]:
# Correlation visualization - general population - matrix.
sns.set(style="white")
# Pairwise correlation between the columns of `generaltrait`.
corr = generaltrait.corr()

# Boolean mask hiding the upper triangle (the matrix is symmetric).
# The builtin `bool` is used because the `np.bool` alias was removed from
# NumPy in release 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and a square aspect ratio; the colour scale
# is capped at 0.3 while the annotations still show the real coefficients.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True)

In [None]:
# Correlation visualization - celebrities - matrix.
sns.set(style="white")
# Pairwise correlation between the numeric columns only. Recent pandas
# versions raise on non-numeric columns instead of silently skipping them.
# TODO: drop the columns that are not trait scores before correlating.
corr = celebrities.select_dtypes(include='number').corr()

# Boolean mask hiding the upper triangle (the matrix is symmetric).
# The builtin `bool` is used because the `np.bool` alias was removed from
# NumPy in release 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and a square aspect ratio; the colour scale
# is capped at 0.3 while the annotations still show the real coefficients.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True)

In [None]:
#Joining dataframes 
#total = generaltrait.append(celebrities, sort=False)
#total.head()
#generaltrait.corrwith(celebrities, axis = 1) 
#different lengths; won't work

In [None]:
# Joining dataframes - step 1: one-row frames of column means.
# General population: mean of every column, stored as a single row labelled
# with the last index value of `generaltrait`.
generalmeans = pd.DataFrame(generaltrait.mean().to_dict(), index=[generaltrait.index.values[-1]])
#generalmeans.head()
#new_df.rename(columns=lambda c: "mean_"+str(c))
# Celebrities: same layout. numeric_only=True restricts the mean to numeric
# columns, because recent pandas versions raise on non-numeric ones.
celebritiesmeans = pd.DataFrame(
    celebrities.mean(numeric_only=True).to_dict(),
    index=[celebrities.index.values[-1]],
)
#celebritiesmeans.head()
# Joining into one frame (not active yet):
#total = generalmeans.append(celebritiesmeans, sort=False)
#total = pd.concat([generalmeans, celebritiesmeans], ignore_index=True)
#total.head()

In [None]:
# Column-wise mean of the general-population frame.
# Earlier attempts at a renamed one-row frame, kept for reference:
#generalmeans = pd.DataFrame(generaltrait.mean().to_dict(),index=[generaltrait.index.values[-1]])
#generalmeans.rename({'OPN_CEL', 'ex': 'EXT_CEL', 'ne':'NEU_CEL', 'ag':'AGR_CEL', 'co': 'CSN_CEL'}, axis=1, inplace=True)
generaltrait.mean(axis = 0)
#generalmeans.columns = ['EXT', 'NEU', 'AGR', 'CSN', 'OPN', 'NE']
#generalmeans.head()


In [None]:
# Check: a frame with the same row labels as `generaltrait` in which every
# row repeats the column means.
#generaltrait.mean(axis = 0)
trait_means = generaltrait.mean(axis=0).to_dict()
generalmeans2 = pd.DataFrame(trait_means, index=generaltrait.index.values)
generalmeans2.head()

In [None]:
# Column-wise standard deviation of the general-population frame.
generaltrait.std(axis = 0)

In [None]:
# Frame with the same row labels as `celebrities` in which every row repeats
# the column means. numeric_only=True restricts the mean to numeric columns,
# because recent pandas versions raise on non-numeric ones.
celebritiesmeans = pd.DataFrame(
    celebrities.mean(axis=0, numeric_only=True).to_dict(),
    index=celebrities.index.values,
)
#celebritiesmeans.columns = ['OPN','CSN', 'EXT', 'AGR','NEU', 'x', 'y']
celebritiesmeans.head()
#generalmeans.corrwith(celebritiesmeans, axis = 1)
# TODO: need to recount the scores

In [None]:
# Column-wise mean of the celebrity frame, numeric columns only (recent
# pandas versions raise on non-numeric columns otherwise).
celebrities.mean(axis=0, numeric_only=True)

In [None]:
# Column-wise standard deviation of the celebrity frame, numeric columns only
# (recent pandas versions raise on non-numeric columns otherwise).
celebrities.std(axis=0, numeric_only=True)

In [None]:
#Correlation testing - correlation matrix 
#generalmeans.corrwith(celebritiesmeans, axis = 0) 

In [None]:
# Extroversion: correlation between the general-population totals and the
# celebrity scores.
# NOTE(review): Series.corr pairs the two series by row label, and the two
# frames come from different files - confirm that pairing rows this way is
# meaningful before reporting the coefficient.
EXT1= generaltrait["EXT_GEN"]
EXT2= celebrities["EXT_CEL"]
EXTcorrelation = EXT1.corr(EXT2)
print('EXT Correlation:', EXTcorrelation)

In [None]:
# Openness: correlation between the general-population totals and the
# celebrity scores.
# NOTE(review): Series.corr pairs the two series by row label, and the two
# frames come from different files - confirm that pairing rows this way is
# meaningful before reporting the coefficient.
OPN1= generaltrait["OPN_GEN"]
OPN2= celebrities["OPN_CEL"]
OPNcorrelation = OPN1.corr(OPN2)
print('OPN Correlation:', OPNcorrelation)

In [None]:
# Agreeableness: correlation between the general-population totals and the
# celebrity scores.
# NOTE(review): Series.corr pairs the two series by row label, and the two
# frames come from different files - confirm that pairing rows this way is
# meaningful before reporting the coefficient.
AGR1= generaltrait["AGR_GEN"]
AGR2= celebrities["AGR_CEL"]
AGRcorrelation = AGR1.corr(AGR2)
print('AGR Correlation:', AGRcorrelation)

In [None]:
# Conscientiousness: correlation between the general-population totals and
# the celebrity scores.
# NOTE(review): Series.corr pairs the two series by row label, and the two
# frames come from different files - confirm that pairing rows this way is
# meaningful before reporting the coefficient.
CSN1= generaltrait["CSN_GEN"]
CSN2= celebrities["CSN_CEL"]
CSNcorrelation = CSN1.corr(CSN2)
print('CSN Correlation:', CSNcorrelation)

In [None]:
# Neuroticism: correlation between the general-population totals and the
# celebrity scores.
# NOTE(review): Series.corr pairs the two series by row label, and the two
# frames come from different files - confirm that pairing rows this way is
# meaningful before reporting the coefficient.
NEU1= generaltrait["NEU_GEN"]
NEU2= celebrities["NEU_CEL"]
NEUcorrelation = NEU1.corr(NEU2)
print('NEU Correlation:', NEUcorrelation)

In [None]:
# Hypothesis testing - choice of test
# Type of values: ordinal (answers on a 1 - 3 - 5 scale)
# Purpose: examining differences between two populations
# Samples: independent
# Test used: Mann-Whitney U. It is non-parametric, whereas the t-test is
# parametric.
# The alternative is passed explicitly: H1 is non-directional, and the default
# of `alternative` differs between SciPy versions, which changes the p-value.
# NOTE(review): the general-population values are sums of re-centred item
# answers - confirm the celebrity scores are on a comparable scale, otherwise
# the test mostly measures the difference in scale.
from scipy.stats import mannwhitneyu

EXTMWT = mannwhitneyu(generaltrait["EXT_GEN"], celebrities["EXT_CEL"], alternative='two-sided')
print('Extroversion MW:', EXTMWT)
OPNMWT = mannwhitneyu(generaltrait["OPN_GEN"], celebrities["OPN_CEL"], alternative='two-sided')
print('Openess MW:', OPNMWT)
AGRMWT = mannwhitneyu(generaltrait["AGR_GEN"], celebrities["AGR_CEL"], alternative='two-sided')
print('Agreeableness MW:', AGRMWT)
CSNMWT = mannwhitneyu(generaltrait["CSN_GEN"], celebrities["CSN_CEL"], alternative='two-sided')
print('Conscentiousness MW:', CSNMWT)
NEUMWT = mannwhitneyu(generaltrait["NEU_GEN"], celebrities["NEU_CEL"], alternative='two-sided')
print('Neuroticism MW:', NEUMWT)

In [None]:
from ipywidgets import interact

In [None]:
# The option values are passed as sorted plain lists so that each argument is
# rendered as a selection widget.
@interact(EXTGEN1=sorted(generaltrait['EXT_GEN'].unique().tolist()),
          EXTCEB1=sorted(celebrities['EXT_CEL'].unique().tolist()))
def linechart(EXTGEN1, EXTCEB1):
    """Plot the summed openness totals of the selected extroversion group.

    Args:
        EXTGEN1: extroversion total selecting the general-population rows.
        EXTCEB1: celebrity extroversion score chosen in the widget. It is
            accepted so the widget keeps both selectors, but it is not used
            for the plot yet: the two frames are not row-aligned, so a
            combined row filter would be meaningless.
            TODO(review): decide how the celebrity selection should be shown.
    """
    # Compare against the selected value; combining the raw score columns
    # with `&` does not produce a row filter.
    selected = generaltrait[generaltrait['EXT_GEN'] == EXTGEN1]
    # Select the column before aggregating: after groupby('OPN_GEN').agg(...)
    # the grouping column is the index and can no longer be picked by name.
    grouped = selected.groupby('OPN_GEN')['OPN_GEN'].sum()
    # pandas' own plotting is used; `.iplot` is not a pandas method.
    grouped.plot(kind='line', title='xyz')