In [None]:
#import modules
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

#read the tab-separated survey file and preview its first ten rows
DATA_FILE = "data-final.csv"
data = pd.read_csv(DATA_FILE, sep="\t")
data.head(10)

In [None]:
#print a summary of the raw frame: column dtypes, non-null counts and memory usage
data.info()

In [None]:
#descriptive statistics (count, mean, std, min, quartiles, max) of the raw frame
data.describe()

In [None]:
#number of rows in the raw frame
data.shape[0]

In [None]:
#total number of missing cells over all rows and columns
data.isna().to_numpy().sum()

In [None]:
#keep only the rows without any missing value; `data` itself stays untouched

#dropna() returns a new frame with the original row labels; the explicit
#copy() makes df a fully independent frame that can be modified later on
df = data.dropna().copy()

len(df)

In [None]:
#sanity check: True if any missing cell is left in df
df.isna().to_numpy().any()

In [None]:
#plot countries with at least 10.000 participants

#name the counts column explicitly: pandas >= 2.0 labels the result of
#value_counts() 'count' (and names its index after the source column), so
#relying on an implicit 'country' column raises KeyError on those versions.
#rename()/rename_axis() give the same layout on every pandas version:
#index = country code (unnamed), single column 'country' = number of rows
countries = (
    df['country']
    .value_counts()
    .rename('country')
    .rename_axis(None)
    .to_frame()
)
countries_10k = countries[countries['country'] >= 10000]

#pass both axes as explicit vectors so the plot does not depend on how
#seaborn resolves column names against the frame
ax = sns.barplot(x=countries_10k.index, y=countries_10k['country'])
ax.set(xlabel='country', ylabel='participants',
       title='Countries with at least 10,000 participants');

In [None]:
#group questions with column name 

#extroversion items: column name -> statement text (used as plot title)
ext_questions = dict(
    EXT1='I am the life of the party',
    EXT2='I dont talk a lot',
    EXT3='I feel comfortable around people',
    EXT4='I keep in the background',
    EXT5='I start conversations',
    EXT6='I have little to say',
    EXT7='I talk to a lot of different people at parties',
    EXT8='I dont like to draw attention to myself',
    EXT9='I dont mind being the center of attention',
    EXT10='I am quiet around strangers',
)

#neuroticism items: column name -> statement text (used as plot title)
est_questions = dict(
    EST1='I get stressed out easily',
    EST2='I am relaxed most of the time',
    EST3='I worry about things',
    EST4='I seldom feel blue',
    EST5='I am easily disturbed',
    EST6='I get upset easily',
    EST7='I change my mood a lot',
    EST8='I have frequent mood swings',
    EST9='I get irritated easily',
    EST10='I often feel blue',
)

#agreeableness items: column name -> statement text (used as plot title)
agr_questions = dict(
    AGR1='I feel little concern for others',
    AGR2='I am interested in people',
    AGR3='I insult people',
    AGR4='I sympathize with others feelings',
    AGR5='I am not interested in other peoples problems',
    AGR6='I have a soft heart',
    AGR7='I am not really interested in others',
    AGR8='I take time out for others',
    AGR9='I feel others emotions',
    AGR10='I make people feel at ease',
)

#conscientiousness items: column name -> statement text (used as plot title)
csn_questions = dict(
    CSN1='I am always prepared',
    CSN2='I leave my belongings around',
    CSN3='I pay attention to details',
    CSN4='I make a mess of things',
    CSN5='I get chores done right away',
    CSN6='I often forget to put things back in their proper place',
    CSN7='I like order',
    CSN8='I shirk my duties',
    CSN9='I follow a schedule',
    CSN10='I am exacting in my work',
)

#openness items: column name -> statement text (used as plot title)
opn_questions = dict(
    OPN1='I have a rich vocabulary',
    OPN2='I have difficulty understanding abstract ideas',
    OPN3='I have a vivid imagination',
    OPN4='I am not interested in abstract ideas',
    OPN5='I have excellent ideas',
    OPN6='I do not have a good imagination',
    OPN7='I am quick to understand things',
    OPN8='I use difficult words',
    OPN9='I spend time reflecting on things',
    OPN10='I am full of ideas',
)

In [None]:
#collect, per trait prefix, the column names of `data` that start with it

#NOTE(review): prefix matching keeps every column whose name starts with the
#trait code, in file order — confirm no columns other than the ten items match
EXT_col, EST_col, AGR_col, CSN_col, OPN_col = (
    [name for name in data.columns if name.startswith(prefix)]
    for prefix in ('EXT', 'EST', 'AGR', 'CSN', 'OPN')
)

In [None]:
#histogram of the answers to each of the ten extroversion questions

fig = plt.figure(figsize=(40, 60))
for slot in range(10):
    column = EXT_col[slot]
    #10x5 grid filled row by row; only its first ten cells are used
    ax = fig.add_subplot(10, 5, slot + 1)
    ax.hist(df[column], color="red", bins=20)
    ax.set_title(ext_questions[column], fontsize=20)

In [None]:
#histogram of the answers to each of the ten neuroticism questions

fig = plt.figure(figsize=(40, 60))
for slot in range(10):
    column = EST_col[slot]
    #10x5 grid filled row by row; only its first ten cells are used
    ax = fig.add_subplot(10, 5, slot + 1)
    ax.hist(df[column], color="blue", bins=20)
    ax.set_title(est_questions[column], fontsize=20)

In [None]:
#histogram of the answers to each of the ten agreeableness questions

fig = plt.figure(figsize=(40, 60))
for slot in range(10):
    column = AGR_col[slot]
    #10x5 grid filled row by row; only its first ten cells are used
    ax = fig.add_subplot(10, 5, slot + 1)
    ax.hist(df[column], color="green", bins=20)
    ax.set_title(agr_questions[column], fontsize=20)

In [None]:
#histogram of the answers to each of the ten conscientiousness questions

fig = plt.figure(figsize=(40, 60))
for slot in range(10):
    column = CSN_col[slot]
    #10x5 grid filled row by row; only its first ten cells are used
    ax = fig.add_subplot(10, 5, slot + 1)
    ax.hist(df[column], color="orange", bins=20)
    ax.set_title(csn_questions[column], fontsize=20)

In [None]:
#histogram of the answers to each of the ten openness questions

fig = plt.figure(figsize=(40, 60))
for slot in range(10):
    column = OPN_col[slot]
    #10x5 grid filled row by row; only its first ten cells are used
    ax = fig.add_subplot(10, 5, slot + 1)
    ax.hist(df[column], color="black", bins=20)
    ax.set_title(opn_questions[column], fontsize=20)

In [None]:
#split the 50 items into positively and negatively keyed questions

#item numbers (1-10) of the negatively keyed questions per trait prefix;
#every other item of that trait counts as positively keyed
_negative_item_numbers = {
    'EXT': (2, 4, 6, 8, 10),
    'EST': (2, 4),
    'AGR': (1, 3, 5, 7),
    'CSN': (2, 4, 6, 8),
    'OPN': (2, 4, 6),
}

positiv_questions = [
    prefix + str(number)
    for prefix, negatives in _negative_item_numbers.items()
    for number in range(1, 11)
    if number not in negatives
]

negative_questions = [
    prefix + str(number)
    for prefix, negatives in _negative_item_numbers.items()
    for number in negatives
]

In [None]:
#reverse values of negative questions (6 - x maps 1<->5 and 2<->4, 3 stays 3)

#take the untouched answers from `data` rather than from df itself: with
#`df[...] = 6 - df[...]` every re-run of this cell flips the scores back,
#whereas this form gives the same result no matter how often it is run.
#df was built from `data` by dropping rows and keeps its row labels, so
#df.index selects exactly the matching rows of `data`.
#NOTE(review): 6 - x assumes answers lie in 1..5 — confirm how any other
#code (e.g. 0) should be treated
df.loc[:, negative_questions] = 6 - data.loc[df.index, negative_questions]
df.head(10)

In [None]:
from sklearn.cluster import KMeans

#cluster the respondents on the answer columns only
#NOTE(review): takes the first 50 columns by position — assumes these are
#exactly the questionnaire items; confirm against the file layout
#.copy() makes df_model an independent frame, so adding columns to it later
#does not write into a selection of df (no SettingWithCopyWarning)
df_model = df[df.columns.tolist()[:50]].copy()

#fixed seed and explicit n_init: without them the cluster labels change on
#every run and the number of initialisations depends on the installed
#scikit-learn version
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
km_fitted = kmeans.fit(df_model)

In [None]:
#attach the cluster label of every respondent as column 'Cluster'
#assign() returns a new frame instead of writing into df_model in place, so
#nothing is written into a frame that may still be tied to df (avoids
#SettingWithCopyWarning) and re-running the cell simply replaces the column
df_model = df_model.assign(Cluster=km_fitted.labels_)
df_model.head(10)

In [None]:
#number of respondents per cluster, largest cluster first
df_model['Cluster'].value_counts()