In [3]:
# If re-run
df = pd.read_pickle("../../pickles/dataframe_survey_2018-01-23_enriched.pickle")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26469 entries, 0 to 27112
Columns: 112 entries, url to is_t
dtypes: datetime64[ns](1), float64(71), int64(1), object(11), uint8(28)
memory usage: 17.9+ MB


In [26]:
import pandas as pd
import pickle
import re
from collections import defaultdict
from langdetect import detect
from langdetect import DetectorFactory
DetectorFactory.seed = 0 # To ensure reproducible language detection results
import string
import numpy as np
%matplotlib inline

In [27]:
df = pd.read_pickle("../../pickles/dataframe_survey_2018-01-23_cleaned.pickle")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25438 entries, 1 to 27771
Data columns (total 15 columns):
url           25438 non-null object
typealyzer    25438 non-null object
actual        25438 non-null object
e             25438 non-null float64
s             25438 non-null float64
t             25438 non-null float64
sntf_s        25438 non-null float64
sntf_n        25438 non-null float64
sntf_t        25438 non-null float64
sntf_f        25438 non-null float64
date          25438 non-null object
text          25438 non-null object
tokens        25438 non-null int64
domain        25438 non-null object
lang          23420 non-null object
dtypes: float64(7), int64(1), object(7)
memory usage: 3.1+ MB


# Create datetime column

In [28]:
df['datetime'] =  pd.to_datetime(df['date'], format='%Y%m%d %H:%M:%S')
print("Max datetime: {}, Min datetime: {}".format(df.datetime.max(), df.datetime.min()))

Max datetime: 2018-01-22 02:01:03, Min datetime: 2012-08-28 08:08:11


# Carl Jungs functions and attitudes

In [29]:
len(df)

25438

In [30]:
funcs = {
    "t":["ESTJ","ENTJ","ISTP","INTP"],
    "f":["ESFJ","ENFJ","ISFP","INFP"],
    "n":["ENTP","ENFP","INTJ","INFJ"],
    "s":["ESTP","ESFP","ISTJ","ISFJ"]
}

atts = {
    "e":["ESTJ","ENTJ","ESFJ","ENFJ","ENTP","ENFP","ESTP","ESFP"],
    "i":["ISTP","INTP","ISFP","INFP","INTJ","INFJ","ISTJ","ISFJ"]
}

funcatts = {
    "te":["ESTJ","ENTJ"],
    "ti":["ISTP","INTP"],
    "fe":["ESFJ","ENFJ"],
    "fi":["ISFP","INFP"],
    "ne":["ENTP","ENFP"],
    "ni":["INTJ","INFJ"],
    "se":["ESTP","ESFP"],
    "si":["ISTJ","ISFJ"]
}

In [33]:
funclist = []
attlist = []
funcattlist = []

for ix, row in df.iterrows():
    #print(ix, end="\r")
    # functions
    key_found = None
    for key_no, key in enumerate(funcs):
        if row["actual"] in funcs[key]:
            funclist.append(key)
            key_found = 1
        elif key_no == 3 and not key_found:
            funclist.append(np.nan)
    
    # attitudes
    key_found = None
    for key_no, key in enumerate(atts):
        if row["actual"] in atts[key]:
            attlist.append(key)
            key_found = 1
        elif key_no == 1 and not key_found:
            attlist.append(np.nan)
            
    # functions with attitudes
    key_found = None
    for key_no, key in enumerate(funcatts):
        if row["actual"] in funcatts[key]:
            funcattlist.append(key)
            key_found = 1
        elif key_no == 7 and not key_found:
            funcattlist.append(np.nan)
        

fs = pd.Series(funclist)
df["func"] = fs
ats = pd.Series(attlist)
df["att"] = ats
fas = pd.Series(funcattlist)
df["funcatt"] = fas
print("len(df): {}".format(len(df)))
print("len(fs): {}".format(len(fs)))
print("len(ats): {}".format(len(ats)))
print("len(fas): {}".format(len(fas)))

len(df): 25438
len(fs): 25438
len(ats): 25438
len(fas): 25438


# Drop all "I don't know" rows

In [34]:
len(df[["actual","func"]][pd.isnull(df.func)]) # how many "I don't know" responses do we have?

2519

In [35]:
df = df.dropna()

In [36]:
len(df)

22919

## Add derived two-letter type, "temperament" to samples

In [37]:
# Add derived two-letter type. "temperament"
all_two_letters = df.actual.str.extract("\w(\w\w)\w")
all_two_letters = all_two_letters.replace("no",np.nan)
all_two_letters = all_two_letters.str.lower()
df["actual_temp"] = all_two_letters

  from ipykernel import kernelapp as app


# Create 1-hot categorical dummies for Jungian categories

## Attitudes (E, I)

In [38]:
att_cat = df.att.astype("category")
att_dummies = pd.get_dummies(att_cat)
att_dummies = att_dummies.rename({"e":"is_e", "i":"is_i"}, axis="columns")
df = pd.concat([df,att_dummies], axis=1)
att_dummies.head(5)

Unnamed: 0,is_e,is_i
1,0,1
2,0,1
3,1,0
5,0,1
10,0,1


## Functions (S, N, T, F)

In [39]:
func_cat = df.func.astype("category")
func_dummies = pd.get_dummies(func_cat)
func_dummies = func_dummies.rename({"f":"is_f", 
                                    "n":"is_n", 
                                    "s":"is_s", 
                                    "t":"is_t"}, axis="columns")
df = pd.concat([df,func_dummies], axis=1)
func_dummies.head(5)

Unnamed: 0,is_f,is_n,is_s,is_t
1,1,0,0,0
2,0,0,0,1
3,1,0,0,0
5,1,0,0,0
10,0,0,0,1


## Functions with attitudes (Si, Se, Ni, Ne, Ti, Te, Fi, Fe )

In [40]:
funcatt_cat = df.funcatt.astype("category")
funcatt_dummies = pd.get_dummies(funcatt_cat)
funcatt_dummies = funcatt_dummies.rename({"fe":"is_fe", 
                                          "fi":"is_fi",
                                          "ne":"is_ne",
                                          "ni":"is_ni",
                                          "se":"is_se",
                                          "si":"is_si",
                                          "te":"is_te",
                                          "ti":"is_ti"
                                            }, axis="columns")
df = pd.concat([df,funcatt_dummies], axis=1)
funcatt_dummies.head(5)

Unnamed: 0,is_fe,is_fi,is_ne,is_ni,is_se,is_si,is_te,is_ti
1,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1
3,1,0,0,0,0,0,0,0
5,0,1,0,0,0,0,0,0
10,0,0,0,0,0,0,0,1


# Jung-Meyers types (ENTJ, ISFP etc.)

In [41]:
type_cat = df.actual.astype("category")
type_dummies = pd.get_dummies(type_cat)
type_dummies = type_dummies.rename({"INFJ":"is_infj", 
                                    "INFP":"is_infp",
                                    "INTP":"is_intp",
                                    "ENFJ":"is_enfj",
                                    "ENFP":"is_enfp",
                                    "INTJ":"is_intj",
                                    "ENTP":"is_entp",
                                    "ISTJ":"is_istj",
                                    "ISFJ":"is_isfj",
                                    "ESFP":"is_esfp",
                                    "ISFP":"is_isfp",
                                    "ISTP":"is_istp",
                                    "ENTJ":"is_entj",
                                    "ESFJ":"is_esfj",
                                    "ESTJ":"is_estj",
                                    "ESTP":"is_estp",
                                    "I don't know":"is_unknown"
                                    }, axis="columns")
df = pd.concat([df,type_dummies], axis=1)
type_dummies.head(5)

Unnamed: 0,is_enfj,is_enfp,is_entj,is_entp,is_esfj,is_esfp,is_estj,is_estp,is_unknown,is_infj,is_infp,is_intj,is_intp,is_isfj,is_isfp,is_istj,is_istp
1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


# James Pennebakers LIWC 2007

[The Development of LIWC 2007](http://www.liwc.net/LIWC2007LanguageManual.pdf)

[Personality Detection by Analysis of Twitter Profiles, Mehul Smriti Raje, Aakarsh Singh](https://books.google.se/books?id=s9IxDwAAQBAJ&lpg=PA675&ots=KVsRfV0yw4&dq=liwc%20jung&pg=PA670#v=onepage&q=liwc%20jung&f=false)

[The Development of LIWC 2015](https://repositories.lib.utexas.edu/bitstream/handle/2152/31333/LIWC2015_LanguageManual.pdf)

[Such Stuff as Dreams Are Made On; Dream Language, LIWC Norms and Personality Correlates](https://www.researchgate.net/publication/316109197_Such_Stuff_as_Dreams_Are_Made_On_Dream_Language_LIWC_Norms_Personality_Correlates)

In [42]:
cats = pickle.load(open("../../pickles/liwc_2007_cats_dict.pickle","rb"))
words = pickle.load(open("../../pickles/liwc_2007_words_dict.pickle","rb"))
    
cats_names = []
for key in cats.keys():
    cats_names.append(cats[key])

In [43]:
def separate_punctuation_with_whitespace(original_string):
   return re.sub(r"([\w/'+$\s-]+|[^\w/'+$\s-]+)\s*", r"\1 ", original_string) # todo: not perfect - leaves "),"


def liwc_analysis_on_english_string(original_string, words):
    """
    Takes a string and returns word frequencies according to (most of) LIWC 2007.
    
    :param original_string: string representing the input text with no transformations.
    :param words: dictionary containing categories as keys with 0.0 as values plus "WC" = word count.
    :return: dictionary containg LIWC 2007 categories minus the psychological summary variables. 
    """ 
    liwc = dict.fromkeys(cats_names, 0.0)
    punct_token_text = separate_punctuation_with_whitespace(original_string)
    liwc["WC"] = len(punct_token_text.split()) # TODO: implement proper tokenization before word count

    for word in words:
        regex_word = re.sub(r"\*",r"\w+",word) # e.g. 'cousin*' in .dic file transformed into 'cousin\w+'
        word_patt = re.compile(regex_word)
        
        if word_patt.search(original_string):
            matches = word_patt.findall(original_string)
            #print("word: {}".format(word))
            #print("cat numbers: {}".format(words[word]))
            try:
                for cat_no in words[word]:
                    if liwc.get(cats[cat_no]):
                        liwc[cats[cat_no]] += (len(matches) / liwc["WC"])   
                    else:
                        liwc[cats[cat_no]] = (len(matches) / liwc["WC"])
            except TypeError as e:
                print("TypeError with cat_no = {}\n{}".format(cat_no, e))
    
    return liwc

In [44]:
# Prepare storage for LIWC-results per row
liwcresults = defaultdict(list)
for cat in cats_names:
    liwcresults[cat] = []

In [45]:
# Warning, takes time on an 1,8 GHz Intel Core i5 with 8GB memory
for ix, row in df.iterrows():
    print("Current ix: {}".format(ix), end="\r")
    liwc = liwc_analysis_on_english_string(row["text"], words)
    for cat in cats_names:
        liwcresults[cat].append(liwc[cat])

# Add LIWC-results from memory storage as Pandas Series objects to DataFrame
for cat in cats_names:
    s = pd.Series(liwcresults[cat])
    df[cat] = s

print("Finished counting LIWC words!")

Finished counting LIWC words!


# Check all created columns

In [46]:
for column in df.columns:
    print(column)

url
typealyzer
actual
e
s
t
sntf_s
sntf_n
sntf_t
sntf_f
date
text
tokens
domain
lang
datetime
func
att
funcatt
actual_temp
is_e
is_i
is_f
is_n
is_s
is_t
is_fe
is_fi
is_ne
is_ni
is_se
is_si
is_te
is_ti
is_enfj
is_enfp
is_entj
is_entp
is_esfj
is_esfp
is_estj
is_estp
is_unknown
is_infj
is_infp
is_intj
is_intp
is_isfj
is_isfp
is_istj
is_istp
negate
ppron
nonfl
i
relativ
percept
quant
affect
shehe
achieve
bio
leisure
conj
motion
posemo
adverb
home
future
negemo
number
inhib
humans
pronoun
excl
space
tentat
see
past
anx
family
present
health
verb
certain
anger
preps
swear
ingest
discrep
friend
relig
time
cause
article
body
social
assent
work
sexual
insight
ipron
filler
death
funct
sad
you
cogmech
auxverb
they
incl
money
feel
we
hear


# Sanity check transformations of myers-briggs types to its function parts

In [55]:
df[["actual","actual_temp","func","funcatt","att"]].head(25)

Unnamed: 0,actual,actual_temp,func,funcatt,att
1,INFJ,nf,f,fi,i
2,INFP,nf,t,ti,i
3,INTP,nt,f,fe,e
5,ENFJ,nf,f,fi,i
10,INFP,nf,t,ti,i
11,I don't know,,n,ni,i
14,INFJ,nf,t,ti,i
15,ENTP,nt,f,fi,i
16,INTP,nt,s,si,i
17,INTP,nt,f,fe,e


### Store enriched DataFrame to pickle and semicolon-separated CSV

In [48]:
df.to_pickle("../../pickles/dataframe_survey_2018-01-23_enriched.pickle")
df.to_csv("../../data/processed/dataframe_survey_2018-01-23_enriched.csv",sep=";")
print("Finished storing data.")

Finished storing data.


In [49]:
df = pd.read_pickle("../../pickles/dataframe_survey_2018-01-23_enriched.pickle")

# Prepare simplified dataset (no NaNs, english blogs only) for public release

In [50]:
en_df = df[df.lang == "en"]
en_blogs_df = en_df[(en_df.domain == "wordpress") | (en_df.domain == "blogspot") | (en_df.domain == "tumblr")]

In [51]:
en_blogs_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22588 entries, 1 to 25437
Columns: 115 entries, url to hear
dtypes: datetime64[ns](1), float64(71), int64(1), object(11), uint8(31)
memory usage: 15.3+ MB


In [52]:
en_blogs_funcs_df = en_blogs_df[["text", "func", "funcatt"]]
en_blogs_funcs_df.columns = ["text", "base_function", "directed_function"]
len(en_blogs_funcs_df)

22588

In [53]:
en_blogs_funcs_df.to_csv("../../data/processed/blog_texts_and_cognitive_function.csv", sep=";")