In [3]:
# If re-run
df = pd.read_pickle("../../pickles/dataframe_survey_2018-01-23_enriched.pickle")

In [6]:
df[["actual"][pd.isnull(df.func)]

27113            INFJ
27114            INTP
27115            INTP
27116            INFJ
27117            INFP
27118            INTJ
27119            ENFP
27120            INFJ
27121            INFP
27122            INFP
27123            INFP
27124            INFP
27125            INFP
27126            INFP
27127            INFP
27128            INFJ
27129            INTP
27130            INTP
27131            INFP
27132            ESTJ
27133            INFP
27134            INFJ
27135            INTJ
27136            INFP
27137            INTJ
27138            ESFP
27139            ISTJ
27140            INFJ
27141            ENTP
27142            ISFP
             ...     
27929            INFJ
27930    I don't know
27931            INFP
27932            ENTJ
27933            INTP
27934            INFJ
27935            ESTJ
27936            INTP
27937            INTJ
27938            ISFP
27939            ENFJ
27940            INTP
27941            INTP
27942            ENTP
27943     

In [4]:
import pandas as pd
import pickle
import re
from collections import defaultdict
from langdetect import detect
from langdetect import DetectorFactory
DetectorFactory.seed = 0 # To ensure reproducible language detection results

%matplotlib inline

In [69]:
df = pd.read_pickle("../../pickles/dataframe_survey_2018-01-23_cleaned.pickle")

In [71]:
df.sample(2).head()

Unnamed: 0,url,typealyzer,actual,e,s,t,sntf_s,sntf_n,sntf_t,sntf_f,date,text,tokens,domain
637,http://jaschmehl.wordpress.com,ENTP,I don't know,0.571364,0.432009,0.560149,0.246421,0.317213,0.235832,0.200534,20130220 05:02:44,"Mind of a Mouse | Easily Startled, Probably Pa...",104,wordpress
18953,http://alanaisreading.tumblr.com,INFP,INTJ,0.156309,0.170416,0.374652,0.176104,0.713201,0.038251,0.072443,20140814 04:08:08,Alana Is Reading Alana Is Reading American in ...,488,tumblr


# Add language classification result to DataFrame

In [72]:
langs = []
for ix, row in df.iterrows():
    print("current ix: {}".format(ix), end="\r")
    try:
        langs.append(detect(row["text"]))
    except:
        print("ix: {} tokens: {}\ntext{}".format(ix, row["tokens"], row["text"]))
df["lang"] = pd.Series(langs)
df.lang.value_counts()

current ix: 27958

en    27174
fr       54
da       40
no       28
de       28
sv       27
tl       16
nl       15
ca       15
es       15
ja       15
af       14
it       14
ro       13
et       13
so       12
cy       11
id       10
pt        9
ko        9
fi        7
sl        5
pl        5
el        5
lv        4
bn        4
lt        3
ar        2
sw        2
sk        2
uk        2
sq        2
tr        2
vi        2
cs        2
hr        1
ru        1
ta        1
mk        1
hi        1
Name: lang, dtype: int64

# Create datetime column

In [74]:
df['datetime'] =  pd.to_datetime(df['date'], format='%Y%m%d %H:%M:%S')
print("Max datetime: {}, Min datetime: {}".format(df.datetime.max(), df.datetime.min()))

Max datetime: 2018-01-22 05:01:47, Min datetime: 2012-08-28 08:08:11


# Carl Jungs functions and attitudes

In [84]:
funcs = {
    "t":["ESTJ","ENTJ","ISTP","INTP"],
    "f":["ESFJ","ENFJ","ISFP","INFP"],
    "n":["ENTP","ENFP","INTJ","INFJ"],
    "s":["ESTP","ESFP","ISTJ","ISFJ"]
}

atts = {
    "e":["ESTJ","ENTJ","ESFJ","ENFJ","ENTP","ENFP","ESTP","ESFP"],
    "i":["ISTP","INTP","ISFP","INFP","INTJ","INFJ","ISTJ","ISFJ"]
}

funcatts = {
    "te":["ESTJ","ENTJ"],
    "ti":["ISTP","INTP"],
    "fe":["ESFJ","ENFJ"],
    "fi":["ISFP","INFP"],
    "ne":["ENTP","ENFP"],
    "ni":["INTJ","INFJ"],
    "se":["ESTP","ESFP"],
    "si":["ISTJ","ISFJ"]
}

In [85]:
funclist = []
attlist = []
funcattlist = []

for ix, row in df.iterrows():
    # functions
    for key in funcs:
        if row["actual"] in funcs[key]:
            funclist.append(key)
    
    # attitudes
    for key in atts:
        if row["actual"] in atts[key]:
            attlist.append(key) 
            
    # functions with attitudes
    for key in funcatts:
        if row["actual"] in funcatts[key]:
            funcattlist.append(key) 

fs = pd.Series(funclist)
df["func"] = fs
ats = pd.Series(attlist)
df["att"] = ats
fas = pd.Series(funcattlist)
df["funcatt"] = fas

# Create 1-hot categorical dummies for Jungian categories

## Attitudes (E, I)

In [86]:
att_cat = df.att.astype("category")
att_dummies = pd.get_dummies(att_cat)
att_dummies = att_dummies.rename({"e":"is_e", "i":"is_i"}, axis="columns")
df = pd.concat([df,att_dummies], axis=1)
att_dummies.head(1)

Unnamed: 0,is_e,is_i
0,0,1


## Functions (S, N, T, F)

In [87]:
func_cat = df.func.astype("category")
func_dummies = pd.get_dummies(func_cat)
func_dummies = func_dummies.rename({"f":"is_f", 
                                    "n":"is_n", 
                                    "s":"is_s", 
                                    "t":"is_t"}, axis="columns")
df = pd.concat([df,func_dummies], axis=1)
func_dummies.head(1)

Unnamed: 0,is_f,is_n,is_s,is_t
0,0,1,0,0


## Functions with attitudes (Si, Se, Ni, Ne, Ti, Te, Fi, Fe )

In [88]:
funcatt_cat = df.funcatt.astype("category")
funcatt_dummies = pd.get_dummies(funcatt_cat)
funcatt_dummies = funcatt_dummies.rename({"fe":"is_fe", 
                                          "fi":"is_fi",
                                          "ne":"is_ne",
                                          "ni":"is_ni",
                                          "se":"is_se",
                                          "si":"is_si",
                                          "te":"is_te",
                                          "ti":"is_ti"
                                            }, axis="columns")
df = pd.concat([df,funcatt_dummies], axis=1)
funcatt_dummies.head(1)

Unnamed: 0,is_fe,is_fi,is_ne,is_ni,is_se,is_si,is_te,is_ti
0,0,0,0,1,0,0,0,0


# Jung-Meyers types (ENTJ, ISFP etc.)

In [89]:
type_cat = df.actual.astype("category")
type_dummies = pd.get_dummies(type_cat)
type_dummies = type_dummies.rename({"INFJ":"is_infj", 
                                    "INFP":"is_infp",
                                    "INTP":"is_intp",
                                    "ENFJ":"is_enfj",
                                    "ENFP":"is_enfp",
                                    "INTJ":"is_intj",
                                    "ENTP":"is_entp",
                                    "ISTJ":"is_istj",
                                    "ISFJ":"is_isfj",
                                    "ESFP":"is_esfp",
                                    "ISFP":"is_isfp",
                                    "ISTP":"is_istp",
                                    "ENTJ":"is_entj",
                                    "ESFJ":"is_esfj",
                                    "ESTJ":"is_estj",
                                    "ESTP":"is_estp",
                                    "I don't know":"is_unknown"
                                    }, axis="columns")
df = pd.concat([df,type_dummies], axis=1)
type_dummies.head(1)

Unnamed: 0,is_enfj,is_enfp,is_entj,is_entp,is_esfj,is_esfp,is_estj,is_estp,is_unknown,is_infj,is_infp,is_intj,is_intp,is_isfj,is_isfp,is_istj,is_istp
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


# James Pennebakers LIWC 2007

[The Development of LIWC 2007](http://www.liwc.net/LIWC2007LanguageManual.pdf)

[Personality Detection by Analysis of Twitter Profiles, Mehul Smriti Raje, Aakarsh Singh](https://books.google.se/books?id=s9IxDwAAQBAJ&lpg=PA675&ots=KVsRfV0yw4&dq=liwc%20jung&pg=PA670#v=onepage&q=liwc%20jung&f=false)

[The Development of LIWC 2015](https://repositories.lib.utexas.edu/bitstream/handle/2152/31333/LIWC2015_LanguageManual.pdf)

[Such Stuff as Dreams Are Made On; Dream Language, LIWC Norms and Personality Correlates](https://www.researchgate.net/publication/316109197_Such_Stuff_as_Dreams_Are_Made_On_Dream_Language_LIWC_Norms_Personality_Correlates)

In [90]:
cats = pickle.load(open("../../pickles/liwc_2007_cats_dict.pickle","rb"))
words = pickle.load(open("../../pickles/liwc_2007_words_dict.pickle","rb"))
    
cats_names = []
for key in cats.keys():
    cats_names.append(cats[key])

In [91]:
def separate_punctuation_with_whitespace(original_string):
   return re.sub(r"([\w/'+$\s-]+|[^\w/'+$\s-]+)\s*", r"\1 ", original_string) # todo: not perfect - leaves "),"


def liwc_analysis_on_english_string(original_string, words):
    """
    Takes a string and returns word frequencies according to (most of) LIWC 2007.
    
    :param original_string: string representing the input text with no transformations.
    :param words: dictionary containing categories as keys with 0.0 as values plus "WC" = word count.
    :return: dictionary containg LIWC 2007 categories minus the psychological summary variables. 
    """ 
    liwc = dict.fromkeys(cats_names, 0.0)
    punct_token_text = separate_punctuation_with_whitespace(original_string)
    liwc["WC"] = len(punct_token_text.split()) # TODO: implement proper tokenization before word count

    for word in words:
        regex_word = re.sub(r"\*",r"\w+",word) # e.g. 'cousin*' in .dic file transformed into 'cousin\w+'
        word_patt = re.compile(regex_word)
        
        if word_patt.search(original_string):
            matches = word_patt.findall(original_string)
            #print("word: {}".format(word))
            #print("cat numbers: {}".format(words[word]))
            try:
                for cat_no in words[word]:
                    if liwc.get(cats[cat_no]):
                        liwc[cats[cat_no]] += (len(matches) / liwc["WC"])   
                    else:
                        liwc[cats[cat_no]] = (len(matches) / liwc["WC"])
            except TypeError as e:
                print("TypeError with cat_no = {}\n{}".format(cat_no, e))
    
    return liwc

In [92]:
# Prepare storage for LIWC-results per row
liwcresults = defaultdict(list)
for cat in cats_names:
    liwcresults[cat] = []

In [93]:
# Warning, takes time on an 1,8 GHz Intel Core i5 with 8GB memory
for ix, row in df.iterrows():
    print("Current ix: {}".format(ix), end="\r")
    liwc = liwc_analysis_on_english_string(row["text"], words)
    for cat in cats_names:
        liwcresults[cat].append(liwc[cat])

# Add LIWC-results from memory storage as Pandas Series objects to DataFrame
for cat in cats_names:
    s = pd.Series(liwcresults[cat])
    df[cat] = s

print("Finished counting LIWC words!")

Finished counting LIWC words!


# Final check of all created columns

In [94]:
for column in df.columns:
    print(column)

url
typealyzer
actual
e
s
t
sntf_s
sntf_n
sntf_t
sntf_f
date
text
tokens
domain
lang
datetime
func
att
funcatt
is_e
is_i
is_f
is_n
is_s
is_t
is_fe
is_fi
is_ne
is_ni
is_se
is_si
is_te
is_ti
is_enfj
is_enfp
is_entj
is_entp
is_esfj
is_esfp
is_estj
is_estp
is_unknown
is_infj
is_infp
is_intj
is_intp
is_isfj
is_isfp
is_istj
is_istp
quant
anx
they
certain
i
money
past
funct
insight
shehe
feel
relig
preps
ipron
nonfl
incl
cogmech
motion
health
friend
sexual
adverb
leisure
discrep
cause
negemo
present
auxverb
space
filler
future
negate
relativ
home
excl
anger
hear
pronoun
death
article
inhib
family
affect
we
achieve
swear
ingest
body
verb
humans
sad
tentat
work
ppron
posemo
social
assent
bio
conj
see
percept
time
you
number


### Store enriched DataFrame to pickle and semicolon-separated CSV

In [95]:
df.to_pickle("../../pickles/dataframe_survey_2018-01-23_enriched.pickle")
df.to_csv("../../data/processed/dataframe_survey_2018-01-23_enriched.csv",sep=";")
print("Finished storing data.")

Finished storing data.
