In [45]:
import pandas as pd
import re
from collections import Counter
pd.set_option('precision',6)

In [46]:
df = pd.read_pickle("../pickles/dataframe_survey_2018-01-23_enriched.pickle")

In [59]:
df["func"].tail() # Ensure values for func all the way through the dataframe

27108    f
27109    n
27110    n
27111    n
27112    n
Name: func, dtype: object

In [47]:
df.domain.value_counts()

tumblr                          23776
blogspot                          566
wordpress                         525
co                                283
twitter                           240
facebook                          168
google                             55
livejournal                        53
Tumblr                             29
weebly                             27
reddit                             27
okcupid                            21
TUMBLR                             20
fanfiction                         20
intjforum                          16
tumbr                              16
youtube                            13
personalitycafe                    13
personalityjunkie                  13
dreamwidth                         13
typealyzer                         12
ovh                                12
pointlesssites                     12
deviantart                         12
Twitter                            11
fighunter                          11
instagram   

# Filter out English blog texts over 100 tokens

In [60]:
df = df[(df.tokens > 100) & 
       (df.lang == "en") &
       (df.domain.str.contains("tumblr")) |
       (df.domain.str.contains("blogspot")) |
       (df.domain.str.contains("wordpress"))]
len(df)

20708

# Kaggle data set

In [82]:
!ls ../data

README.md [34mdata[m[m      [34mnotebooks[m[m [34mpickles[m[m   [34mpy[m[m


In [89]:
data = df[["text","func","funcatt"]]
data.columns = ["text","base_function", "directed_function"]
data.to_csv("../data/processed/blog_texts_and_cognitive_function.csv", sep=";")
print(data.directed_function.value_counts())
print(len(data))
print(data.head())

ni    6653
fi    4822
ti    3803
ne    1938
si    1843
fe     677
se     533
te     439
Name: directed_function, dtype: int64
20708
                                                text base_function  \
1  ‚ùÄ*a drop of colour*‚ùÄ 1/39 next‚Üí home ask past ...             n   
2  Neko cool kids can't die home family daveblog ...             f   
3  Anything... Anything Mass Effect-related Music...             t   
5  Perpetually In Transit Perpetually In Transit ...             n   
6  Moving on Moving on Chronicles of an Anglo/Spa...             f   

  directed_function  
1                ni  
2                fi  
3                ti  
5                ne  
6                fi  


In [86]:
len(data.text.unique())

20157

# Filter out text and Jungian functions (S,N,T,F) only

In [61]:
func_text = df[["text","func"]]
func_text.head()

Unnamed: 0,text,func
1,‚ùÄ*a drop of colour*‚ùÄ 1/39 next‚Üí home ask past ...,n
2,Neko cool kids can't die home family daveblog ...,f
3,Anything... Anything Mass Effect-related Music...,t
5,Perpetually In Transit Perpetually In Transit ...,n
6,Moving on Moving on Chronicles of an Anglo/Spa...,f


In [62]:
len(func_text[pd.isnull(func_text.func)]) # Ensure no NaN values

0

In [52]:
def calc_word_percentages(func=None):
    if not func:
        word_prc = {}
        tot_words = 0
        cnt = Counter()
        for ix, row in df.iterrows():
            tokens = row["text"].split()
            cnt.update(tokens)
            for token in tokens:
                tot_words += 1
        for word, count in cnt.most_common(100000):
            word_prc[word] = count / tot_words
        word_prc["tokns"] = tot_words
        return word_prc
    else:
        word_prc = {}
        tot_words = 0
        cnt = Counter()
        func_df = df[df["func"] == func]
        for ix, row in func_df.iterrows():
            tokens = row["text"].split()
            cnt.update(tokens)
            for token in tokens:
                tot_words += 1
        for word, count in cnt.most_common(1000):
            word_prc[word] = count / tot_words
        word_prc["tokns"] = tot_words
        return word_prc

In [63]:
series_list

[                                        1.007487e-06
 !                                        1.038487e-04
 !!                                       2.913963e-05
 !!!                                      2.100223e-05
 !!!!                                     7.129909e-06
 !!!!!                                    6.354919e-06
 !!!!!!                                   2.867463e-06
 !!!!!!!                                  2.014974e-06
 !!!!!!!!                                 1.394982e-06
 !!!!!!!!!                                1.084986e-06
 !!!!!!!!!!                               6.974911e-07
 !!!!!!!!!!!                              5.424931e-07
 !!!!!!!!!!!!                             1.084986e-06
 !!!!!!!!!!!!!                            6.974911e-07
 !!!!!!!!!!!!!!                           6.974911e-07
 !!!!!!!!!!!!!!!!!!                       3.874950e-07
 !!!!!!!!!!!!!!!!!!!                      5.424931e-07
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!    7.749901e-07
 !!!!?    

In [53]:
series_list = []
for func in [None, "s", "n", "t", "f"]:
    d = calc_word_percentages(func=func)
    if func:
        series_name = func + "_s"
        series_name = pd.Series(d, name=func)
    else:
        series_name = "tot" + "_s"
        series_name = pd.Series(d, name="tot")
    series_list.append(series_name)
data = pd.concat(series_list, axis=1)
pd.to_pickle(data, "../pickles/jung_functions_word_freqs.pickle")
print(data.head(5))

           tot         s         n         t         f
     0.000001       NaN       NaN       NaN       NaN
!     0.000104  0.000096  0.000099  0.000113  0.000108
!!    0.000029       NaN       NaN       NaN       NaN
!!!   0.000021       NaN       NaN       NaN       NaN
!!!!  0.000007       NaN       NaN       NaN       NaN


In [79]:
prc_df = pd.DataFrame(data=[series_list], index=series_list[0].index, columns=["tot_prc","s_prc","n_prc","t_prc","f_prc"])
prc_df[["s_prc","n_prc"]].head(3)

Unnamed: 0,s_prc,n_prc
,! 0.000096 # 0.005061 #i ...,! 0.000099 # 0.004635 #i ...
!,! 0.000096 # 0.005061 #i ...,! 0.000099 # 0.004635 #i ...
!!,! 0.000096 # 0.005061 #i ...,! 0.000099 # 0.004635 #i ...


In [71]:
series_list[0].index

Index(['', '!', '!!', '!!!', '!!!!', '!!!!!', '!!!!!!', '!!!!!!!', '!!!!!!!!',
       '!!!!!!!!!',
       ...
       'üòç', 'üòçüòç', 'üòçüòçüòçüòç', 'üòé', 'üòè', 'üòí', 'üò≠', 'üò≥', 'üôå', 'üôè'],
      dtype='object', length=100001)

In [65]:
df["tot_prc"] = series_list[0]
df["s_prc"] = series_list[1]
df["n_prc"] = series_list[2]
df["t_prc"] = series_list[3]
df["f_prc"] = series_list[4]
df


Unnamed: 0,url,typealyzer,actual,e,s,t,sntf_s,sntf_n,sntf_t,sntf_f,...,sad,anger,discrep,negate,quant,tot_prc,s_prc,n_prc,t_prc,f_prc
1,http://adropofcolour.tumblr.com,ISFP,INFJ,0.291281,0.787844,0.460961,0.663515,0.178565,6.928170e-02,0.088638,...,0.001133,0.005663,0.016988,0.041903,0.033975,,,,,
2,http://godheadcomplex.tumblr.com,ESFP,INFP,0.883579,0.951693,0.238407,0.855921,0.046931,2.185050e-02,0.075297,...,0.000000,0.009852,0.014778,0.034483,0.024631,,,,,
3,http://chaotikaeon2.tumblr.com,INTJ,INTP,0.332444,0.357863,0.591322,0.147668,0.252326,3.398310e-01,0.260175,...,0.000000,0.012259,0.015762,0.040280,0.029772,,,,,
5,http://perpetually-in-transit.blogspot.com,ESFP,ENFJ,0.944394,0.943192,0.105527,0.778825,0.051134,1.729950e-02,0.152742,...,0.003283,0.006098,0.012195,0.017824,0.031895,,,,,
6,http://www.chocolateannie.blogspot.com,ESFP,ENFP,0.816134,0.860018,0.051798,0.536634,0.090553,1.824580e-02,0.354567,...,0.003268,0.000000,0.032680,0.027778,0.055556,,,,,
10,http://museofmystery.wordpress.com/2012/08/29/...,ISTP,INFP,0.073352,0.850472,0.608812,0.628322,0.112762,1.492700e-01,0.109646,...,0.013865,0.005199,0.015598,0.015598,0.019064,,,,,
11,http://ash-the-awesome.tumblr.com/,ISTP,I don't know,0.418129,0.769263,0.537529,0.495471,0.150067,1.815430e-01,0.172920,...,0.002497,0.004994,0.007491,0.038702,0.041199,,,,,
15,http://random-nerdy-stuff.tumblr.com,ESTP,ENTP,0.967408,0.985934,0.987147,0.882092,0.011472,1.049620e-01,0.001473,...,0.000000,0.019324,0.014493,0.091787,0.004831,,,,,
16,http://supernoondles.tumblr.com,ISFP,INTP,0.183238,0.732849,0.267514,0.522277,0.192229,7.170000e-02,0.213794,...,0.009061,0.010708,0.008237,0.066722,0.023888,,,,,
17,http://supernoondles.tumblr.com/tagged/writing,ESFP,INTP,0.730095,0.913702,0.076379,0.711090,0.067741,1.635580e-02,0.204813,...,0.006737,0.016467,0.014970,0.026946,0.036677,,,,,


In [54]:
data[data.tot.isna()] # Check that there are no faulty lines

Unnamed: 0,tot,s,n,t,f


In [55]:
data

Unnamed: 0,tot,s,n,t,f
,1.007487e-06,,,,
!,1.038487e-04,0.000096,0.000099,0.000113,0.000108
!!,2.913963e-05,,,,
!!!,2.100223e-05,,,,
!!!!,7.129909e-06,,,,
!!!!!,6.354919e-06,,,,
!!!!!!,2.867463e-06,,,,
!!!!!!!,2.014974e-06,,,,
!!!!!!!!,1.394982e-06,,,,
!!!!!!!!!,1.084986e-06,,,,


# Sensing vs iNtuition

In [120]:
data = pd.read_pickle("../pickles/jung_functions_word_freqs.pickle")

In [121]:
sn_df = data[["s","n","tot"]]
sn_df.head(3)

Unnamed: 0,s,n,tot
,,,7.949e-07
!,0.0001071096,0.0001006225,0.0001037598
!!,,,2.65972e-05


In [83]:
s_dev = []
for ix, row in sn_df.iterrows():
    s_dev.append(row["s"] - row["tot"])
s = pd.Series(s_dev, index=sn_df.index)
s = s.fillna(0)
s_df = s.to_frame(name="sDev")

n_dev = []
for ix, row in sn_df.iterrows():
    n_dev.append(row["n"] - row["tot"])
n = pd.Series(n_dev, index=sn_df.index)
n = n.fillna(0)
n_df = n.to_frame(name="nDev")

In [98]:
s_df[s_df.sDev > 0.0005]

Unnamed: 0,sDev
#,0.0011583929
),0.0005867616
",",0.0005546446
-->,0.0006103559
:,0.0016770887
ago,0.0011413999
notes,0.0013304999


In [96]:
n_df[n_df.nDev > 0.0001]

Unnamed: 0,nDev
I,0.0002013858
a,0.00011999
and,0.0002068865
of,0.000343521
the,0.0004494211


In [111]:
print("sn_df: {}, s_df: {}, n_df: {}".format(len(sn_df.index), len(s_df.index), len(n_df.index)))

sn_df: 100001, s_df: 100001, n_df: 100001


In [112]:
sn_df.index

Index(['', '!', '!!', '!!!', '!!!!', '!!!!!', '!!!!!!', '!!!!!!!', '!!!!!!!!',
       '!!!!!!!!!',
       ...
       'üòé', 'üòè', 'üòë', 'üòí', 'üòò', 'üò≠', 'üò≥', 'üôÇ', 'üôå', 'üôè'],
      dtype='object', length=100001)

In [113]:
s_df.index

Index(['', '!', '!!', '!!!', '!!!!', '!!!!!', '!!!!!!', '!!!!!!!', '!!!!!!!!',
       '!!!!!!!!!',
       ...
       'üòé', 'üòè', 'üòë', 'üòí', 'üòò', 'üò≠', 'üò≥', 'üôÇ', 'üôå', 'üôè'],
      dtype='object', length=100001)

In [114]:
n_df.index

Index(['', '!', '!!', '!!!', '!!!!', '!!!!!', '!!!!!!', '!!!!!!!', '!!!!!!!!',
       '!!!!!!!!!',
       ...
       'üòé', 'üòè', 'üòë', 'üòí', 'üòò', 'üò≠', 'üò≥', 'üôÇ', 'üôå', 'üôè'],
      dtype='object', length=100001)

In [116]:
df = pd.concat([sn_df.reset_index(drop=True), s_df.reset_index(drop=True), n_df.reset_index(drop=True)], axis=1)

In [117]:
df.index = sn_df.index
df

Unnamed: 0,s,n,tot,sDev,nDev
,,,0.0000007949,0.0000000000,0.0000000000
!,0.0001071096,0.0001006225,0.0001037598,0.0000033498,-0.0000031373
!!,,,0.0000265972,0.0000000000,0.0000000000
!!!,,,0.0000191378,0.0000000000,0.0000000000
!!!!,,,0.0000064812,0.0000000000,0.0000000000
!!!!!,,,0.0000052583,0.0000000000,0.0000000000
!!!!!!,,,0.0000023234,0.0000000000,0.0000000000
!!!!!!!,,,0.0000018954,0.0000000000,0.0000000000
!!!!!!!!,,,0.0000011617,0.0000000000,0.0000000000
!!!!!!!!!,,,0.0000009783,0.0000000000,0.0000000000


In [119]:
sn_df

Unnamed: 0,s,n,tot
,,,0.0000007949
!,0.0001071096,0.0001006225,0.0001037598
!!,,,0.0000265972
!!!,,,0.0000191378
!!!!,,,0.0000064812
!!!!!,,,0.0000052583
!!!!!!,,,0.0000023234
!!!!!!!,,,0.0000018954
!!!!!!!!,,,0.0000011617
!!!!!!!!!,,,0.0000009783


In [53]:
n.index

Index(['', '!', '!!', '!!!', '!!!!', '!!!!!', '!!!!!!', '!!!!!!!', '!!!!!!!!',
       '!!!!!!!!!',
       ...
       'üòé', 'üòè', 'üòë', 'üòí', 'üòò', 'üò≠', 'üò≥', 'üôÇ', 'üôå', 'üôè'],
      dtype='object', length=100001)

In [54]:
sn_df.index

Index(['', '!', '!!', '!!!', '!!!!', '!!!!!', '!!!!!!', '!!!!!!!', '!!!!!!!!',
       '!!!!!!!!!',
       ...
       'üòé', 'üòè', 'üòë', 'üòí', 'üòò', 'üò≠', 'üò≥', 'üôÇ', 'üôå', 'üôè'],
      dtype='object', length=100001)