In [2]:
import pandas as pd
import requests
from collections import defaultdict
import os
import operator
import pickle
import time
from collections import Counter
from datetime import datetime

In [3]:
df_pickle_path = "../../pickles/dataframe_survey_2018-01-23_enriched.pickle"

In [4]:
indata = pd.read_pickle(df_pickle_path)
print(indata[["actual","actual_temp","is_s"]].head(5))
print(indata[["actual","actual_temp","is_s"]].tail(5))

   actual actual_temp  is_s
1    INFJ          nf     0
2    INFP          nf     0
3    INTP          nt     0
5    ENFJ          nf     0
10   INFP          nf     0
      actual actual_temp  is_s
25432   INTJ          nt     0
25433   INTP          nt     0
25435   INFP          nf     0
25436   ENFP          nf     0
25437   INFP          nf     0


# Remove non-English texts

In [5]:
len(indata)

22919

In [6]:
indata = indata[indata.lang == "en"]

In [7]:
len(indata)

22588

# Sample n texts per class

In [8]:
sntf_samples = pd.concat([
            indata[indata.func == "s"].sample(100, random_state=123456)[["text","tokens","func","actual_temp"]],
            indata[indata.func == "n"].sample(100, random_state=123456)[["text","tokens","func","actual_temp"]],
            indata[indata.func == "t"].sample(100, random_state=123456)[["text","tokens","func","actual_temp"]],
            indata[indata.func == "f"].sample(100, random_state=123456)[["text","tokens","func","actual_temp"]]
            ])


In [9]:
sntf_samples

Unnamed: 0,text,tokens,func,actual_temp
9630,Sweetdreamsmusic ☁ theme ▶ o t 7 h e a v e n S...,634,s,nf
384,Book.Blog.Bake. | Taking on one book & one rec...,4704,s,nf
13750,--> Time and Tide and Buttered Eggs Wait for n...,1019,s,nt
5455,one swood guy please pick only one column plea...,67,s,nf
5226,pop pop about archive message Theme by Theme S...,73,s,nf
20565,"cunning cunning Mainly hp, with a mix of other...",415,s,sf
2563,Paradox >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>...,237,s,nf
2162,Peek into what's in my brain. Peek into what's...,158,s,nf
20023,Innocent Dreams Return Message Archive Theme F...,414,s,sf
14383,Geronimo! I II III IV V VI Geronimo! Hi. I'm S...,493,s,nt


# Rough tokenization of the sampled texts

In [10]:
tokens = set()
for ix, text in sntf_samples.iterrows():
    for token in text["text"].split():
        tokens.add(token)
len(tokens)

40488

In [18]:
# to Jon
tokens_s = pd.Series(list(tokens))
tokens_s.to_string("../../data/processed/tokens_n40488_100samples_per_class_sntf.txt")

# Classify word-for-word, classifier-for_classifier

In [216]:
def classify_jung_percieving_function_of_text(text):
    """Does what it says, pretty much."""
    header = {"Content-Type": "application/json",
             "Authorization": "Token " + os.environ["UCLASSIFY_READ"]}
    data = {"texts":[text]} # send a one-item list for now, since we don't have a feel for sizes
    try:
        result = requests.post("https://api.uclassify.com/v1/prfekt/jungian-cognitive-function-sensing-intuition/classify",
                       json = data,
                       headers = header)
    except Exception as e:
        time.sleep(600)
        print("Error connecting with uClassify. Sleeping 10 minutes and retrying.")
        result = requests.post("https://api.uclassify.com/v1/prfekt/jungian-cognitive-function-sensing-intuition/classify",
                       json = data,
                       headers = header)
        
    json_result = result.json()
    
    res_dict = {"s":0, "n":0}
    
    for classItem in json_result[0]["classification"]:
        res_dict[classItem["className"]] = classItem["p"]
    
    # sorted_dict = sorted(res_dict.items(), key=operator.itemgetter(1), reverse=True)
    return res_dict

def classify_jung_judging_function_of_text(text):
    """Does what it says, pretty much."""
    header = {"Content-Type": "application/json",
             "Authorization": "Token " + os.environ["UCLASSIFY_READ"]}
    data = {"texts":[text]} # send a one-item list for now, since we don't have a feel for sizes
    try:
        result = requests.post("https://api.uclassify.com/v1/prfekt/jungian-cognitive-function-thinking-feeling/classify",
                       json = data,
                       headers = header)
    except Exception as e:
        time.sleep(600)
        print("Error connecting with uClassify. Sleeping 10 minutes and retrying.")
        result = requests.post("https://api.uclassify.com/v1/prfekt/jungian-cognitive-function-thinking-feeling/classify",
                       json = data,
                       headers = header)
    
    json_result = result.json()
    
    res_dict = {"t":0, "f":0}
    
    for classItem in json_result[0]["classification"]:
        res_dict[classItem["className"]] = classItem["p"]
    
    # sorted_dict = sorted(res_dict.items(), key=operator.itemgetter(1), reverse=True)
    return res_dict

In [None]:
startTime = datetime.now()
results = defaultdict(list)
# Result order S,N,T,F
word_cnt = 1
for word in tokens:
    for name in ["sn","tf"]:
        if name == "sn":
            # classify and store results in order s, n in results dict e.g. results["smokin"].appen(sResultValue)
            res = classify_jung_percieving_function_of_text(word)
            results[word].append(res["s"])
            results[word].append(res["n"])
        else:
            res = classify_jung_judging_function_of_text(word)
            results[word].append(res["t"])
            results[word].append(res["f"])
    word_cnt +=1
    if word_cnt % 100 == 0:
            print("word {} of {} classified".format(word_cnt, len(tokens)))
            print("Time taken: {}".format(datetime.now() - startTime))

word 100 of 40488 classified
Time taken: 0:01:33.941238


# Get keywords from classifiers via keywords endpoint

In [14]:
def get_keywords_from_uclassify_perceiving_classifer(text):
    """
    Uses the keywords endpoint inherent in each classifier on uClassify
    to retrieve the most significant keywords for each class.
    
    See: https://www.uclassify.com/docs/restapi#readcalls-keywords
    ."""
    header = {"Content-Type": "application/json",
             "Authorization": "Token " + os.environ["UCLASSIFY_READ"]}
    data = {"texts":[text]} # send a one-item list for now, since we don't have a feel for sizes
    try:
        result = requests.post("https://api.uclassify.com/v1/prfekt/jungian-cognitive-function-sensing-intuition/keywords",
                       json = data,
                       headers = header)
    except Exception as e:
        time.sleep(180)
        print("Error connecting with uClassify. Sleeping 3 minutes and retrying.")
        result = requests.post("https://api.uclassify.com/v1/prfekt/jungian-cognitive-function-sensing-intuition/keywords",
                       json = data,
                       headers = header)
    
    return result.json()

def get_keywords_from_uclassify_judging_classifer(text):
    """
    Uses the keywords endpoint inherent in each classifier on uClassify
    to retrieve the most significant keywords for each class.
    
    See: https://www.uclassify.com/docs/restapi#readcalls-keywords
    ."""
    header = {"Content-Type": "application/json",
             "Authorization": "Token " + os.environ["UCLASSIFY_READ"]}
    data = {"texts":[text]} # send a one-item list for now, since we don't have a feel for sizes
    try:
        result = requests.post("https://api.uclassify.com/v1/prfekt/jungian-cognitive-function-thinking-feeling/keywords",
                       json = data,
                       headers = header)
    except Exception as e:
        time.sleep(180)
        print("Error connecting with uClassify. Sleeping 3 minutes and retrying.")
        result = requests.post("https://api.uclassify.com/v1/prfekt/jungian-cognitive-function-thinking-feeling/keywords",
                       json = data,
                       headers = header)
    
    return result.json()


# Perceiving

In [11]:
tokens_str = " ".join(tokens)
tokens_str[:30]

'region peeta Jade Delphine THE'

In [12]:
len(tokens_str)

343716

In [15]:
startTime = datetime.now()
res = get_keywords_from_uclassify_perceiving_classifer(tokens_str)
print("took {}".format(datetime.now() - startTime))

took 0:00:01.260022


In [16]:
res

[[{'className': 's', 'keyword': '#x', 'p': 0.996307},
  {'className': 'n', 'keyword': 'ziad', 'p': 0.996454},
  {'className': 'n', 'keyword': 'earthstory', 'p': 0.996454},
  {'className': 'n', 'keyword': 'settlement', 'p': 0.996894},
  {'className': 'n', 'keyword': 'increased', 'p': 0.996454},
  {'className': 'n', 'keyword': 'saleh', 'p': 0.996454},
  {'className': 's', 'keyword': 'lancaster', 'p': 0.996589},
  {'className': 's', 'keyword': 'ashlyn', 'p': 0.997883},
  {'className': 'n', 'keyword': 'deir', 'p': 0.998084},
  {'className': 'n', 'keyword': 'doctrines', 'p': 0.996689},
  {'className': 'n', 'keyword': '↱', 'p': 0.996689},
  {'className': 's', 'keyword': 'jace', 'p': 0.996589},
  {'className': 's', 'keyword': 'noot', 'p': 0.997661},
  {'className': 'n', 'keyword': 'toggle', 'p': 0.996894},
  {'className': 's', 'keyword': '#ph', 'p': 0.997041},
  {'className': 'n', 'keyword': 'modesty', 'p': 0.997925},
  {'className': 'n', 'keyword': 'funnymomentposts', 'p': 0.998915},
  {'cla

In [17]:
kw = pd.DataFrame(res[0])
kw.className.value_counts()

n    22
s     8
Name: className, dtype: int64

In [18]:
sDf = kw[kw.className == "s"]
nDf = kw[kw.className == "n"]

In [19]:
sDf.loc[sDf.p.sort_values(ascending=False).index]

Unnamed: 0,className,keyword,p
23,s,lin,0.997979
26,s,¶,0.997883
7,s,ashlyn,0.997883
12,s,noot,0.997661
14,s,#ph,0.997041
11,s,jace,0.996589
6,s,lancaster,0.996589
0,s,#x,0.996307


In [20]:
nDf.loc[nDf.p.sort_values(ascending=False).index]

Unnamed: 0,className,keyword,p
25,n,roche,0.999469
22,n,ooooooooo,0.999307
16,n,funnymomentposts,0.998915
17,n,thelightiswhite,0.998812
20,n,youfoundmj,0.998652
18,n,thepatheticaesthetic,0.998392
27,n,bankruptspermbank,0.998282
8,n,deir,0.998084
15,n,modesty,0.997925
28,n,☩,0.997738


# Perceiving: get keywords from most common words

In [21]:
common_tokens = Counter()
for ix, text in sntf_samples.iterrows():
    common_tokens.update(text["text"].split())
common_tokens.most_common(10)

[('the', 4878),
 ('and', 3669),
 ('to', 3569),
 ('I', 3182),
 ('a', 3106),
 (':', 2940),
 ('of', 2899),
 ('in', 2042),
 ('notes', 2015),
 ('is', 1750)]

In [22]:
common_tokens_list = []
for tup in common_tokens.most_common(1500):
    common_tokens_list.append(tup[0])

In [23]:
len(common_tokens_list)

1500

In [24]:
# remove n most common tokens
common_tokens_list = common_tokens_list[500:]

In [25]:
common_tokens_str = " ".join(common_tokens_list)
common_tokens_str[:30]

'words months point another alm'

In [28]:
common_perc_res = get_keywords_from_uclassify_perceiving_classifer(common_tokens_str)
common_perc_res

[[{'className': 's', 'keyword': '2nd', 'p': 0.67344},
  {'className': 'n', 'keyword': 'pinterest', 'p': 0.676971},
  {'className': 's', 'keyword': 'spock', 'p': 0.673587},
  {'className': 's', 'keyword': '7th', 'p': 0.711965},
  {'className': 'n', 'keyword': 'poetry', 'p': 0.687495},
  {'className': 'n', 'keyword': 'palestinian', 'p': 0.71323},
  {'className': 's', 'keyword': '↳', 'p': 0.687896},
  {'className': 's', 'keyword': 'lvl', 'p': 0.760022},
  {'className': 'n', 'keyword': '‘', 'p': 0.733083},
  {'className': 'n', 'keyword': 'israel', 'p': 0.728096},
  {'className': 's', 'keyword': '･', 'p': 0.83395},
  {'className': 'n', 'keyword': '■', 'p': 0.764686},
  {'className': 's', 'keyword': 'ho', 'p': 0.730921},
  {'className': 'n', 'keyword': 'israeli', 'p': 0.724133},
  {'className': 's', 'keyword': '☂', 'p': 0.691112},
  {'className': 's', 'keyword': '—i', 'p': 0.959016},
  {'className': 'n', 'keyword': 'gaza', 'p': 0.940188},
  {'className': 'n', 'keyword': '☩', 'p': 0.997738},


In [29]:
common_perc_kw = pd.DataFrame(common_res[0])
common_perc_kw.className.value_counts()

n    17
s    13
Name: className, dtype: int64

In [30]:
common_sDf = common_perc_kw[common_perc_kw.className == "s"]
common_nDf = common_perc_kw[common_perc_kw.className == "n"]

In [31]:
common_sDf

Unnamed: 0,className,keyword,p
0,s,2nd,0.67344
2,s,spock,0.673587
3,s,7th,0.711965
6,s,↳,0.687896
7,s,lvl,0.760022
10,s,･,0.83395
12,s,ho,0.730921
14,s,☂,0.691112
15,s,—i,0.959016
18,s,±,0.764574


In [33]:
common_nDf

Unnamed: 0,className,keyword,p
1,n,pinterest,0.676971
4,n,poetry,0.687495
5,n,palestinian,0.71323
8,n,‘,0.733083
9,n,israel,0.728096
11,n,■,0.764686
13,n,israeli,0.724133
16,n,gaza,0.940188
17,n,☩,0.997738
20,n,wage,0.735467


# Judging

In [34]:
tokens_str = " ".join(tokens)
tokens_str[:30]

'region peeta Jade Delphine THE'

In [35]:
res = get_keywords_from_uclassify_judging_classifer(tokens_str)
res

[[{'className': 'f', 'keyword': 'veganism', 'p': 0.997285},
  {'className': 'f', 'keyword': 'agonistes', 'p': 0.997569},
  {'className': 'f', 'keyword': 'brideshead', 'p': 0.997285},
  {'className': 'f', 'keyword': 'blimeycow', 'p': 0.997899},
  {'className': 't', 'keyword': 'kepler', 'p': 0.99763},
  {'className': 't', 'keyword': 'cosmicvastness', 'p': 0.99763},
  {'className': 't', 'keyword': 'sodium', 'p': 0.997382},
  {'className': 't', 'keyword': 'renamonkalou', 'p': 0.998008},
  {'className': 't', 'keyword': 'astronomyandastrophotography', 'p': 0.998652},
  {'className': 't', 'keyword': 'cassini', 'p': 0.997835},
  {'className': 't', 'keyword': 'thepatheticaesthetic', 'p': 0.999405},
  {'className': 'f', 'keyword': '©mc', 'p': 0.99769},
  {'className': 't', 'keyword': 'Август', 'p': 0.998339},
  {'className': 'f', 'keyword': 'amalia', 'p': 0.997569},
  {'className': 'f', 'keyword': 'jaegerhugs', 'p': 0.997569},
  {'className': 't', 'keyword': 'bankruptspermbank', 'p': 0.99896},
 

In [36]:
kw = pd.DataFrame(res[0])
tDf = kw[kw.className == "t"]
fDf = kw[kw.className == "f"]
kw.className.value_counts()

t    21
f     9
Name: className, dtype: int64

In [37]:
tDf.loc[tDf.p.sort_values(ascending=False).index]

Unnamed: 0,className,keyword,p
21,t,roche,0.999828
16,t,ooooooo,0.999701
25,t,funnymomentposts,0.999554
10,t,thepatheticaesthetic,0.999405
19,t,centreforaviation,0.99904
15,t,bankruptspermbank,0.99896
23,t,youfoundmj,0.998866
27,t,earthstory,0.99884
28,t,oooooooooo,0.998812
29,t,barazani,0.998783


In [38]:
fDf.loc[fDf.p.sort_values(ascending=False).index]

Unnamed: 0,className,keyword,p
22,f,thelightiswhite,0.999428
20,f,lulazel,0.999187
3,f,blimeycow,0.997899
11,f,©mc,0.99769
14,f,jaegerhugs,0.997569
13,f,amalia,0.997569
1,f,agonistes,0.997569
2,f,brideshead,0.997285
0,f,veganism,0.997285


# Judging: get keywords from most common words

In [39]:
common_tokens = Counter()
for ix, text in sntf_samples.iterrows():
    common_tokens.update(text["text"].split())
common_tokens.most_common(10)

[('the', 4878),
 ('and', 3669),
 ('to', 3569),
 ('I', 3182),
 ('a', 3106),
 (':', 2940),
 ('of', 2899),
 ('in', 2042),
 ('notes', 2015),
 ('is', 1750)]

In [40]:
common_tokens_list = []
for tup in common_tokens.most_common(1500):
    common_tokens_list.append(tup[0])
common_tokens_str = " ".join(common_tokens_list)
common_tokens_str[:30]

'the and to I a : of in notes i'

In [41]:
# remove n most common tokens
common_tokens_list = common_tokens_list[500:]

In [42]:
common_res = get_keywords_from_uclassify_judging_classifer(common_tokens_str)
common_res

[[{'className': 'f', 'keyword': '13th', 'p': 0.651443},
  {'className': 't', 'keyword': '●', 'p': 0.652034},
  {'className': 't', 'keyword': '☂', 'p': 0.652428},
  {'className': 'f', 'keyword': '►', 'p': 0.653902},
  {'className': 'f', 'keyword': 'surnames', 'p': 0.654696},
  {'className': 't', 'keyword': 'aoû', 'p': 0.718954},
  {'className': 'f', 'keyword': 'grace', 'p': 0.655412},
  {'className': 'f', 'keyword': '･', 'p': 0.758571},
  {'className': 't', 'keyword': '2015', 'p': 0.710488},
  {'className': 't', 'keyword': 'software', 'p': 0.676922},
  {'className': 't', 'keyword': 'pee', 'p': 0.692516},
  {'className': 't', 'keyword': '☩', 'p': 0.726374},
  {'className': 'f', 'keyword': '✈', 'p': 0.800839},
  {'className': 'f', 'keyword': '☁', 'p': 0.735535},
  {'className': 't', 'keyword': '11th', 'p': 0.659531},
  {'className': 't', 'keyword': '0803', 'p': 0.996689},
  {'className': 't', 'keyword': '—i', 'p': 0.862141},
  {'className': 'f', 'keyword': '±', 'p': 0.931917},
  {'classNa

In [43]:
common_kw = pd.DataFrame(common_res[0])
common_tDf = common_kw[common_kw.className == "t"]
common_fDf = common_kw[common_kw.className == "f"]
common_kw.className.value_counts()

t    19
f    11
Name: className, dtype: int64

In [44]:
common_tDf

Unnamed: 0,className,keyword,p
1,t,●,0.652034
2,t,☂,0.652428
5,t,aoû,0.718954
8,t,2015,0.710488
9,t,software,0.676922
10,t,pee,0.692516
11,t,☩,0.726374
14,t,11th,0.659531
15,t,0803,0.996689
16,t,—i,0.862141


In [45]:
common_fDf

Unnamed: 0,className,keyword,p
0,f,13th,0.651443
3,f,►,0.653902
4,f,surnames,0.654696
6,f,grace,0.655412
7,f,･,0.758571
12,f,✈,0.800839
13,f,☁,0.735535
17,f,±,0.931917
21,f,gaza,0.713557
22,f,jaegerhugs,0.997569


# Inspect the 7300 classified words from 15 samples

In [193]:
df = pd.read_pickle("sntf_7300_sampled_words_classification_results_df.pickle")
df.head(5)

Unnamed: 0,s,n,t,f
!,0.5,0.5,0.5,0.5
!!!,0.5,0.5,0.5,0.5
!!!!!,0.5,0.5,0.5,0.5
#,0.606613,0.393387,0.398213,0.601787
#000<semic>,0.84893,0.15107,0.847797,0.152203


In [194]:
df.s.sort_values(ascending=False).head(20)

mmatsuokah            1.000000
tomhazeldine          0.999999
nosebleed             0.999999
trolltina             0.999999
출처:                   0.999999
kvotheunkvothe        0.999999
Dhavernas             0.999998
veinteunopilotas      0.999998
shyghost              0.999998
lucyheartfllia        0.999997
watchmen…             0.999997
giffing               0.999994
ntonystark            0.999993
postracialcomments    0.999993
tolzmannia            0.999993
kijikun               0.999993
largecoin             0.999993
jusblaz               0.999993
#glasslip             0.999987
jackbarakat           0.999987
Name: s, dtype: float64

In [195]:
df.n.sort_values(ascending=False).head(20)

Secular        1.000000
secular        1.000000
15-year-old    0.999999
Voiceover.     0.999999
Jennwith2ns    0.999999
#sunggyu       0.999999
wecansexy      0.999999
Rescuing       0.999998
knickers       0.999997
Buechner.      0.999997
Buechner       0.999997
Shusterman     0.999997
freudian       0.999995
h-a-r-p-o      0.999995
re-visit       0.999995
#myungsoo      0.999993
bonkers.       0.999993
Rarasaur       0.999993
agnostics,     0.999993
get,           0.999991
Name: n, dtype: float64

In [196]:
df.t.sort_values(ascending=False).head(20)

DERSITE                 1.000000
#nozaki                 0.999999
Nobodies.               0.999998
artalias                0.999998
sagging                 0.999998
antichamber             0.999996
saeto15                 0.999995
i-killed-your-senpai    0.999995
kiekstn                 0.999995
valvrave                0.999993
Valvrave                0.999993
Jacobin                 0.999991
bakandroids             0.999990
taliabobalia            0.999990
guertena-art-museum     0.999990
brontosaurs             0.999978
brontosaurus,           0.999978
mcstump                 0.999970
arquius                 0.999970
hoes”                   0.999970
Name: t, dtype: float64

In [197]:
df.f.sort_values(ascending=False).head(20)

Jennwith2ns           0.999999
anondracomalfoy       0.999999
its-hipster-alpaca    0.999999
tw<semic>             0.999998
karahkan              0.999998
Grendel               0.999997
Buechner              0.999997
Buechner.             0.999997
three’s               0.999997
Waltzing              0.999997
doubtless             0.999996
unwound.              0.999996
onac911               0.999996
#dongwoo              0.999992
aswechoke             0.999992
Galoshes)             0.999992
ntonystark            0.999992
starrlesscity         0.999992
avpm/avps/avpsy       0.999992
Galoshes              0.999992
Name: f, dtype: float64