In [1]:
import pandas as pd
import os
import requests
import numpy as np
import operator
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
import time

In [2]:
!pwd

/Users/mos/Dropbox/memeticscience/typealyzer-dataset/notebooks/jungian_classification


In [3]:
df_pickle_path = "../../pickles/dataframe_survey_2018-01-23_enriched.pickle"

In [4]:
indata = pd.read_pickle(df_pickle_path)
indata[["actual","actual_temp","is_s"]].head(5)

Unnamed: 0,actual,actual_temp,is_s
1,INFJ,nf,0
2,INFP,nf,0
3,INTP,nt,0
5,ENFJ,nf,0
10,INFP,nf,0


In [5]:
indata.domain.value_counts()

tumblr       21938
blogspot       513
wordpress      468
Name: domain, dtype: int64

In [6]:
indata.lang.value_counts().head()

en    22588
fr       50
da       34
de       25
no       23
Name: lang, dtype: int64

# Filter out English texts only

In [7]:
len(indata)

22919

In [8]:
indata = indata[indata.lang == "en"]

In [9]:
len(indata)

22588

# Add columns percieving and judging for evaluation of s/n and t/f classifiers

In [10]:
indata["perc_func"] = indata.actual_temp.str.extract("(\w)\w", expand=False)
indata["judg_func"] = indata.actual_temp.str.extract("\w(\w)", expand=False)

In [11]:
len(indata[pd.isnull(indata["tokens"])])

0

In [12]:
indata[pd.isnull(indata["tokens"])]

Unnamed: 0,url,typealyzer,actual,e,s,t,sntf_s,sntf_n,sntf_t,sntf_f,...,cogmech,auxverb,they,incl,money,feel,we,hear,perc_func,judg_func


# Inspect original data function distributions

In [13]:
s_series = indata[indata.perc_func == "s"]["tokens"]
n_series = indata[indata.perc_func == "n"]["tokens"]
t_series = indata[indata.judg_func == "t"]["tokens"]
f_series = indata[indata.judg_func == "f"]["tokens"]

avg_tkns = {
    "s":s_series.mean(),
    "n":n_series.mean(),
    "t":t_series.mean(),
    "f":f_series.mean()
}
avg_tkns

{'f': 488.2908894968084,
 'n': 511.4496560721063,
 's': 457.63189127105665,
 't': 511.27211970074814}

In [14]:
indata.perc_func.value_counts()

n    16864
s     5224
Name: perc_func, dtype: int64

In [15]:
indata.judg_func.value_counts()

f    12063
t    10025
Name: judg_func, dtype: int64

# Sample equal size text chunks for training and evaluation data
See: [Pandas sample()](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sample.html)

## Percieving function

In [16]:
# We have 5224 cases in the smallest class s 
perc_samples = pd.concat([
            indata[indata.perc_func == "s"].sample(3000, random_state=123456)[["text","tokens","perc_func","judg_func","actual_temp"]],
            indata[indata.perc_func == "n"].sample(3000, random_state=123456)[["text","tokens","perc_func","judg_func","actual_temp"]]
            ])

In [17]:
len(perc_samples)

6000

In [18]:
perc_samples.perc_func.value_counts()

s    3000
n    3000
Name: perc_func, dtype: int64

In [19]:
perc_samples.judg_func.value_counts()

f    3363
t    2637
Name: judg_func, dtype: int64

In [55]:
perc_samples.to_pickle("jung_percieving_functions_samples_blogs_totn6000.pickle")

In [56]:
perc_samples = pd.read_pickle("jung_percieving_functions_samples_blogs_totn6000.pickle")

## Judging function

In [57]:
# We have 10025 cases in the smallest class t
judg_samples = pd.concat([
            indata[indata.judg_func == "t"].sample(3000, random_state=123456)[["text","tokens","perc_func","judg_func","actual_temp"]],
            indata[indata.judg_func == "f"].sample(3000, random_state=123456)[["text","tokens","perc_func","judg_func","actual_temp"]]
            ])

In [58]:
len(judg_samples)

6000

In [59]:
judg_samples.judg_func.value_counts()

f    3000
t    3000
Name: judg_func, dtype: int64

In [60]:
judg_samples.perc_func.value_counts()

n    4598
s    1402
Name: perc_func, dtype: int64

In [61]:
judg_samples.to_pickle("jung_judging_functions_samples_blogs_totn6000.pickle")

In [62]:
judg_samples = pd.read_pickle("jung_judging_functions_samples_blogs_totn6000.pickle")

# Setup uClassify classifer and prepare training and evaluation datasets
The variable `os.environ["UCLASSIFY_WRITE"]` is created by adding a line to e.g. `~/.profile`:

`export UCLASSIFY_WRITE = "<your_uclassify_write_key>"`

# Split percieving samples into train and eval subsets.

In [44]:
len(perc_samples)

6000

In [45]:
# prepare column to keep track of what's been used for training
zeros = np.zeros(len(perc_samples))
perc_samples["perc_training_set"] = zeros
perc_samples["perc_training_set"] = perc_samples.perc_training_set.astype("int")
perc_samples.head(3)

Unnamed: 0,text,tokens,perc_func,judg_func,actual_temp,perc_training_set
8623,Sonny Jooooooooon INDEX ASK PAST THEME Sonny J...,386,s,f,sf,0
11987,Log in | Tumblr Sign up Terms Privacy Posted b...,52,s,t,st,0
5340,a thing of blood © hi im logan and i love the ...,440,s,f,sf,0


In [47]:
sn_traing_set_size = 2100 # e.g. 2100 is 70% of 3000 samples
perc_s_train = perc_samples[perc_samples.perc_func == "s"].sample(sn_traing_set_size).index
perc_n_train = perc_samples[perc_samples.perc_func == "n"].sample(sn_traing_set_size).index
#print(len(set(perc_s_train.index)|set(perc_n_train.index)))

perc_train = perc_s_train.union(perc_n_train)

perc_samples.loc[perc_train, "perc_training_set"] = 1
perc_samples.to_pickle("../../pickles/n2100training_and_n900evaluation_samples_dataframe_sn.pickle")
perc_samples.head(3)

Unnamed: 0,text,tokens,perc_func,judg_func,actual_temp,perc_training_set
8623,Sonny Jooooooooon INDEX ASK PAST THEME Sonny J...,386,s,f,sf,1
11987,Log in | Tumblr Sign up Terms Privacy Posted b...,52,s,t,st,0
5340,a thing of blood © hi im logan and i love the ...,440,s,f,sf,0


In [52]:
len(perc_s_train.union(perc_n_train))

4200

In [53]:
len(perc_s_train.intersection(perc_n_train))

0

In [54]:
# Inspect training DataFrame
perc_train_for_inspection = perc_samples[perc_samples.perc_training_set == 1]
perc_train_for_inspection.head(3)

Unnamed: 0,text,tokens,perc_func,judg_func,actual_temp,perc_training_set
8623,Sonny Jooooooooon INDEX ASK PAST THEME Sonny J...,386,s,f,sf,1
17158,the aquariums I go by Mint/Minty (Still workin...,196,s,f,sf,1
20508,void of lights void of lights Search Ask me an...,1922,s,t,st,1


In [55]:
# Separate evaluation DataFrame
perc_eval_set = perc_samples[perc_samples.perc_training_set == 0]
perc_eval_set.head(3)

Unnamed: 0,text,tokens,perc_func,judg_func,actual_temp,perc_training_set
11987,Log in | Tumblr Sign up Terms Privacy Posted b...,52,s,t,st,0
5340,a thing of blood © hi im logan and i love the ...,440,s,f,sf,0
23420,Nobody can be uncheered with a baloon (^ v ^) ...,253,s,f,sf,0


In [63]:
perc_samples.perc_training_set.value_counts()

1    4200
0    1800
Name: perc_training_set, dtype: int64

In [77]:
# set union. Expected value 6000
len( ( set( perc_samples[perc_samples.perc_training_set == 1].index) | set( perc_eval_set.index ) ) )

6000

In [78]:
# Make sure 'perc_eval_set' is really the same as 'perc_samples[perc_samples.perc_training_set == 0]'
len((set(perc_samples[perc_samples.perc_training_set == 1].index) | set(perc_samples[perc_samples.perc_training_set == 0].index)))

6000

In [79]:
# set intersection. Expected value 0, meaning no overlap.
len( ( set(perc_samples[perc_samples.perc_training_set == 1]) & set(perc_samples[perc_samples.perc_training_set == 0])))

6

In [85]:
# set intersection on indeces explicitly. Exepected value 0, meaning no overlap.
len(  set(perc_eval_set.index) & set(perc_samples[perc_samples.perc_training_set == 1].index ) )

0

In [59]:
# set intersection. Expected value 0
len((set(perc_samples[perc_samples.perc_training_set == 1]) & set(perc_eval_set.index)))

0

In [73]:
# Indeces explicitly.
s_train_ix = perc_samples.loc[(perc_samples.perc_func == "s") & (perc_samples.perc_training_set == 1)].index
s_eval_ix = perc_samples.loc[(perc_samples.perc_func == "s") & (perc_samples.perc_training_set == 0)].index
print(len(s_train_ix.union(s_eval_ix)))
print(len(s_train_ix.intersection(s_eval_ix)))

3000
0


In [88]:
# indices implicilty.
s_train = perc_samples.loc[(perc_samples.perc_func == "s") & (perc_samples.perc_training_set == 1)]
s_eval = perc_samples.loc[(perc_samples.perc_func == "s") & (perc_samples.perc_training_set == 0)]
print(len(s_train.index.union(s_eval.index)))
print(len(s_train.index.intersection(s_eval.index)))

3000
0


In [74]:
n_train_ix = perc_samples.loc[(perc_samples.perc_func == "n") & (perc_samples.perc_training_set == 1)].index
n_eval_ix = perc_samples.loc[(perc_samples.perc_func == "n") & (perc_samples.perc_training_set == 0)].index
print(len(n_train_ix.union(n_eval_ix)))
print(len(n_train_ix.intersection(n_eval_ix)))

3000
0


## Train SN classifer

prfekt/jung-sensing-2100-20180319

In [95]:
def train_jung_cognitive_functions_en_classes(func, classifier):
    """Presupposes that classifier is created and that setup_jung_functions_en_classes() is already run.
    func: expects one of ["s","n","t","f"]
    classifier: expects on of ["sntf", "tf", "sn"]
    
    """
    text_count = 1
    if classifier == "sn":
        for ix, row in perc_samples.loc[(perc_samples.perc_func == name) & (perc_samples.perc_training_set == 1)].iterrows():
            
            data = {"texts":[row["text"]]}
            header = {"Content-Type": "application/json",
                 "Authorization": "Token " + os.environ["UCLASSIFY_WRITE"]}
            
            try:
                response = requests.post('https://api.uclassify.com/v1/me/jung-perceving-2100-20180319/' + name + "/train", 
                    json = data,
                    headers = header)
            except Exception as e:
                print("Error: {}. retrying in 3 minutes.")
                time.sleep(180)
                response = requests.post('https://api.uclassify.com/v1/me/jung-perceving-2100-20180319/' + name + "/train", 
                    json = data,
                    headers = header)
                
            if text_count % 100 == 0:
                    print("{}:{}".format(name, text_count))
                
            text_count += 1
        
    elif classifier == "tf":
        for ix, row in judg_samples.loc[(judg_samples.judg_func == name) & (judg_samples.judg_training_set == 1)].iterrows():
    
            data = {"texts":[row["text"]]}
            header = {"Content-Type": "application/json",
                 "Authorization": "Token " + os.environ["UCLASSIFY_WRITE"]}
            
            try:
                response = requests.post('https://api.uclassify.com/v1/me/jung-judging-2100-20180319/' + name + "/train", 
                    json = data,
                    headers = header)
            except Exception as e:
                print("Error: {}. retrying in 3 minutes.")
                time.sleep(180)
                response = requests.post('https://api.uclassify.com/v1/me/jung-judging-2100-20180319/' + name + "/train", 
                    json = data,
                    headers = header)
                
            if text_count % 100 == 0:
                print("{}:{}".format(name, text_count))
            text_count += 1
        
    
    print("Finished training Jung Cognitive Functions: {}".format(name))

In [96]:
perc_trained_ix = []
for name in ["s","n"]:
    functions_trained_ix = train_jung_cognitive_functions_en_classes(name, classifier="sn")
    perc_trained_ix.append(perc_trained_ix)

s:100
s:200
s:300
s:400
s:500
s:600
s:700
s:800
s:900
s:1000
s:1100
s:1200
s:1300
s:1400
s:1500
s:1600
s:1700
s:1800
s:1900
s:2000
s:2100
Finished training Jung Cognitive Functions: s
n:100
n:200
n:300
n:400
n:500
n:600
n:700
n:800
n:900
n:1000
n:1100
n:1200
n:1300
n:1400
n:1500
n:1600
n:1700
n:1800
n:1900
n:2000
n:2100
Finished training Jung Cognitive Functions: n


In [71]:
print("length perc_eval_set: {}".format(len(perc_eval_set)))
print("length perc_trained_ix: {}".format(len(perc_trained_ix)))

length perc_eval_set: 1800
length perc_trained_ix: 2


In [72]:
perc_trained_ix #oooops!

[[...], [...]]

# Split judging samples into train and eval subsets.

In [73]:
# prepare column to keep track of what's been used for training
zeros = np.zeros(len(judg_samples))
judg_samples["judg_training_set"] = zeros
judg_samples["judg_training_set"] = judg_samples.judg_training_set.astype("int")
judg_samples.head(3)

Unnamed: 0,text,tokens,perc_func,judg_func,actual_temp,judg_training_set
22981,it is what it is About Name: Heidi Age:16 Wher...,565,s,t,st,0
24378,https://www.tumblr.com/themes/by/leentheme htt...,582,n,t,nt,0
5187,three things cannot be long hidden © three thi...,516,n,t,nt,0


In [74]:
tf_traing_set_size = 2100 # e.g. 2100 is 70% of 3000 samples
judg_t_train = judg_samples[judg_samples.judg_func == "t"].sample(tf_traing_set_size).index
judg_f_train = judg_samples[judg_samples.judg_func == "f"].sample(tf_traing_set_size).index

judg_train = judg_t_train.union(judg_f_train)

judg_samples.loc[judg_train, "judg_training_set"] = 1
judg_samples.to_pickle("../../pickles/2100training_and_n900evaluation_samples_dataframe_tf.pickle")
judg_samples.head(15)

Unnamed: 0,text,tokens,perc_func,judg_func,actual_temp,judg_training_set
22981,it is what it is About Name: Heidi Age:16 Wher...,565,s,t,st,0
24378,https://www.tumblr.com/themes/by/leentheme htt...,582,n,t,nt,1
5187,three things cannot be long hidden © three thi...,516,n,t,nt,1
4307,"none gf with left feel 12442 ★ August 1st, 201...",499,s,t,st,1
2606,"God, Faith, & Fitness God, Faith, & Fitness Me...",924,n,t,nt,0
18214,"big hype, big letdown ♡ i'm charlotte and i li...",101,s,t,st,0
10718,Love the life you live Live the life you love ...,353,n,t,nt,1
20277,❤❤❤ - - - - - ♚ - - | momo | 14 | ♎ | ESTJ | |...,473,s,t,st,1
846,Cynically Marvelous | It's Axiomatic. Cynicall...,6973,n,t,nt,1
16405,I'll just pretend that youth will never end I'...,693,n,t,nt,1


In [75]:
# Separate evaluation DataFrame
judg_eval_set = judg_samples[judg_samples.judg_training_set == 0]
judg_eval_set.head(3)

Unnamed: 0,text,tokens,perc_func,judg_func,actual_temp,judg_training_set
22981,it is what it is About Name: Heidi Age:16 Wher...,565,s,t,st,0
2606,"God, Faith, & Fitness God, Faith, & Fitness Me...",924,n,t,nt,0
18214,"big hype, big letdown ♡ i'm charlotte and i li...",101,s,t,st,0


## Train TF classifier

In [76]:
tf_trained_ix = []
for name in ["t","f"]:
    tf_trained_ix = train_jung_cognitive_functions_en_classes(name, classifier="tf")
    tf_trained_ix.append(tf_trained_ix)

t:100
t:200
t:300
t:400
t:500
t:600
t:700
t:800
t:900
t:1000
t:1100
t:1200
t:1300
t:1400
t:1500
t:1600
t:1700
t:1800
t:1900
t:2000
t:2100
Finished training Jung Cognitive Functions: t
f:100
f:200
f:300
f:400
f:500
f:600
f:700
f:800
f:900
f:1000
f:1100
f:1200
f:1300
f:1400
f:1500
f:1600
f:1700
f:1800
f:1900
f:2000
f:2100
Finished training Jung Cognitive Functions: f


In [77]:
print("length judg_eval_set: {}".format(len(judg_eval_set)))
print("length tf_trained_ix: {}".format(len(tf_trained_ix)))

length judg_eval_set: 1800
length tf_trained_ix: 2101


# Classify percieving function 

In [78]:
def classify_jung_percieving_function_of_text(text):
    """Does what it says, pretty much."""
    header = {"Content-Type": "application/json",
             "Authorization": "Token " + os.environ["UCLASSIFY_READ"]}
    data = {"texts":[text]} # send a one-item list for now, since we don't have a feel for sizes
    result = requests.post("https://api.uclassify.com/v1/prfekt/jungian-cognitive-function-sensing-intuition/classify",
                       json = data,
                       headers = header)
    json_result = result.json()
    
    res_dict = {"s":0, "n":0}
    
    for classItem in json_result[0]["classification"]:
        res_dict[classItem["className"]] = classItem["p"]
    
    sorted_dict = sorted(res_dict.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_dict

In [80]:
zeros = np.zeros(len(perc_eval_set))
sn_results = []
row_cnt = 1
for ix, row in perc_eval_set.iterrows():
    print("row: {} of {}".format(row_cnt, len(perc_eval_set)),end="\r")
    res = classify_jung_percieving_function_of_text(row["text"])
    sn_results.append(res[0][0])
    row_cnt += 1

row: 1800 of 1800

In [81]:
len(sn_results)

1800

Add the percieving classification results to the evaulation dataset 

In [82]:
perc_eval_set = pd.concat([perc_eval_set,
                      pd.DataFrame(sn_results, index=perc_eval_set.index)
                     ], axis=1, ignore_index=True)
perc_eval_set.columns = ["text","tokens","perc_func","judg_func","actual_temp","perc_training_set","sn"]
perc_eval_set.to_pickle("classification_results_percieving_function_blogs_n5000_dataframe.pickle")
print(perc_eval_set.head(3))

                                                    text  tokens perc_func  \
7557   IT'S ALL COMIN' DOWN ON US, BOYS why is my das...     336         s   
14753  this and this and this this and this and this ...     945         s   
9469   Player and Audience Player and Audience / next...     305         s   

      judg_func actual_temp  perc_training_set sn  
7557          t          st                  0  s  
14753         f          sf                  0  s  
9469          f          sf                  0  s  


# Classify TF

In [83]:
def classify_jung_judging_function_of_text(text):
    """Does what it says, pretty much."""
    header = {"Content-Type": "application/json",
             "Authorization": "Token " + os.environ["UCLASSIFY_READ"]}
    data = {"texts":[text]} # send a one-item list for now, since we don't have a feel for sizes
    result = requests.post("https://api.uclassify.com/v1/prfekt/jungian-cognitive-function-thinking-feeling/classify",
                       json = data,
                       headers = header)
    json_result = result.json()
    
    res_dict = {"t":0, "f":0}
    
    for classItem in json_result[0]["classification"]:
        res_dict[classItem["className"]] = classItem["p"]
    
    sorted_dict = sorted(res_dict.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_dict

In [84]:
zeros = np.zeros(len(judg_eval_set))
tf_results = []
row_cnt = 1
for ix, row in judg_eval_set.iterrows():
    print("row: {} of {}".format(row_cnt, len(judg_eval_set)),end="\r")
    res = classify_jung_judging_function_of_text(row["text"])
    tf_results.append(res[0][0])
    row_cnt += 1

row: 1800 of 1800

Add judging classification results to evaluation set

In [85]:
judg_eval_set = pd.concat([judg_eval_set,
                      pd.DataFrame(tf_results, index=judg_eval_set.index)
                     ], axis=1, ignore_index=True)
judg_eval_set.columns = ["text","tokens","perc_func","judg_func","actual_temp","judg_training_set","tf"]
judg_eval_set.to_pickle("classification_results_judging_function_blogs_n10000_dataframe.pickle")
print(judg_eval_set.head(3))

                                                    text  tokens perc_func  \
22981  it is what it is About Name: Heidi Age:16 Wher...     565         s   
2606   God, Faith, & Fitness God, Faith, & Fitness Me...     924         n   
18214  big hype, big letdown ♡ i'm charlotte and i li...     101         s   

      judg_func actual_temp  judg_training_set tf  
22981         t          st                  0  t  
2606          t          nt                  0  t  
18214         t          st                  0  t  


# Evaluation of percieving classification

## Classification report percieving

In [86]:
sn_cr = classification_report(perc_eval_set['perc_func'], perc_eval_set['sn'])
print(sn_cr)

             precision    recall  f1-score   support

          n       0.87      0.86      0.87       900
          s       0.86      0.88      0.87       900

avg / total       0.87      0.87      0.87      1800



## Percieving accuracy

In [87]:
sn_accuracy = sum(perc_eval_set['perc_func']==perc_eval_set['sn'])/len(perc_eval_set)
print(sn_accuracy)

0.8672222222222222


## Percieving Kappa

In [88]:
sn_kappa = (sn_accuracy - 0.5)/0.5
print(sn_kappa)

0.7344444444444445


# Evaluation of judging classificiation

## Classification report judging

In [89]:
tf_cr = classification_report(judg_eval_set['judg_func'], judg_eval_set['tf'])
print(tf_cr)

             precision    recall  f1-score   support

          f       0.86      0.89      0.87       900
          t       0.88      0.85      0.87       900

avg / total       0.87      0.87      0.87      1800



## Judging accuracy

In [90]:
tf_accuracy = sum(judg_eval_set['judg_func']==judg_eval_set['tf'])/len(judg_eval_set)
print(tf_accuracy)

0.8705555555555555


## Judging Kappa

In [91]:
tf_kappa = (tf_accuracy - 0.5)/0.5
print(tf_kappa)

0.741111111111111


In [92]:
tf_kappa = (tf_accuracy - 0.5)/0.5
print(tf_kappa)

0.741111111111111


# Conclusion and further research

* Two dichotomic classifiers seems to improve the results tremendously. 

* Is the experiment done correctly? Peer-review the code.

* Check what words, and later, phrases are the most influential for each class. 


