In [1]:
import pandas as pd
import os
import requests
import numpy as np
import operator
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np

In [2]:
!pwd

/Users/mos/Dropbox/memeticscience/typealyzer-dataset/notebooks/jungian_classification


The pickle-file used below is available [via Open Science Framework](https://osf.io/gyrc7/)

In [3]:
df_pickle_path = "../../pickles/dataframe_survey_2018-01-23_enriched.pickle"

In [4]:
indata = pd.read_pickle(df_pickle_path)
indata[["actual","actual_temp","is_s"]].head(5)

Unnamed: 0,actual,actual_temp,is_s
1,INFJ,nf,0
2,INFP,nf,0
3,INTP,nt,0
5,ENFJ,nf,0
10,INFP,nf,0


In [5]:
indata.domain.value_counts()

tumblr       21938
blogspot       513
wordpress      468
Name: domain, dtype: int64

In [6]:
indata.lang.value_counts().head()

en    22588
fr       50
da       34
de       25
no       23
Name: lang, dtype: int64

# Filter out English texts only

In [7]:
len(indata)

22919

In [8]:
indata = indata[indata.lang == "en"]

In [9]:
len(indata)

22588

# Add columns percieving and judging for evaluation of s/n and t/f classifiers

In [10]:
indata["perc_func"] = indata.actual_temp.str.extract("(\w)\w", expand=False)
indata["judg_func"] = indata.actual_temp.str.extract("\w(\w)", expand=False)

In [11]:
len(indata[pd.isnull(indata["tokens"])])

0

In [12]:
indata[pd.isnull(indata["tokens"])]

Unnamed: 0,url,typealyzer,actual,e,s,t,sntf_s,sntf_n,sntf_t,sntf_f,...,cogmech,auxverb,they,incl,money,feel,we,hear,perc_func,judg_func


# Inspect original data function distributions

In [37]:
s_series = indata[indata.perc_func == "s"]["tokens"]
n_series = indata[indata.perc_func == "n"]["tokens"]
t_series = indata[indata.judg_func == "t"]["tokens"]
f_series = indata[indata.judg_func == "f"]["tokens"]

avg_tkns = {
    "s":s_series.mean(),
    "n":n_series.mean(),
    "t":t_series.mean(),
    "f":f_series.mean()
}
avg_tkns

{'f': 488.2908894968084,
 'n': 511.4496560721063,
 's': 457.63189127105665,
 't': 511.27211970074814}

In [38]:
indata.perc_func.value_counts()

n    16864
s     5224
Name: perc_func, dtype: int64

In [39]:
indata.judg_func.value_counts()

f    12063
t    10025
Name: judg_func, dtype: int64

# Sample equal size text chunks for training and evaluation data
See: [Pandas sample()](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sample.html)

## Percieving function

In [40]:
# We have 5224 cases in the smallest class s 
perc_samples = pd.concat([
            indata[indata.perc_func == "s"].sample(3000, random_state=123456)[["text","tokens","perc_func","judg_func","actual_temp"]],
            indata[indata.perc_func == "n"].sample(3000, random_state=123456)[["text","tokens","perc_func","judg_func","actual_temp"]]
            ])

In [41]:
len(perc_samples)

6000

In [42]:
perc_samples.perc_func.value_counts()

s    3000
n    3000
Name: perc_func, dtype: int64

In [43]:
perc_samples.judg_func.value_counts()

f    3363
t    2637
Name: judg_func, dtype: int64

In [44]:
perc_samples.to_pickle("jung_percieving_functions_samples_blogs_totn6000.pickle")

In [45]:
perc_samples = pd.read_pickle("jung_percieving_functions_samples_blogs_totn6000.pickle")

## Judging function

In [46]:
# We have 10025 cases in the smallest class t
judg_samples = pd.concat([
            indata[indata.judg_func == "t"].sample(3000, random_state=123456)[["text","tokens","perc_func","judg_func","actual_temp"]],
            indata[indata.judg_func == "f"].sample(3000, random_state=123456)[["text","tokens","perc_func","judg_func","actual_temp"]]
            ])

In [47]:
len(judg_samples)

6000

In [48]:
judg_samples.judg_func.value_counts()

f    3000
t    3000
Name: judg_func, dtype: int64

In [49]:
judg_samples.perc_func.value_counts()

n    4598
s    1402
Name: perc_func, dtype: int64

In [50]:
judg_samples.to_pickle("jung_judging_functions_samples_blogs_totn6000.pickle")

In [51]:
judg_samples = pd.read_pickle("jung_judging_functions_samples_blogs_totn6000.pickle")

# Split percieving samples into train and eval subsets.

In [52]:
len(perc_samples)

6000

In [53]:
# prepare column to keep track of what's been used for training
zeros = np.zeros(len(perc_samples))
perc_samples["perc_training_set"] = zeros
perc_samples["perc_training_set"] = perc_samples.perc_training_set.astype("int")
perc_samples.head(3)

Unnamed: 0,text,tokens,perc_func,judg_func,actual_temp,perc_training_set
8623,Sonny Jooooooooon INDEX ASK PAST THEME Sonny J...,386,s,f,sf,0
11987,Log in | Tumblr Sign up Terms Privacy Posted b...,52,s,t,st,0
5340,a thing of blood © hi im logan and i love the ...,440,s,f,sf,0


In [54]:
sn_traing_set_size = 2100 # e.g. 2100 is 70% of 3000 samples
perc_s_train = perc_samples[perc_samples.perc_func == "s"].sample(sn_traing_set_size).index
perc_n_train = perc_samples[perc_samples.perc_func == "n"].sample(sn_traing_set_size).index

perc_train = perc_s_train.union(perc_n_train)

perc_samples.loc[perc_train, "perc_training_set"] = 1
perc_samples.head(3)

Unnamed: 0,text,tokens,perc_func,judg_func,actual_temp,perc_training_set
8623,Sonny Jooooooooon INDEX ASK PAST THEME Sonny J...,386,s,f,sf,0
11987,Log in | Tumblr Sign up Terms Privacy Posted b...,52,s,t,st,1
5340,a thing of blood © hi im logan and i love the ...,440,s,f,sf,1


In [55]:
len(perc_samples[perc_samples.perc_training_set == 1])

4200

In [56]:
# Separate evaluation DataFrame
perc_eval_set = perc_samples[perc_samples.perc_training_set == 0]
perc_eval_set.head(3)

Unnamed: 0,text,tokens,perc_func,judg_func,actual_temp,perc_training_set
8623,Sonny Jooooooooon INDEX ASK PAST THEME Sonny J...,386,s,f,sf,0
20508,void of lights void of lights Search Ask me an...,1922,s,t,st,0
4604,*tamp tamp* *tamp tamp* Index : Ask : submit :...,543,s,t,st,0


In [57]:
len(perc_eval_set)

1800

# Split judging samples into train and eval subsets.

In [58]:
# prepare column to keep track of what's been used for training
zeros = np.zeros(len(judg_samples))
judg_samples["judg_training_set"] = zeros
judg_samples["judg_training_set"] = judg_samples.judg_training_set.astype("int")
judg_samples.head(3)

Unnamed: 0,text,tokens,perc_func,judg_func,actual_temp,judg_training_set
22981,it is what it is About Name: Heidi Age:16 Wher...,565,s,t,st,0
24378,https://www.tumblr.com/themes/by/leentheme htt...,582,n,t,nt,0
5187,three things cannot be long hidden © three thi...,516,n,t,nt,0


In [59]:
# Separate evaluation DataFrame
judg_eval_set = judg_samples[judg_samples.judg_training_set == 0]
judg_eval_set.head(3)

Unnamed: 0,text,tokens,perc_func,judg_func,actual_temp,judg_training_set
22981,it is what it is About Name: Heidi Age:16 Wher...,565,s,t,st,0
24378,https://www.tumblr.com/themes/by/leentheme htt...,582,n,t,nt,0
5187,three things cannot be long hidden © three thi...,516,n,t,nt,0


# Classify percieving function 

In [62]:
def classify_jung_percieving_function_of_text(text):
    """Does what it says, pretty much."""
    header = {"Content-Type": "application/json",
             "Authorization": "Token " + os.environ["UCLASSIFY_READ"]}
    data = {"texts":[text]} # send a one-item list for now, since we don't have a feel for sizes
    result = requests.post("https://api.uclassify.com/v1/prfekt/jungian-cognitive-function-sensing-intuition/classify",
                       json = data,
                       headers = header)
    json_result = result.json()
    
    res_dict = {"s":0, "n":0}
    
    for classItem in json_result[0]["classification"]:
        res_dict[classItem["className"]] = classItem["p"]
    
    sorted_dict = sorted(res_dict.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_dict

In [63]:
zeros = np.zeros(len(perc_eval_set))
sn_results = []
row_cnt = 1
for ix, row in perc_eval_set.iterrows():
    print("row: {} of {}".format(row_cnt, len(perc_eval_set)),end="\r")
    res = classify_jung_percieving_function_of_text(row["text"])
    sn_results.append(res[0][0])
    row_cnt += 1

row: 1800 of 1800

In [64]:
len(sn_results)

1800

Add the percieving classification results to the evaulation dataset 

In [65]:
perc_eval_set = pd.concat([perc_eval_set,
                      pd.DataFrame(sn_results, index=perc_eval_set.index)
                     ], axis=1, ignore_index=True)
perc_eval_set.columns = ["text","tokens","perc_func","judg_func","actual_temp","perc_training_set","sn"]
perc_eval_set.to_pickle("classification_results_percieving_function_blogs_n5000_dataframe.pickle")
print(perc_eval_set.head(3))

                                                    text  tokens perc_func  \
8623   Sonny Jooooooooon INDEX ASK PAST THEME Sonny J...     386         s   
20508  void of lights void of lights Search Ask me an...    1922         s   
4604   *tamp tamp* *tamp tamp* Index : Ask : submit :...     543         s   

      judg_func actual_temp  perc_training_set sn  
8623          f          sf                  0  s  
20508         t          st                  0  n  
4604          t          st                  0  s  


# Classify TF

In [68]:
def classify_jung_judging_function_of_text(text):
    """Does what it says, pretty much."""
    header = {"Content-Type": "application/json",
             "Authorization": "Token " + os.environ["UCLASSIFY_READ"]}
    data = {"texts":[text]} # send a one-item list for now, since we don't have a feel for sizes
    result = requests.post("https://api.uclassify.com/v1/prfekt/jungian-cognitive-function-thinking-feeling/classify",
                       json = data,
                       headers = header)
    json_result = result.json()
    
    res_dict = {"t":0, "f":0}
    
    for classItem in json_result[0]["classification"]:
        res_dict[classItem["className"]] = classItem["p"]
    
    sorted_dict = sorted(res_dict.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_dict

In [69]:
zeros = np.zeros(len(judg_eval_set))
tf_results = []
row_cnt = 1
for ix, row in judg_eval_set.iterrows():
    print("row: {} of {}".format(row_cnt, len(judg_eval_set)),end="\r")
    res = classify_jung_judging_function_of_text(row["text"])
    tf_results.append(res[0][0])
    row_cnt += 1

row: 2358 of 6000

ConnectionError: HTTPSConnectionPool(host='api.uclassify.com', port=443): Max retries exceeded with url: /v1/prfekt/jungian-cognitive-function-thinking-feeling/classify (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x1207f5438>: Failed to establish a new connection: [Errno 60] Operation timed out',))

Add judging classification results to evaluation set

In [None]:
judg_eval_set = pd.concat([judg_eval_set,
                      pd.DataFrame(tf_results, index=judg_eval_set.index)
                     ], axis=1, ignore_index=True)
judg_eval_set.columns = ["text","tokens","perc_func","judg_func","actual_temp","judg_training_set","tf"]
judg_eval_set.to_pickle("classification_results_judging_function_blogs_n10000_dataframe.pickle")
print(judg_eval_set.head(3))

# Evaluation of percieving classification

## Classification report percieving

In [None]:
sn_cr = classification_report(perc_eval_set['perc_func'], perc_eval_set['sn'])
print(sn_cr)

## Percieving accuracy

In [None]:
sn_accuracy = sum(perc_eval_set['perc_func']==perc_eval_set['sn'])/len(perc_eval_set)
print(sn_accuracy)

## Percieving Kappa

In [None]:
sn_kappa = (sn_accuracy - 0.5)/0.5
print(sn_kappa)

# Evaluation of judging classificiation

## Classification report judging

In [None]:
tf_cr = classification_report(judg_eval_set['judg_func'], judg_eval_set['tf'])
print(tf_cr)

## Judging accuracy

In [None]:
tf_accuracy = sum(judg_eval_set['judg_func']==judg_eval_set['tf'])/len(judg_eval_set)
print(tf_accuracy)

## Judging Kappa

In [None]:
tf_kappa = (tf_accuracy - 0.5)/0.5
print(tf_kappa)

In [None]:
tf_kappa = (tf_accuracy - 0.5)/0.5
print(tf_kappa)

# Conclusion and further research

* Two dichotomic classifiers seems to improve the results tremendously. 

* Is the experiment done correctly? Peer-review the code.

* Check what words, and later, phrases are the most influential for each class. 


