In [1]:
import os
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Character length of each depression-patient text, keyed 'text_01' .. 'text_15'.
depression_texts_dir_path = './data_zhihu/depression_patients/'
lengths_by_label = {}
for i in range(1, 16):
    text_path = os.path.join(depression_texts_dir_path, '{:02}.txt'.format(i))
    # Decode explicitly: without `encoding`, open() falls back to the platform
    # locale, so len() of the decoded text could differ between machines.
    # NOTE(review): UTF-8 is assumed (it is also what the pd.read_csv calls on
    # the other data files default to) -- confirm against the corpus.
    with open(text_path, 'r', encoding='utf-8') as f:
        lengths_by_label['text_{:02}'.format(i)] = len(f.read())
# Bind the public name once, to its final type, so re-running the cell never
# leaves it as a half-filled dict.
depression_texts_length = pd.Series(lengths_by_label)

In [3]:
# Load the two control-group answer tables, keeping only the 'answer' column.
ans1 = pd.read_csv(
    'data_zhihu/daily_experience/你日常的一天是怎么度过的.csv',
    index_col=0,
)[['answer']]
ans2 = pd.read_csv(
    'data_zhihu/daily_experience/你的日常生活是什么样子的.csv',
    index_col=0,
)[['answer']]

# Character length of every answer in each control group.
control_1_texts_length = ans1['answer'].map(len)
control_2_texts_length = ans2['answer'].map(len)

# Relabel rows as 'text_XX', the same label format the depression lengths use.
control_1_texts_length.index = [f'text_{label:02}' for label in control_1_texts_length.index]
control_2_texts_length.index = [f'text_{label:02}' for label in control_2_texts_length.index]

In [4]:
# Convert the absolute word counts of the depression texts into two kinds of
# proportions and write each result next to a file of the same name:
#   * counts divided by the character length of the text, and
#   * counts divided by the row total of all counted words.
depression_word_counts_path = './word_counts/absolute_counts/depression'
depression_proportion_by_text_length_path = './word_counts/proportion_by_text_length/depression'
depression_proportion_by_total_words_detected_path = './word_counts/proportion_by_total_words_detected/depression'

# to_csv does not create missing parent directories; make sure they exist so
# the cell also works on a fresh checkout.
for output_dir in (depression_proportion_by_text_length_path,
                   depression_proportion_by_total_words_detected_path):
    os.makedirs(output_dir, exist_ok=True)

for file in os.listdir(depression_word_counts_path):
    if not file.endswith('.csv'):
        continue
    counts = pd.read_csv(os.path.join(depression_word_counts_path, file), index_col=0)

    # Row-wise division aligned on the index labels.
    # NOTE(review): assumes the counts index uses the same 'text_XX' labels as
    # depression_texts_length; unmatched labels would yield NaN rows -- confirm.
    proportions_by_text_length = counts.divide(depression_texts_length, axis=0)
    proportions_by_text_length.to_csv(os.path.join(depression_proportion_by_text_length_path, file))

    row_sums = counts.sum(axis=1)
    proportions_by_total_words_detected = counts.divide(row_sums, axis=0)
    proportions_by_total_words_detected.to_csv(os.path.join(depression_proportion_by_total_words_detected_path, file))

In [5]:
# Convert the absolute word counts of the control texts into two kinds of
# proportions (by text length and by total counted words) and write each
# result next to a file of the same name.
control_word_counts_path = './word_counts/absolute_counts/control'
control_proportion_by_text_length_path = './word_counts/proportion_by_text_length/control'
control_proportion_by_total_words_detected_path = './word_counts/proportion_by_total_words_detected/control'

# The character just before '.csv' selects which control group's text lengths
# are used as the divisor.
control_texts_length_by_group = {
    '1': control_1_texts_length,
    '2': control_2_texts_length,
}

# to_csv does not create missing parent directories; make sure they exist so
# the cell also works on a fresh checkout.
for output_dir in (control_proportion_by_text_length_path,
                   control_proportion_by_total_words_detected_path):
    os.makedirs(output_dir, exist_ok=True)

for file in os.listdir(control_word_counts_path):
    if not file.endswith('.csv'):
        continue
    group = file[:-4][-1:]
    if group not in control_texts_length_by_group:
        # Previously an unrecognised suffix fell through both branches and the
        # proportions computed for an earlier file were written under this
        # file's name. Fail loudly instead of emitting wrong data.
        raise ValueError(
            "Cannot tell which control group {!r} belongs to: expected the "
            "file name to end in '1.csv' or '2.csv'".format(file)
        )
    counts = pd.read_csv(os.path.join(control_word_counts_path, file), index_col=0)

    proportions_by_text_length = counts.divide(control_texts_length_by_group[group], axis=0)
    proportions_by_text_length.to_csv(os.path.join(control_proportion_by_text_length_path, file))

    row_sums = counts.sum(axis=1)
    proportions_by_total_words_detected = counts.divide(row_sums, axis=0)
    proportions_by_total_words_detected.to_csv(os.path.join(control_proportion_by_total_words_detected_path, file))

# By Text Length

In [6]:
# Column axis of the summary tables: every (word_type, text_type) pair,
# word type varying slowest.
word_types = ['4-i', '90-focuspast', '31-posemo', '32-negemo']
text_types = ['depression', 'control']
column_names = []
for wt in word_types:
    for tt in text_types:
        column_names.append([wt, tt])
df_columns = pd.DataFrame(data=column_names, columns=['word_type', 'text_type'])
columns = pd.MultiIndex.from_frame(df_columns)

# Row axis: the [0, 1] setting first, then every
# (nearest_neighbors, threshold) combination.
topns = [25, 50, 75, 100]
thresholds = [0.7, 0.75, 0.8, 0.85]
index_names = [[0, 1]]
for topn in topns:
    for thresh in thresholds:
        index_names.append([topn, thresh])
df_index = pd.DataFrame(data=index_names, columns=['nearest_neighbors', 'threshold'])
df = df_index  # same object under a second name, as in the original chained assignment
index = pd.MultiIndex.from_frame(df_index)

# Shorthand for building MultiIndex .loc keys.
idx = pd.IndexSlice

In [7]:
# Zero-filled summary table: one row per entry of index_names, one column per
# entry of column_names, labelled by the shared MultiIndex objects.
data_by_text_length = pd.DataFrame(
    data=np.zeros((len(index_names), len(column_names))),
    index=index,
    columns=columns,
)

In [8]:
# Fill data_by_text_length with the mean proportion (normalised by text
# length) of each word category, per (nearest_neighbors, threshold) setting.
#
# Files whose name starts with 'new' carry the setting in the 3rd and 4th
# '_'-separated fields of the file name; every other file is stored under the
# (0, 1) row.

# Summary column label -> column name inside the proportion CSVs.
category_csv_columns = {
    '4-i': '4',
    '90-focuspast': '90',
    '31-posemo': '31',
    '32-negemo': '32',
}

for file in os.listdir(depression_proportion_by_text_length_path):
    if not file.endswith('.csv'):
        continue  # ignore anything in the directory that is not a result table
    proportions = pd.read_csv(os.path.join(depression_proportion_by_text_length_path, file),
                              index_col=0)
    means = proportions.mean(axis=0)
    topn, thresh = 0, 1
    if file.startswith('new'):
        topn, thresh = file[:-4].split('_')[2:4]
        topn, thresh = int(topn), float(thresh)
    for word_type, csv_column in category_csv_columns.items():
        data_by_text_length.loc[idx[topn, thresh], idx[word_type, 'depression']] = means[csv_column]

# Several control files can map to the same setting (one per control group).
# Collect their column means first and assign the average afterwards, so the
# table is never accumulated in place and re-running the cell gives the same
# numbers.
control_means_by_setting = {}
for file in os.listdir(control_proportion_by_text_length_path):
    if not file.endswith('.csv'):
        continue
    proportions = pd.read_csv(os.path.join(control_proportion_by_text_length_path, file),
                              index_col=0)
    means = proportions.mean(axis=0)
    topn, thresh = 0, 1
    if file.startswith('new'):
        # Strip '.csv' before splitting, exactly as for the depression files.
        topn, thresh = file[:-4].split('_')[2:4]
        topn, thresh = int(topn), float(thresh)
    control_means_by_setting.setdefault((topn, thresh), []).append(means)

for (topn, thresh), file_means in control_means_by_setting.items():
    for word_type, csv_column in category_csv_columns.items():
        # Average over the files actually found for this setting; with the two
        # control groups present this equals the former hard-coded "/ 2".
        total = sum(m[csv_column] for m in file_means)
        data_by_text_length.loc[idx[topn, thresh], idx[word_type, 'control']] = total / len(file_means)

In [9]:
# Display the summary table of mean proportions normalised by text length:
# rows are (nearest_neighbors, threshold) settings, columns are
# (word_type, text_type) pairs.
data_by_text_length

Unnamed: 0_level_0,word_type,4-i,4-i,90-focuspast,90-focuspast,31-posemo,31-posemo,32-negemo,32-negemo
Unnamed: 0_level_1,text_type,depression,control,depression,control,depression,control,depression,control
nearest_neighbors,threshold,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
0,1.0,0.043566,0.010246,0.010195,0.01106,0.035063,0.02629,0.065617,0.027477
25,0.7,0.043566,0.010246,0.010328,0.011231,0.035455,0.026723,0.069968,0.029801
25,0.75,0.043566,0.010246,0.010195,0.01106,0.035103,0.026296,0.066498,0.028284
25,0.8,0.043566,0.010246,0.010195,0.01106,0.035063,0.026292,0.065801,0.027481
25,0.85,0.043566,0.010246,0.010195,0.01106,0.035063,0.02629,0.065617,0.027477
50,0.7,0.043566,0.010246,0.010328,0.011231,0.035465,0.026762,0.070604,0.029857
50,0.75,0.043566,0.010246,0.010195,0.01106,0.035103,0.026302,0.066651,0.028323
50,0.8,0.043566,0.010246,0.010195,0.01106,0.035063,0.026296,0.065931,0.027508
50,0.85,0.043566,0.010246,0.010195,0.01106,0.035063,0.026292,0.065617,0.027477
75,0.7,0.043566,0.010246,0.010328,0.011231,0.035967,0.02703,0.071507,0.030722


In [10]:
# Zero-filled summary table for the "proportion of all counted words"
# normalisation, laid out with the same row and column MultiIndex objects.
data_by_total_words_detected = pd.DataFrame(
    data=np.zeros((len(index_names), len(column_names))),
    index=index,
    columns=columns,
)

In [11]:
# Fill data_by_total_words_detected with the mean proportion (normalised by
# the total number of counted words) of each word category, per
# (nearest_neighbors, threshold) setting.
#
# Files whose name starts with 'new' carry the setting in the 3rd and 4th
# '_'-separated fields of the file name; every other file is stored under the
# (0, 1) row.

# Summary column label -> column name inside the proportion CSVs.
category_csv_columns = {
    '4-i': '4',
    '90-focuspast': '90',
    '31-posemo': '31',
    '32-negemo': '32',
}

for file in os.listdir(depression_proportion_by_total_words_detected_path):
    if not file.endswith('.csv'):
        continue  # ignore anything in the directory that is not a result table
    proportions = pd.read_csv(os.path.join(depression_proportion_by_total_words_detected_path, file),
                              index_col=0)
    means = proportions.mean(axis=0)
    topn, thresh = 0, 1
    if file.startswith('new'):
        topn, thresh = file[:-4].split('_')[2:4]
        topn, thresh = int(topn), float(thresh)
    for word_type, csv_column in category_csv_columns.items():
        data_by_total_words_detected.loc[idx[topn, thresh], idx[word_type, 'depression']] = means[csv_column]

# Several control files can map to the same setting (one per control group).
# Collect their column means first and assign the average afterwards, so the
# table is never accumulated in place and re-running the cell gives the same
# numbers.
control_means_by_setting = {}
for file in os.listdir(control_proportion_by_total_words_detected_path):
    if not file.endswith('.csv'):
        continue
    proportions = pd.read_csv(os.path.join(control_proportion_by_total_words_detected_path, file),
                              index_col=0)
    means = proportions.mean(axis=0)
    topn, thresh = 0, 1
    if file.startswith('new'):
        # Strip '.csv' before splitting, exactly as for the depression files.
        topn, thresh = file[:-4].split('_')[2:4]
        topn, thresh = int(topn), float(thresh)
    control_means_by_setting.setdefault((topn, thresh), []).append(means)

for (topn, thresh), file_means in control_means_by_setting.items():
    for word_type, csv_column in category_csv_columns.items():
        # Average over the files actually found for this setting; with the two
        # control groups present this equals the former hard-coded "/ 2".
        total = sum(m[csv_column] for m in file_means)
        data_by_total_words_detected.loc[idx[topn, thresh], idx[word_type, 'control']] = total / len(file_means)

In [12]:
# Display the summary table of mean proportions normalised by the total number
# of counted words: rows are (nearest_neighbors, threshold) settings, columns
# are (word_type, text_type) pairs.
data_by_total_words_detected

Unnamed: 0_level_0,word_type,4-i,4-i,90-focuspast,90-focuspast,31-posemo,31-posemo,32-negemo,32-negemo
Unnamed: 0_level_1,text_type,depression,control,depression,control,depression,control,depression,control
nearest_neighbors,threshold,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
0,1.0,0.013115,0.003252,0.003091,0.003898,0.01068,0.009224,0.019922,0.010136
25,0.7,0.0126,0.003069,0.003004,0.003808,0.010359,0.008717,0.020392,0.0101
25,0.75,0.012856,0.003137,0.003027,0.003799,0.010473,0.008758,0.01979,0.009829
25,0.8,0.013022,0.003184,0.003067,0.003842,0.010597,0.008939,0.019832,0.009925
25,0.85,0.013075,0.003215,0.003081,0.003867,0.010643,0.009059,0.019858,0.010014
50,0.7,0.012564,0.003063,0.002994,0.003803,0.010326,0.008727,0.020515,0.010105
50,0.75,0.012829,0.003135,0.00302,0.003797,0.010448,0.008749,0.019787,0.009835
50,0.8,0.013018,0.003184,0.003066,0.003842,0.010593,0.00894,0.019867,0.009934
50,0.85,0.013075,0.003215,0.003081,0.003867,0.010643,0.00906,0.019858,0.010014
75,0.7,0.012532,0.003058,0.002985,0.003799,0.010441,0.008805,0.020725,0.010331
