In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import time 

In [3]:
def make_lists(filename):
    """Load a posts CSV and return one lower-cased text string per row.

    For every row the ``title`` and ``text_context`` columns are joined with
    a single space, newline characters are turned into spaces and the result
    is lower-cased.

    Parameters
    ----------
    filename : str or path-like
        CSV file that contains ``title`` and ``text_context`` columns.

    Returns
    -------
    list of str
        One normalised string per CSV row, in file order.
    """
    df = pd.read_csv(filename)

    # Empty cells are read back as NaN (a float). Concatenating NaN with a
    # string yields NaN, and calling ``.replace`` on it raises
    # AttributeError, so a missing title/body is treated as empty text.
    titles = df['title'].fillna('').astype(str)
    bodies = df['text_context'].fillna('').astype(str)

    # Newlines become spaces (not '') so the last word of one line is not
    # glued to the first word of the next, which would distort the
    # whitespace-based word counts computed by the callers.
    return [
        text.replace('\n', ' ').lower()
        for text in titles + ' ' + bodies
    ]

## LIWC를 대체할 만한 기능을 정리해 보면,

1. word count, wps, Sixltr -> `LIWC-alike`
2. health, work, home, social, money, ingest, death, sexual, leisure, religion word의 비율
3. anxiety, anger, sadness의 정도를 분류 -> `LIWC-alike`
4. 언어학적 특성
   1) 인칭대명사 
   2) 시제

In [4]:
# LIWC-alike outputs: read each group's CSV and drop its first column.
# Order: adhd, bipolar, anxiety, nonMH, depression.
a_alike, b_alike, an_alike, n_alike, d_alike = [
    pd.read_csv(alike_path).iloc[:, 1:]
    for alike_path in (
        'preprocess_data/adhd_origin_LIWCalike.csv',
        'preprocess_data/bipolar_origin_LIWCalike.csv',
        'preprocess_data/anxiety_origin_LIWCalike.csv',
        'preprocess_data/non_mh_origin_LIWCalike.csv',
        'preprocess_data/depression_origin_LIWCalike.csv',
    )
]

In [6]:
## Personal pronouns
def pronoun_analysis(filename):
    """Compute per-post pronoun usage ratios.

    Each post returned by ``make_lists(filename)`` is split on whitespace and
    every token is compared (exact match) against four pronoun word lists.
    The number of matches in each list is divided by the post's token count.

    Parameters
    ----------
    filename : str or path-like
        CSV file forwarded to ``make_lists``.

    Returns
    -------
    pandas.DataFrame
        One row per post with the columns ``'1pers'``, ``'2pers'``,
        ``'3pers'`` and ``'nonpers'``. A post without any token gets 0.0 in
        every column instead of raising ZeroDivisionError.
    """
    content_lists = make_lists(filename)

    # Sets give the same counts as scanning the lists (the lists hold no
    # duplicates) while making each token lookup constant-time.
    pers1 = frozenset(['i', 'my', 'me', 'mine', "i'm", "i'd", "i've", "i'll"])
    # NOTE(review): 'we', 'our', 'us', ... are first-person plural forms but
    # are counted in the '2pers' column; left unchanged because downstream
    # features may rely on it — confirm the intended grouping.
    pers2 = frozenset(['you', 'u', 'yours', 'your', 'ur', 'we', 'our', 'us', 'ours',
                       "we're", "we'd", "we'll", "you'll", "you've", "you'd"])
    pers3 = frozenset(['he', 'she', 'him', 'his', 'hers', 'her', 'them', 'their', 'they',
                       'theirs', "she's", "she'd", "she'll", "he's", "he'd", "he'll",
                       "they're", "they'd", "they'll", "she've", "he've", "they've"])
    pers0 = frozenset(['it', 'that', 'this', 'its', "it's", "that's", "these", "those"])

    list1 = []  # first person
    list2 = []  # second person
    list3 = []  # third person
    list0 = []  # impersonal
    for content in content_lists:
        # Whitespace tokenisation: a pronoun with punctuation attached
        # (e.g. "me,") is a different token and is not counted.
        tokens = content.split()
        wc = len(tokens)
        if wc == 0:
            # Nothing to count; avoid dividing by zero.
            list1.append(0.0)
            list2.append(0.0)
            list3.append(0.0)
            list0.append(0.0)
            continue

        list1.append(sum(token in pers1 for token in tokens) / wc)
        list2.append(sum(token in pers2 for token in tokens) / wc)
        list3.append(sum(token in pers3 for token in tokens) / wc)
        list0.append(sum(token in pers0 for token in tokens) / wc)

    result = pd.DataFrame({
        '1pers': list1, '2pers': list2, '3pers': list3, 'nonpers': list0
    })
    return result

In [7]:
# Pronoun ratios per group.
# Order: adhd, bipolar, anxiety, schizo, nonMH, depression.
a_pron, b_pron, an_pron, s_pron, n_pron, d_pron = [
    pronoun_analysis(origin_path)
    for origin_path in (
        'preprocess_data/adhd_origin.csv',
        'preprocess_data/bipolar_origin.csv',
        'preprocess_data/anxiety_origin.csv',
        'preprocess_data/schizo_origin.csv',
        'preprocess_data/non_mh_origin.csv',
        'preprocess_data/depression_origin.csv',
    )
]

In [8]:
# Write every pronoun table to its CSV (row index omitted).
for _pron_frame, _pron_path in [
    (a_pron, 'preprocess_data/adhd_origin_pron.csv'),
    (b_pron, 'preprocess_data/bipolar_origin_pron.csv'),
    (an_pron, 'preprocess_data/anxiety_origin_pron.csv'),
    (s_pron, 'preprocess_data/schizo_origin_pron.csv'),
    (n_pron, 'preprocess_data/non_mh_origin_pron.csv'),
    (d_pron, 'preprocess_data/depression_origin_pron.csv'),
]:
    _pron_frame.to_csv(_pron_path, index=False)

In [9]:
import nltk
from nltk.tokenize import word_tokenize

In [None]:
## Tense
def time_analysis(filename):
    """Count past / present / future cues in every post.

    Each post from ``make_lists(filename)`` is tokenised with
    ``nltk.word_tokenize``. Tokens found in the keyword lists below add one
    to the matching tense, and the ``nltk.pos_tag`` labels add further hits:
    'VBP', 'VBZ' and 'VB' count as present, 'VBD' counts as past.

    Parameters
    ----------
    filename : str or path-like
        CSV file forwarded to ``make_lists``.

    Returns
    -------
    pandas.DataFrame
        One row per post with the raw (not length-normalised) counts in the
        columns ``'past'``, ``'present'`` and ``'future'``.
    """
    posts = make_lists(filename)

    past_words = ['ago', 'did', 'before', 'yesterday', 'last']
    present_words = ['present', 'now', 'today', 'this']
    future_words = ['future', 'may', 'will', 'soon', 'next', 'gonna']
    present_tags = ['VBP', 'VBZ', 'VB']

    past_counts, present_counts, future_counts = [], [], []
    for post in posts:
        tokens = nltk.word_tokenize(post)

        # Keyword hits.
        n_future = sum(1 for token in tokens if token in future_words)
        n_present = sum(1 for token in tokens if token in present_words)
        n_past = sum(1 for token in tokens if token in past_words)

        # Verb-tag hits. The future count is driven by keywords only.
        # NOTE(review): a keyword such as 'did' is counted a second time
        # here if the tagger labels it 'VBD' — confirm this is intended.
        for _, tag in nltk.pos_tag(tokens):
            if tag in present_tags:
                n_present += 1
            elif tag == 'VBD':
                n_past += 1

        future_counts.append(n_future)
        present_counts.append(n_present)
        past_counts.append(n_past)

    return pd.DataFrame({
        'past': past_counts, 'present': present_counts, 'future': future_counts
    })

In [None]:
# Tense counts per group.
# Order: adhd, bipolar, anxiety, schizo, depression, nonMH.
a_time, b_time, an_time, s_time, d_time, n_time = [
    time_analysis(origin_path)
    for origin_path in (
        'preprocess_data/adhd_origin.csv',
        'preprocess_data/bipolar_origin.csv',
        'preprocess_data/anxiety_origin.csv',
        'preprocess_data/schizo_origin.csv',
        'preprocess_data/depression_origin.csv',
        'preprocess_data/non_mh_origin.csv',
    )
]

In [None]:
# Write every tense table to its CSV (row index omitted).
for _time_frame, _time_path in [
    (a_time, 'preprocess_data/adhd_origin_time.csv'),
    (b_time, 'preprocess_data/bipolar_origin_time.csv'),
    (an_time, 'preprocess_data/anxiety_origin_time.csv'),
    (s_time, 'preprocess_data/schizo_origin_time.csv'),
    (n_time, 'preprocess_data/non_mh_origin_time.csv'),
    (d_time, 'preprocess_data/depression_origin_time.csv'),
]:
    _time_frame.to_csv(_time_path, index=False)

## 감정 분류 

1. 7 emotional categories
2. LIWC-alike에서 제공하는 `joy`, `disgust`, `surprised`, `anger`, `fear`, `sad`, `love`


In [7]:
## 7 emotional categories
def senti_classification(filename, emotion_file='7_emotions.csv'):
    """Compute per-post ratios of words belonging to seven emotion lexicons.

    Each post from ``make_lists(filename)`` is split on whitespace. For every
    emotion, the number of tokens found in that emotion's word list is
    divided by the post's token count.

    Parameters
    ----------
    filename : str or path-like
        CSV file forwarded to ``make_lists``.
    emotion_file : str or path-like, optional
        CSV with an ``emotion`` column and a ``words`` column. Defaults to
        ``'7_emotions.csv'``, the path that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per post with the columns ``'joy'``, ``'sad'``, ``'love'``,
        ``'disgust'``, ``'surprised'``, ``'anger'`` and ``'fear'``. A post
        without any token gets 0.0 in every column instead of raising
        ZeroDivisionError.
    """
    content_lists = make_lists(filename)
    e = pd.read_csv(emotion_file)
    start = time.time()

    # Output column order of the returned frame.
    emotions = ['joy', 'sad', 'love', 'disgust', 'surprised', 'anger', 'fear']

    # Build each emotion's vocabulary once. Filtering the lexicon frame is
    # loop-invariant; doing it per token (as before) repeated the same work
    # seven times for every word of every post.
    lexicon = {
        emotion: set(e.loc[e['emotion'] == emotion, 'words'])
        for emotion in emotions
    }

    ratios = {emotion: [] for emotion in emotions}
    for content in content_lists:
        tokens = content.split()
        wc = len(tokens)
        for emotion in emotions:
            vocabulary = lexicon[emotion]
            hits = sum(token in vocabulary for token in tokens)
            # Guard against posts that contain no tokens at all.
            ratios[emotion].append(hits / wc if wc else 0.0)

    result = pd.DataFrame(ratios)
    print('done!')
    print('tine: '+str(time.time()-start))
    return result

In [5]:
# Emotion-word ratios per group.
a_senti = senti_classification('preprocess_data/adhd_origin.csv') # adhd
b_senti = senti_classification('preprocess_data/bipolar_origin.csv') # bipolar
an_senti = senti_classification('preprocess_data/anxiety_origin.csv') # anxiety
s_senti = senti_classification('preprocess_data/schizo_origin.csv') # schizo
n_senti = senti_classification('preprocess_data/non_mh_origin.csv') # nonMH
# The depression group was never classified, yet the save cell writes
# `d_senti` to disk; compute it here so that cell does not hit a NameError.
d_senti = senti_classification('preprocess_data/depression_origin.csv') # depression

0 th
1 th
2 th
3 th
4 th
5 th
6 th
7 th
8 th
9 th
10 th
11 th
12 th
13 th
14 th
15 th
16 th
17 th
18 th
19 th
20 th
21 th
22 th
23 th
24 th
25 th
26 th
27 th
28 th
29 th
30 th
31 th
32 th
33 th
34 th
35 th
36 th
37 th
38 th
39 th
40 th
41 th
42 th
43 th
44 th
45 th
46 th
47 th
48 th
49 th
50 th
51 th
52 th
53 th
54 th
55 th
56 th
57 th
58 th
59 th
60 th
61 th
62 th
63 th
64 th
65 th
66 th
67 th
68 th
69 th
70 th
71 th
72 th
73 th
74 th
75 th
76 th
77 th
78 th
79 th
80 th
81 th
82 th
83 th
84 th
85 th
86 th
87 th
88 th
89 th
90 th
91 th
92 th
93 th
94 th
95 th
96 th
97 th
98 th
99 th
100 th
101 th
102 th
103 th
104 th
105 th
106 th
107 th
108 th
109 th
110 th
111 th
112 th
113 th
114 th
115 th
116 th
117 th
118 th
119 th
120 th
121 th
122 th
123 th
124 th
125 th
126 th
127 th
128 th
129 th
130 th
131 th
132 th
133 th
134 th
135 th
136 th
137 th
138 th
139 th
140 th
141 th
142 th
143 th
144 th
145 th
146 th
147 th
148 th
149 th
150 th
151 th
152 th
153 th
154 th
155 th
156 th
157 th
158 

In [6]:
# Write every emotion-ratio table to its CSV (row index omitted).
# Each *_senti frame must already have been computed by
# senti_classification before this cell runs.
a_senti.to_csv('preprocess_data/adhd_origin_senti.csv', index = False)
b_senti.to_csv('preprocess_data/bipolar_origin_senti.csv', index = False)
an_senti.to_csv('preprocess_data/anxiety_origin_senti.csv', index = False)
s_senti.to_csv('preprocess_data/schizo_origin_senti.csv', index = False)
d_senti.to_csv('preprocess_data/depression_origin_senti.csv', index = False)
n_senti.to_csv('preprocess_data/non_mh_origin_senti.csv', index = False)