In [1]:
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu
import matplotlib.pyplot as plt
%matplotlib inline
colors = ['lightblue', 'pink']

In [2]:
df_phase1  = pd.read_csv('phase1_lect_ata_new_df.csv')
df_phase2  = pd.read_csv('phase2_lect_ata_new_df.csv')
df_sent  = pd.read_csv('all_w_sent.csv',encoding = "ISO-8859-1")
df_liwc  = pd.read_csv('all_w_liwc.csv',encoding = "ISO-8859-1")
feature_keys = ['total words','total uniq', 'total wpt','D uniq','D words',\
                'D wpt','D pos_avg','P uniq','P words','P wpt','P pos_avg']
liwc_features = ['Achievement','Affective','Anger','Anxiety','Biological','Body','Causation','Certainty','Cognitive',\
                 'Discrepancy','Family','Feeling','Fillers','Friends','Future tense','Health','Hearing','Home',\
                 'Humans','Insight','Leisure','Money','Negative Emotion','Positive emotion','Religion','Sadness',\
                 'Sexual','Social','Swear','Time','death@Death']

def get_male_female_files(p_male,d_male):
	phase1_files = df_phase1[(df_phase1['isPatientMale']==p_male)&(df_phase1['isDoctorMale']==d_male)]['Filename'].tolist()
	phase2_files = df_phase2[(df_phase2['isPatientMale']==p_male)&(df_phase2['isDoctorMale']==d_male)]['Filename'].tolist()
	return phase1_files+phase2_files

male_doctor_file_names = get_male_female_files(0,1)+ get_male_female_files(1,1)
female_doctor_file_names = get_male_female_files(0,0)+ get_male_female_files(1,0)
male_patient_file_names = get_male_female_files(1,0)+ get_male_female_files(1,1)
female_patient_file_names = get_male_female_files(0,0)+ get_male_female_files(0,1)

maleD_femaleP_file_names = get_male_female_files(0,1)
maleD_maleP_file_names = get_male_female_files(1,1)
femaleD_femaleP_file_names = get_male_female_files(0,0)
femaleD_maleP_file_names = get_male_female_files(1,0)

print('Male Doctor conversation N = ',len(male_doctor_file_names))
print('Female Doctor conversation N = ',len(female_doctor_file_names))
print('Male Patients conversation N = ',len(male_patient_file_names))
print('Female Patients N = ',len(female_patient_file_names))
print('Male Doctor Female Patient N = ',len(maleD_femaleP_file_names))
print('Male Doctor Male Patient N = ',len(maleD_maleP_file_names))
print('Female Doctor Female Patient N = ',len(femaleD_femaleP_file_names))
print('Female Doctor Male Patient N = ',len(femaleD_maleP_file_names))

df = pd.concat([df_phase1,df_phase2])
docgender = df.groupby('p0_physician_id').mean()['isDoctorMale']
print('unique male doctors N = ',docgender.sum())
discordance_dict = dict()
discordance = pd.concat([df_phase1[['Filename','discDmind3']].dropna(),df_phase2[['Filename','discDmind3']].dropna()])
discordance_dict = discordance.set_index('Filename').T.to_dict('list')

rating_dict = dict()
rating = pd.concat([df_phase1[['Filename','cumulative rating']].dropna(),df_phase2[['Filename','cumulative rating']].dropna()])
rating_dict = rating.set_index('Filename').T.to_dict('list')

discordance[discordance['Filename'].isin(male_doctor_file_names)]['discDmind3'].mean()

Male Doctor conversation N =  255
Female Doctor conversation N =  127
Male Patients conversation N =  172
Female Patients N =  210
Male Doctor Female Patient N =  122
Male Doctor Male Patient N =  133
Female Doctor Female Patient N =  88
Female Doctor Male Patient N =  39
unique male doctors N =  25.0


2.7848101265822787

<h2> All doctor high and low discordance comparison</h2>

In [45]:
df_male = df
df_male_high = df_male[df_male['discDmind3'] >= 3]
df_male_low = df_male[df_male['discDmind3'] < 3]
result = dict()
c = ((len(male_doctor_file_names)/2)**2)
result['Feature'] = []
result['p'] = []
result['d'] = []
result['High mean'] = []
result['Low mean'] = []
for feat in feature_keys:
    U, p = mannwhitneyu(df_male_high[feat],df_male_low[feat])
    d = 2*U/c -1
    result['Feature'].append(feat)
    result['p'].append(p)
    result['d'].append(d)
    result['High mean'].append(df_male_high[feat].mean())
    result['Low mean'].append(df_male_low[feat].mean())
df_liwc_male = df_liwc
df_liwc_male_high = df_liwc_male[df_liwc_male['Filename'].isin(df_male_high['Filename'].tolist())].groupby(['Filename']).sum()
df_liwc_male_low = df_liwc_male[df_liwc_male['Filename'].isin(df_male_low['Filename'].tolist())].groupby(['Filename']).sum()
for feat in liwc_features:
    U, p = mannwhitneyu(df_liwc_male_high[feat]/df_liwc_male_high['n_words'],df_liwc_male_low[feat]/df_liwc_male_low['n_words'])
    d = (2*U)/c -1
    result['Feature'].append(feat)
    result['p'].append(p)
    result['d'].append(d)
    result['High mean'].append((df_liwc_male_high[feat]/df_liwc_male_high['n_words']).mean())
    result['Low mean'].append((df_liwc_male_low[feat]/df_liwc_male_low['n_words']).mean())
result = pd.DataFrame(result)
result = result.sort_values(['p'])
result[result['p']<0.05]


Unnamed: 0,Feature,High mean,Low mean,d,p
5,D wpt,17.318536,15.913512,0.642199,0.01768
36,Sadness,0.001682,0.001861,0.678247,0.036279
31,Leisure,0.002522,0.002752,0.693318,0.047826


<h2> All doctor high and low rating comparison</h2>


In [46]:
df_male = df
df_male_high = df_male[df_male['cumulative rating'] >= 45]
df_male_low = df_male[df_male['cumulative rating'] < 45]
result = dict()
c = ((len(male_doctor_file_names)/2)**2)
result['Feature'] = []
result['p'] = []
result['d'] = []
result['High mean'] = []
result['Low mean'] = []
for feat in feature_keys:
    U, p = mannwhitneyu(df_male_high[feat],df_male_low[feat])
    d = 2*U/c -1
    result['Feature'].append(feat)
    result['p'].append(p)
    result['d'].append(d)
    result['High mean'].append(df_male_high[feat].mean())
    result['Low mean'].append(df_male_low[feat].mean())
df_liwc_male = df_liwc
df_liwc_male_high = df_liwc_male[df_liwc_male['Filename'].isin(df_male_high['Filename'].tolist())].groupby(['Filename']).sum()
df_liwc_male_low = df_liwc_male[df_liwc_male['Filename'].isin(df_male_low['Filename'].tolist())].groupby(['Filename']).sum()
for feat in liwc_features:
    U, p = mannwhitneyu(df_liwc_male_high[feat]/df_liwc_male_high['n_words'],df_liwc_male_low[feat]/df_liwc_male_low['n_words'])
    d = (2*U)/c -1
    result['Feature'].append(feat)
    result['p'].append(p)
    result['d'].append(d)
    result['High mean'].append((df_liwc_male_high[feat]/df_liwc_male_high['n_words']).mean())
    result['Low mean'].append((df_liwc_male_low[feat]/df_liwc_male_low['n_words']).mean())
result = pd.DataFrame(result)
result = result.sort_values(['p'])
result[result['p']<0.05]


Unnamed: 0,Feature,High mean,Low mean,d,p
41,death@Death,0.00025,0.000226,1.024268,0.031827
40,Time,0.04524,0.046399,1.014671,0.043442
37,Sexual,0.000414,0.000352,1.033864,0.045762


<h2>Male Docotors high and low discordance comparison</h2>

In [40]:
df_male = df[df['Filename'].isin(male_doctor_file_names)]
df_male_high = df_male[df_male['discDmind3'] >= 3]
df_male_low = df_male[df_male['discDmind3'] < 3]
result = dict()
c = ((len(male_doctor_file_names)/2)**2)
result['Feature'] = []
result['p'] = []
result['d'] = []
result['High mean'] = []
result['Low mean'] = []
for feat in feature_keys:
    U, p = mannwhitneyu(df_male_high[feat],df_male_low[feat])
    d = 2*U/c -1
    result['Feature'].append(feat)
    result['p'].append(p)
    result['d'].append(d)
    result['High mean'].append(df_male_high[feat].mean())
    result['Low mean'].append(df_male_low[feat].mean())
df_liwc_male = df_liwc[df_liwc['Filename'].isin(male_doctor_file_names)]
df_liwc_male_high = df_liwc_male[df_liwc_male['Filename'].isin(df_male_high['Filename'].tolist())].groupby(['Filename']).sum()
df_liwc_male_low = df_liwc_male[df_liwc_male['Filename'].isin(df_male_low['Filename'].tolist())].groupby(['Filename']).sum()
for feat in liwc_features:
    U, p = mannwhitneyu(df_liwc_male_high[feat]/df_liwc_male_high['n_words'],df_liwc_male_low[feat]/df_liwc_male_low['n_words'])
    d = (2*U)/c -1
    result['Feature'].append(feat)
    result['p'].append(p)
    result['d'].append(d)
    result['High mean'].append((df_liwc_male_high[feat]/df_liwc_male_high['n_words']).mean())
    result['Low mean'].append((df_liwc_male_low[feat]/df_liwc_male_low['n_words']).mean())
result = pd.DataFrame(result)
result = result.sort_values(['p'])
result[result['p']<0.05]


Unnamed: 0,Feature,High mean,Low mean,d,p
9,P wpt,8.610164,9.535247,-0.282122,0.012676
7,P uniq,273.54918,307.973913,-0.269942,0.020261
10,P pos_avg,0.302169,0.281109,-0.264283,0.024933
41,death@Death,0.000327,0.000212,-0.246567,0.028717
36,Sadness,0.001718,0.001957,-0.259116,0.029929
8,P words,879.139344,1029.478261,-0.256286,0.033038
31,Leisure,0.002553,0.00283,-0.250012,0.040822
34,Positive emotion,0.021732,0.020325,-0.249335,0.041745
39,Swear,0.000163,0.000184,-0.221961,0.047914


<h2>Female Docotors high and low discordance comparison</h2>

In [38]:
df_male = df[df['Filename'].isin(female_doctor_file_names)]
df_male_high = df_male[df_male['discDmind3'] >= 3]
df_male_low = df_male[df_male['discDmind3'] < 3]
result = dict()
c = ((len(male_doctor_file_names)/2)**2)
result['Feature'] = []
result['p'] = []
result['d'] = []
result['High mean'] = []
result['Low mean'] = []
for feat in feature_keys:
    U, p = mannwhitneyu(df_male_high[feat],df_male_low[feat])
    d = 2*U/c -1
    result['Feature'].append(feat)
    result['p'].append(p)
    result['d'].append(d)
    result['High mean'].append(df_male_high[feat].mean())
    result['Low mean'].append(df_male_low[feat].mean())
df_liwc_male = df_liwc[df_liwc['Filename'].isin(female_doctor_file_names)]
df_liwc_male_high = df_liwc_male[df_liwc_male['Filename'].isin(df_male_high['Filename'].tolist())].groupby(['Filename']).sum()
df_liwc_male_low = df_liwc_male[df_liwc_male['Filename'].isin(df_male_low['Filename'].tolist())].groupby(['Filename']).sum()
for feat in liwc_features:
    U, p = mannwhitneyu(df_liwc_male_high[feat]/df_liwc_male_high['n_words'],df_liwc_male_low[feat]/df_liwc_male_low['n_words'])
    d = (2*U)/c -1
    result['Feature'].append(feat)
    result['p'].append(p)
    result['d'].append(d)
    result['High mean'].append((df_liwc_male_high[feat]/df_liwc_male_high['n_words']).mean())
    result['Low mean'].append((df_liwc_male_low[feat]/df_liwc_male_low['n_words']).mean())
result = pd.DataFrame(result)
result = result.sort_values(['p'])
result[result['p']<0.05]


Unnamed: 0,Feature,High mean,Low mean,d,p
30,Insight,0.019373,0.017861,-0.850273,0.014473
6,D pos_avg,0.210594,0.231595,-0.843383,0.03113
4,D words,2564.765625,2234.02,-0.84283,0.032974
25,Future tense,0.015324,0.016657,-0.839323,0.046869


<h2>Male Docotors high and low Ratings comparison</h2>

In [41]:
df_male = df[df['Filename'].isin(male_doctor_file_names)]
df_male_high = df_male[df_male['cumulative rating'] >= 45]
df_male_low = df_male[df_male['cumulative rating'] < 45]
result = dict()
c = ((len(male_doctor_file_names)/2)**2)
result['Feature'] = []
result['p'] = []
result['d'] = []
result['High mean'] = []
result['Low mean'] = []
for feat in feature_keys:
    U, p = mannwhitneyu(df_male_high[feat],df_male_low[feat])
    d = 2*U/c -1
    result['Feature'].append(feat)
    result['p'].append(p)
    result['d'].append(d)
    result['High mean'].append(df_male_high[feat].mean())
    result['Low mean'].append(df_male_low[feat].mean())
df_liwc_male = df_liwc[df_liwc['Filename'].isin(male_doctor_file_names)]
df_liwc_male_high = df_liwc_male[df_liwc_male['Filename'].isin(df_male_high['Filename'].tolist())].groupby(['Filename']).sum()
df_liwc_male_low = df_liwc_male[df_liwc_male['Filename'].isin(df_male_low['Filename'].tolist())].groupby(['Filename']).sum()
for feat in liwc_features:
    U, p = mannwhitneyu(df_liwc_male_high[feat]/df_liwc_male_high['n_words'],df_liwc_male_low[feat]/df_liwc_male_low['n_words'])
    d = (2*U)/c -1
    result['Feature'].append(feat)
    result['p'].append(p)
    result['d'].append(d)
    result['High mean'].append((df_liwc_male_high[feat]/df_liwc_male_high['n_words']).mean())
    result['Low mean'].append((df_liwc_male_low[feat]/df_liwc_male_low['n_words']).mean())
result = pd.DataFrame(result)
result = result.sort_values(['p'])
result[result['p']<0.05]

Unnamed: 0,Feature,High mean,Low mean,d,p
25,Future tense,0.015954,0.014973,-0.139408,0.028757
37,Sexual,0.000415,0.000332,-0.118985,0.036715


<h2>Female Docotors high and low Ratings comparison</h2>

In [43]:
df_male = df[df['Filename'].isin(female_doctor_file_names)]
df_male_high = df_male[df_male['cumulative rating'] >= 45]
df_male_low = df_male[df_male['cumulative rating'] < 45]
result = dict()
c = ((len(male_doctor_file_names)/2)**2)
result['Feature'] = []
result['p'] = []
result['d'] = []
result['High mean'] = []
result['Low mean'] = []
for feat in feature_keys:
    U, p = mannwhitneyu(df_male_high[feat],df_male_low[feat])
    d = 2*U/c -1
    result['Feature'].append(feat)
    result['p'].append(p)
    result['d'].append(d)
    result['High mean'].append(df_male_high[feat].mean())
    result['Low mean'].append(df_male_low[feat].mean())
df_liwc_male = df_liwc[df_liwc['Filename'].isin(female_doctor_file_names)]
df_liwc_male_high = df_liwc_male[df_liwc_male['Filename'].isin(df_male_high['Filename'].tolist())].groupby(['Filename']).sum()
df_liwc_male_low = df_liwc_male[df_liwc_male['Filename'].isin(df_male_low['Filename'].tolist())].groupby(['Filename']).sum()
for feat in liwc_features:
    U, p = mannwhitneyu(df_liwc_male_high[feat]/df_liwc_male_high['n_words'],df_liwc_male_low[feat]/df_liwc_male_low['n_words'])
    d = (2*U)/c -1
    result['Feature'].append(feat)
    result['p'].append(p)
    result['d'].append(d)
    result['High mean'].append((df_liwc_male_high[feat]/df_liwc_male_high['n_words']).mean())
    result['Low mean'].append((df_liwc_male_low[feat]/df_liwc_male_low['n_words']).mean())
result = pd.DataFrame(result)
result = result.sort_values(['p'])
result[result['p']<0.05]


Unnamed: 0,Feature,High mean,Low mean,d,p
40,Time,0.044105,0.046378,-0.804998,0.018953
22,Feeling,0.004185,0.00357,-0.798724,0.033629
