In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import re
import math
from tabulate import tabulate
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', None)
# pd.set_option('display.max_rows', None)

### code to set up processing 

In [46]:
surg = r'surgery[\.,]?'
rad = r'radiation[\.,]?'
both = f'(?=.*{surg})(?=.*{rad})'

In [47]:
def count_responses(df):
    #count responses with word surgery 
    count_surgery = df['Response'].str.contains(surg, case=False, regex=True).sum()
    responses_surgery = df[df['Response'].str.contains(surg, case=False, regex=True)]
    #print("Number of responses containing 'surgery':", count_surgery)
    
    #count radiation 
    count_radiation = df['Response'].str.contains(rad, case=False, regex=True).sum()
    responses_radiation = df[df['Response'].str.contains(rad, case=False, regex=True)]
    #print("Number of responses containing 'radiation':", count_radiation)
    
    #count both
    count_both = df['Response'].str.contains(both, case=False, regex=True).sum()
    responses_both = df[df['Response'].str.contains(both, case=False, regex=True)]
    print("Number of responses containing both 'radiation' and 'surgery':", count_both)
    
    df_surgery_only = pd.merge(responses_surgery, responses_both, how="outer", indicator=True)
    df_surgery_only = df_surgery_only[df_surgery_only['_merge'] == 'left_only']
    total_surgery = df_surgery_only['Response'].count()
    print("Number of responses containing ONLY 'surgery':", total_surgery)
    
    df_radiation_only = pd.merge(responses_radiation, responses_both, how="outer", indicator=True)
    df_radiation_only = df_radiation_only[df_radiation_only['_merge'] == 'left_only']
    total_radiation = df_radiation_only['Response'].count()
    print("Number of responses containing ONLY 'radiation':", total_radiation)
    
    return total_surgery, total_radiation, responses_both

In [48]:
def check_both(df):
    dataNone = []
    radiation = 0
    surgery = 0
    for num in range(len(df)):
        row_data = df.iloc[num]
        options = row_data['Response'].split(' ')
        #print(options[0])
        
        if options[0] == "Radiation" or options[3] == "radiation" or options[4] == "radiation":
            radiation+=1
            
        elif options[0] == "Surgery" or options[0] == "Surgery," or options[3] == "surgery" or options[4] == "surgery" or options[4] == 'surgery,':
            surgery+=1
            
        else:
            dataNone.append(row_data)
        
    dfNone = pd.DataFrame(dataNone)
    return surgery, radiation, dfNone

### data/analysis from final dateset 

In [49]:
dfFinal = pd.read_csv("Doctors1000.csv")
#print(dfFinal.head())
print(dfFinal.count())

Form        2000
Response    2000
dtype: int64


In [50]:
dfFinalA = dfFinal[dfFinal['Form'] == "Form A"]
dfFinalB = dfFinal[dfFinal['Form'] == 'Form B']
print(dfFinalA.count())
print(dfFinalA.count())

Form        1000
Response    1000
dtype: int64
Form        1000
Response    1000
dtype: int64


In [51]:
print("Form A")
Ftot_surgeryA, Ftot_radiationA, df_both_A = count_responses(dfFinalA)
print("\nForm B")
Ftot_surgeryB, Ftot_radiationB, df_both_B = count_responses(dfFinalB)

Form A
Number of responses containing both 'radiation' and 'surgery': 183
Number of responses containing ONLY 'surgery': 71
Number of responses containing ONLY 'radiation': 746

Form B
Number of responses containing both 'radiation' and 'surgery': 252
Number of responses containing ONLY 'surgery': 304
Number of responses containing ONLY 'radiation': 448


### handcounting final

In [52]:
#Form A with both
both_surgA, both_radA, dfNoneA = check_both(df_both_A)
Ftot_surgeryA += both_surgA
Ftot_radiationA += both_radA
print(Ftot_surgeryA)
print(Ftot_radiationA)
print(dfNoneA.count())

#print(dfNoneA)
#after:
Ftot_surgeryA += 5
Ftot_radiationA += 7

86
902
Form        12
Response    12
dtype: int64


In [53]:
#Form B with both 
both_surgB, both_radB, dfNoneB = check_both(df_both_B)
Ftot_surgeryB += both_surgB
Ftot_radiationB += both_radB
print(Ftot_surgeryB)
print(Ftot_radiationB)
print(dfNoneB.count())

#print(dfNoneB)
#after:
Ftot_surgeryB += 35
Ftot_radiationB += 6

373
586
Form        45
Response    45
dtype: int64


In [54]:
#print(df_both_A)

In [55]:
#print(df_both_B)

### final results + hypergeometric distribution test

In [56]:
#show final results here
print("Form A results are: Surgery: ", Ftot_surgeryA, "Radiation Therapy: ", Ftot_radiationA)
print("Form B results are: Surgery: ", Ftot_surgeryB, "Radiation Therapy: ", Ftot_radiationB)

Form A results are: Surgery:  91 Radiation Therapy:  909
Form B results are: Surgery:  408 Radiation Therapy:  592


In [57]:
data = [['Question A', 91, 909], ['Question B', 408, 592]]

df_data = pd.DataFrame(data, columns=['Question Type', 'Surgery', 'Radiation Therapy']).reset_index(drop=True)
print(tabulate(df_data, headers='keys', tablefmt='fancy_outline', showindex=False))

╒═════════════════╤═══════════╤═════════════════════╕
│ Question Type   │   Surgery │   Radiation Therapy │
╞═════════════════╪═══════════╪═════════════════════╡
│ Question A      │        91 │                 909 │
│ Question B      │       408 │                 592 │
╘═════════════════╧═══════════╧═════════════════════╛


In [58]:
#compare effect of form A compared to B

N = 2000  # total number of participants (Group A + Group B)
n = 1000  # num in a sample
K = 499   # total success states
x = 91   # value we're interested in (success states in form A)

pvalue = 0
for k in range(1, x + 1):  
    pvalue += ((math.comb(K, k) * math.comb(N - K, n - k)) / math.comb(N, n))

print("pvalue is:", pvalue)

pvalue is: 7.465801561545622e-64


### data from initial dataset (uses alternate wording)

In [59]:
dfA = pd.read_csv("DoctorsA.csv")
dfB = pd.read_csv("DoctorsB.csv")
dfA.head()
dfB.head()
dfA.count()

Form        100
Response    100
dtype: int64

In [60]:
total_surgeryA, total_radiationA, responses_both_A = count_responses(dfA)
total_surgeryB, total_radiationB, responses_both_B = count_responses(dfB)

Number of responses containing both 'radiation' and 'surgery': 25
Number of responses containing ONLY 'surgery': 50
Number of responses containing ONLY 'radiation': 25
Number of responses containing both 'radiation' and 'surgery': 32
Number of responses containing ONLY 'surgery': 11
Number of responses containing ONLY 'radiation': 58


In [61]:
#hand counting the 25 responses with both in dfA
#print(responses_both)
total_surgeryA += 11
total_radiationA += 14
print('total surgery A:', total_surgeryA)
print('total radation A:', total_radiationA)

total surgery A: 61
total radation A: 39


In [62]:
#hand counting the 'both' responses, and adjusting for one misclassification:
total_surgeryB -= 1
total_radiationB -=1
total_surgeryB += 11
total_radiationB +=21
neither = 1
print('total surgery B:', total_surgeryB)
print('total radation B:', total_radiationB)

total surgery B: 21
total radation B: 78


### check responses individually and record answers with explanations that make SENSE (from initial dataset)

In [67]:
#print(dfA['Response'])
idxA = [0, 1, 13, 14, 15, 18, 26, 30, 36, 48, 52, 53, 61, 63, 64, 69, 70, 71, 72, 74, 79, 83, 84, 88, 89, 91, 93, 97, 99]
#questionable: 1, 74, 89, 99

surgeryA = 17
radiationA = 11
print("New Count! Surgery Responses: ", surgeryA, "Radiation Responses :", radiationA)

New Count! Surgery Responses:  17 Radiation Responses : 11


In [68]:
#print(dfA['Response'])
idxB = [0, 4, 5, 11, 13, 19, 20, 21, 24, 25, 29, 35, 36, 40, 42, 45, 48, 54, 63, 68, 69, 73, 84, 85, 87, 95]
#questionable: 19, 54
 
surgeryB = 4
radiationB = 22
print("New Count! Surgery Responses: ", surgeryB, "Radiation Responses :", radiationB)

New Count! Surgery Responses:  4 Radiation Responses : 22


In [65]:
data2 = [['Question A', 61, 17, 39, 11], 
           ['Question B', 21, 4, 78, 22]]

df_data = pd.DataFrame(data2, columns=['Question Type', 'Surgery', 'Adj. Surgery', 'Radiation Therapy', 'Adj. Radiation']).reset_index(drop=True)
print(tabulate(df_data, headers='keys', tablefmt='fancy_outline', showindex=False))

╒═════════════════╤═══════════╤════════════════╤═════════════════════╤══════════════════╕
│ Question Type   │   Surgery │   Adj. Surgery │   Radiation Therapy │   Adj. Radiation │
╞═════════════════╪═══════════╪════════════════╪═════════════════════╪══════════════════╡
│ Question A      │        61 │             17 │                  39 │               11 │
│ Question B      │        21 │              4 │                  78 │               22 │
╘═════════════════╧═══════════╧════════════════╧═════════════════════╧══════════════════╛
