In [1]:
import os
import pickle
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
import matplotlib.pyplot as plt
%matplotlib inline


from IPython.display import display
from scipy import stats
from scipy.stats import norm
from IPython.display import display
from sklearn.feature_selection import chi2

In [2]:
# Apply seaborn's default plot styling for the rest of the notebook.
sns.set()

# Widen pandas' display limits so the wide survey tables are not truncated.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [3]:
conda install -c conda-forge/label/cf201901 missingno

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


#### Load data

In [4]:
# Name of the Excel workbook holding the survey responses; the path is relative
# to the notebook's working directory.
INPUT_FILE_NAME: str = "Pre-Post Survey Responses.xlsx"

In [5]:
# Open the workbook once so both sheets can be read from the same handle.
# The path comes from INPUT_FILE_NAME so the file name lives in one place.
data = pd.ExcelFile(INPUT_FILE_NAME)

In [6]:
# df1: responses from the 'Pre int' sheet of the workbook.
df1 = pd.read_excel(data, sheet_name="Pre int")

In [7]:
# df2: responses from the 'Post int' sheet of the workbook.
df2 = pd.read_excel(data, sheet_name="Post int")

In [8]:
# list(df2.columns.values)

In [9]:
# print(df2.info())

In [10]:
# Columns of df2 whose missing answers are replaced by the column mean.
MEAN_IMPUTED_COLUMNS = [
    'D5', 'D6',
    'M4', 'M5', 'M6',
    'PS1', 'PS2', 'PS3', 'PS4',
    'B1', 'B2', 'B3', 'B4', 'B5',
]

# Work on a copy: a plain assignment would only alias df2, so the frame read
# from Excel would be modified in place.
updated_df2 = df2.copy()

# Fill each listed column's NaNs with that column's own mean (the mean is
# computed before filling, over the non-missing values).
for column in MEAN_IMPUTED_COLUMNS:
    updated_df2[column] = updated_df2[column].fillna(updated_df2[column].mean())
# updated_df2.info()

In [11]:
# Rebind df2 to the mean-imputed frame; every later df2 cell uses these values.
df2 = pd.DataFrame(data=updated_df2)

In [12]:
# print(df2.info())

In [13]:
# Survey columns grouped by purpose. Each list under "SRH" names the item
# columns that are summed into that domain's score column.
column_categorization = {
    "SRH": {
        "nutrition": ["N1", "N2", "N3", "N4", "N5"],
        "parental_connection": ["P1", "P2", "P3", "P4"],
        # D5 / D6 are not listed here: one of them is added at scoring time,
        # chosen by marital status.
        "decision_making": ["D1", "D2", "D3", "D4"],
        "sp_harassment": ["S1", "S2", "S3", "S4", "S5"],
        "mhm": ["M1", "M2", "M3", "M4", "M5", "M6"],
        "pi_sa": ["PS1", "PS2", "PS3", "PS4"],
        "body_image": ["B1", "B2", "B3", "B4", "B5"],
    },
    # NOTE(review): the scoring cells read a column named "Marital Status";
    # confirm that 'Marital_status' (and the other names here) match the
    # sheet headers.
    "demographics": ["Age", "Edu_status", "Res_Code", "Marital_status"],
}

# with open(os.path.join(os.path.dirname(os.path.realpath("__file__")), "data", "column_categorization.pickle"), 'wb') as handle:
#     pickle.dump(column_categorization, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# with open(os.path.join(os.path.dirname(os.path.realpath("__file__")), "data", "column_categorization.pickle"), 'rb') as handle:
#     column_categorization = pickle.load(handle)

In [15]:
# Nutrition score for df1: per-row sum of the columns listed under
# column_categorization["SRH"]["nutrition"].
df1["nutrition_score"] = df1.loc[:, column_categorization["SRH"]["nutrition"]].sum(axis="columns")

In [16]:
# Nutrition score for df2: per-row sum of the columns listed under
# column_categorization["SRH"]["nutrition"].
df2["nutrition_score"] = df2.loc[:, column_categorization["SRH"]["nutrition"]].sum(axis="columns")

In [17]:
# Parental-connection score for df1: per-row sum of the columns listed under
# column_categorization["SRH"]["parental_connection"].
df1["parental_connection_score"] = df1.loc[
    :, column_categorization["SRH"]["parental_connection"]
].sum(axis="columns")

In [18]:
# Parental-connection score for df2: per-row sum of the columns listed under
# column_categorization["SRH"]["parental_connection"].
df2["parental_connection_score"] = df2.loc[
    :, column_categorization["SRH"]["parental_connection"]
].sum(axis="columns")

In [19]:
# Decision-making score for df1: per-row sum of the "decision_making" item
# columns, plus D5 where "Marital Status" equals "Married" and D6 otherwise.
df1["decision_making_score"] = (
    df1.loc[:, column_categorization["SRH"]["decision_making"]].sum(axis="columns")
    + np.where(df1["Marital Status"].eq("Married"), df1["D5"], df1["D6"])
)

In [20]:
# Decision-making score for df2: per-row sum of the "decision_making" item
# columns, plus D5 where "Marital Status" equals "Married" and D6 otherwise.
df2["decision_making_score"] = (
    df2.loc[:, column_categorization["SRH"]["decision_making"]].sum(axis="columns")
    + np.where(df2["Marital Status"].eq("Married"), df2["D5"], df2["D6"])
)

In [21]:
# sp_harassment score for df1: per-row sum of the columns listed under
# column_categorization["SRH"]["sp_harassment"].
df1["sp_harassment_score"] = df1.loc[:, column_categorization["SRH"]["sp_harassment"]].sum(axis="columns")

In [22]:
# sp_harassment score for df2: per-row sum of the columns listed under
# column_categorization["SRH"]["sp_harassment"].
df2["sp_harassment_score"] = df2.loc[:, column_categorization["SRH"]["sp_harassment"]].sum(axis="columns")

In [23]:
# mhm score for df1: per-row sum of the columns listed under
# column_categorization["SRH"]["mhm"].
df1["mhm_score"] = df1.loc[:, column_categorization["SRH"]["mhm"]].sum(axis="columns")

In [24]:
# mhm score for df2: per-row sum of the columns listed under
# column_categorization["SRH"]["mhm"].
df2["mhm_score"] = df2.loc[:, column_categorization["SRH"]["mhm"]].sum(axis="columns")

In [25]:
# pi_sa score for df1: per-row sum of the columns listed under
# column_categorization["SRH"]["pi_sa"].
df1["pi_sa_score"] = df1.loc[:, column_categorization["SRH"]["pi_sa"]].sum(axis="columns")

In [26]:
# pi_sa score for df2: per-row sum of the columns listed under
# column_categorization["SRH"]["pi_sa"].
df2["pi_sa_score"] = df2.loc[:, column_categorization["SRH"]["pi_sa"]].sum(axis="columns")

In [27]:
# Body-image score for df1: per-row sum of the columns listed under
# column_categorization["SRH"]["body_image"].
df1["body_image_score"] = df1.loc[:, column_categorization["SRH"]["body_image"]].sum(axis="columns")

In [28]:
# Body-image score for df2: per-row sum of the columns listed under
# column_categorization["SRH"]["body_image"].
df2["body_image_score"] = df2.loc[:, column_categorization["SRH"]["body_image"]].sum(axis="columns")

In [29]:
# Overall SRH score for df1: the seven domain scores added element-wise.
# Series.add without fill_value behaves like "+", so a NaN in any domain
# score makes that row's total NaN.
df1["srh_score"] = (
    df1["nutrition_score"]
    .add(df1["decision_making_score"])
    .add(df1["parental_connection_score"])
    .add(df1["sp_harassment_score"])
    .add(df1["mhm_score"])
    .add(df1["pi_sa_score"])
    .add(df1["body_image_score"])
)

In [30]:
# Overall SRH score for df2: the seven domain scores added element-wise.
# Series.add without fill_value behaves like "+", so a NaN in any domain
# score makes that row's total NaN.
df2["srh_score"] = (
    df2["nutrition_score"]
    .add(df2["decision_making_score"])
    .add(df2["parental_connection_score"])
    .add(df2["sp_harassment_score"])
    .add(df2["mhm_score"])
    .add(df2["pi_sa_score"])
    .add(df2["body_image_score"])
)

In [31]:
# (row count, column count) of df2 after the score columns were added.
(len(df2.index), len(df2.columns))

(75, 49)

In [32]:
# Summary statistics of df2's numeric columns, rounded to two decimals.
df2.describe().round(decimals=2)

Unnamed: 0,S.NO,Age,SE,N1,N2,N3,N4,N5,P1,P2,P3,P4,D1,D2,D3,D4,D5,D6,S1,S2,S3,S4,S5,M1,M2,M3,M4,M5,M6,PS1,PS2,PS3,PS4,B1,B2,B3,B4,B5,nutrition_score,parental_connection_score,decision_making_score,sp_harassment_score,mhm_score,pi_sa_score,body_image_score,srh_score
count,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0
mean,39.6,16.47,54.88,2.37,1.48,2.79,2.08,2.09,2.68,2.63,2.21,2.63,2.49,2.37,1.88,2.25,0.54,2.34,2.53,2.35,2.64,2.85,2.87,1.97,2.84,2.57,2.08,2.19,2.57,2.38,2.86,2.86,2.93,2.85,2.69,2.92,2.78,2.84,10.81,10.15,11.34,13.24,14.22,11.05,14.08,84.89
std,22.93,0.98,12.96,0.54,0.64,0.44,0.56,0.77,0.5,0.56,0.7,0.61,0.69,0.82,0.73,0.66,1.04,0.96,0.72,0.69,0.63,0.39,0.38,0.75,0.47,0.74,0.85,0.86,0.52,0.61,0.41,0.38,0.25,0.36,0.52,0.27,0.5,0.37,1.33,1.5,2.27,1.84,2.02,1.12,1.22,6.7
min,1.0,15.0,20.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0,1.0,2.0,8.0,6.0,7.0,7.0,10.0,7.0,10.0,64.0
25%,20.5,16.0,46.0,2.0,1.0,3.0,2.0,1.5,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,0.0,2.0,2.0,2.0,2.0,3.0,3.0,1.0,3.0,2.0,1.0,1.0,2.0,2.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,10.0,9.0,10.0,12.0,13.0,11.0,14.0,80.98
50%,39.0,17.0,56.0,2.0,1.0,3.0,2.0,2.0,3.0,3.0,2.0,3.0,3.0,3.0,2.0,2.0,0.0,3.0,3.0,2.0,3.0,3.0,3.0,2.0,3.0,3.0,2.0,2.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,11.0,10.0,11.0,14.0,14.0,11.0,14.08,87.0
75%,58.0,17.0,64.0,3.0,2.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,0.54,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,12.0,11.0,13.0,15.0,16.0,12.0,15.0,89.5
max,80.0,18.0,80.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,13.0,12.0,15.0,15.0,18.0,12.0,15.0,98.0


In [33]:
# Distribution of df1's srh_score within each Age group.
df1["srh_score"].groupby(df1["Age"]).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
15,16.0,84.0625,7.460731,72.0,78.5,84.0,88.25,97.0
16,18.0,77.277778,7.061124,59.0,73.25,78.0,80.75,89.0
17,31.0,80.225806,6.432779,63.0,77.0,80.0,83.5,94.0
18,10.0,74.4,5.796551,66.0,70.0,74.5,78.75,83.0


In [41]:
# Summary statistics of df1's srh_score, rounded to two decimals.
df1.loc[:, "srh_score"].describe().round(decimals=2)

count    75.00
mean     79.56
std       7.28
min      59.00
25%      74.00
50%      79.00
75%      84.00
max      97.00
Name: srh_score, dtype: float64

In [40]:
# Summary statistics of df2's SE column, rounded to two decimals.
df2.loc[:, "SE"].describe().round(decimals=2)

count    75.00
mean     54.88
std      12.96
min      20.00
25%      46.00
50%      56.00
75%      64.00
max      80.00
Name: SE, dtype: float64

In [36]:
# Paired t-test of the SE column between df1 and df2.
# NOTE(review): ttest_rel pairs observations by position, so this assumes both
# frames list the same respondents in the same row order; confirm.
stats.ttest_rel(a=df1["SE"], b=df2["SE"])

Ttest_relResult(statistic=1.1481717904232478, pvalue=0.2545958054160611)

In [37]:
# Summary statistics for the df2 rows whose SE exceeds 69.66.
# NOTE(review): 69.66 is a hard-coded cutoff whose derivation is not shown
# here; confirm its source and consider naming it as a constant.
df2.loc[df2["SE"].gt(69.66)].describe()

Unnamed: 0,S.NO,Age,SE,N1,N2,N3,N4,N5,P1,P2,P3,P4,D1,D2,D3,D4,D5,D6,S1,S2,S3,S4,S5,M1,M2,M3,M4,M5,M6,PS1,PS2,PS3,PS4,B1,B2,B3,B4,B5,nutrition_score,parental_connection_score,decision_making_score,sp_harassment_score,mhm_score,pi_sa_score,body_image_score,srh_score
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,57.1,15.9,74.0,2.6,1.4,3.0,2.0,2.2,2.7,2.8,2.3,3.0,3.0,2.8,2.2,2.2,0.0,3.0,3.0,2.7,3.0,3.0,3.0,2.2,2.9,2.6,2.5,2.4,3.0,2.8,3.0,3.0,3.0,2.9,3.0,3.0,2.9,3.0,11.2,10.8,13.2,14.7,15.6,11.8,14.8,92.1
std,26.892585,0.994429,2.828427,0.516398,0.699206,0.0,0.816497,0.788811,0.483046,0.632456,0.823273,0.0,0.0,0.421637,0.918937,0.788811,0.0,0.0,0.0,0.483046,0.0,0.0,0.0,0.632456,0.316228,0.843274,0.707107,0.843274,0.0,0.421637,0.0,0.0,0.0,0.316228,0.0,0.0,0.316228,0.0,1.619328,1.686548,1.135292,0.483046,1.505545,0.421637,0.421637,3.634709
min,8.0,15.0,72.0,2.0,1.0,3.0,1.0,1.0,2.0,1.0,1.0,3.0,3.0,2.0,1.0,1.0,0.0,3.0,3.0,2.0,3.0,3.0,3.0,1.0,2.0,1.0,1.0,1.0,3.0,2.0,3.0,3.0,3.0,2.0,3.0,3.0,2.0,3.0,8.0,7.0,11.0,14.0,13.0,11.0,14.0,87.0
25%,52.75,15.0,72.0,2.0,1.0,3.0,1.25,2.0,2.25,3.0,2.0,3.0,3.0,3.0,1.25,2.0,0.0,3.0,3.0,2.25,3.0,3.0,3.0,2.0,3.0,3.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,10.25,10.25,13.0,14.25,15.0,12.0,15.0,89.25
50%,70.5,15.5,72.0,3.0,1.0,3.0,2.0,2.0,3.0,3.0,2.5,3.0,3.0,3.0,2.5,2.0,0.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,11.0,11.5,13.0,15.0,15.5,12.0,15.0,92.0
75%,75.25,17.0,76.0,3.0,1.75,3.0,2.75,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,0.0,3.0,3.0,3.0,3.0,3.0,3.0,2.75,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,12.75,12.0,14.0,15.0,16.75,12.0,15.0,94.75
max,79.0,17.0,80.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,0.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,13.0,12.0,15.0,15.0,18.0,12.0,15.0,98.0


In [38]:
# Summary statistics for the df2 rows whose SE is below 38.48.
# NOTE(review): 38.48 is a hard-coded cutoff whose derivation is not shown
# here; confirm its source and consider naming it as a constant.
df2.loc[df2["SE"].lt(38.48)].describe()

Unnamed: 0,S.NO,Age,SE,N1,N2,N3,N4,N5,P1,P2,P3,P4,D1,D2,D3,D4,D5,D6,S1,S2,S3,S4,S5,M1,M2,M3,M4,M5,M6,PS1,PS2,PS3,PS4,B1,B2,B3,B4,B5,nutrition_score,parental_connection_score,decision_making_score,sp_harassment_score,mhm_score,pi_sa_score,body_image_score,srh_score
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,32.5,17.0,28.666667,2.333333,1.833333,2.333333,2.0,1.666667,2.5,2.166667,2.166667,2.333333,2.0,2.0,1.833333,2.666667,1.5,2.0,2.5,2.333333,2.5,2.666667,2.833333,1.666667,2.833333,2.0,1.833333,2.166667,2.333333,2.063927,2.833333,2.833333,2.833333,2.666667,2.333333,2.833333,2.333333,2.666667,10.166667,9.166667,10.5,12.833333,12.833333,10.563927,12.833333,78.89726
std,18.780309,0.632456,6.889606,0.516398,0.983192,0.516398,0.0,0.816497,0.547723,0.752773,0.752773,0.816497,0.632456,0.632456,0.752773,0.516398,1.378405,0.894427,0.83666,0.516398,0.83666,0.516398,0.408248,0.816497,0.408248,1.095445,0.983192,0.983192,0.516398,0.156588,0.408248,0.408248,0.408248,0.516398,0.516398,0.408248,0.816497,0.516398,2.228602,1.602082,2.167948,2.316607,2.483277,1.265398,1.722401,8.494497
min,7.0,16.0,20.0,2.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,8.0,8.0,8.0,10.0,10.0,8.0,10.0,68.0
25%,24.25,17.0,24.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.25,2.25,0.25,1.25,2.25,2.0,2.25,2.25,3.0,1.0,3.0,1.0,1.0,1.25,2.0,2.0,3.0,3.0,3.0,2.25,2.0,3.0,2.0,2.25,9.0,8.0,9.0,10.75,11.0,11.0,12.25,75.25
50%,30.5,17.0,28.0,2.0,1.5,2.0,2.0,1.5,2.5,2.0,2.0,2.5,2.0,2.0,2.0,3.0,1.5,2.0,3.0,2.0,3.0,3.0,3.0,1.5,3.0,2.0,1.5,2.5,2.0,2.0,3.0,3.0,3.0,3.0,2.0,3.0,2.5,3.0,9.0,8.5,10.0,13.5,12.5,11.0,13.0,76.191781
75%,39.0,17.0,35.0,2.75,2.75,2.75,2.0,2.0,3.0,2.75,2.75,3.0,2.0,2.0,2.0,3.0,2.75,2.75,3.0,2.75,3.0,3.0,3.0,2.0,3.0,3.0,2.75,3.0,2.75,2.0,3.0,3.0,3.0,3.0,2.75,3.0,3.0,3.0,12.0,9.75,12.5,14.75,14.75,11.0,13.75,84.34589
max,63.0,18.0,36.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.383562,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,13.0,12.0,13.0,15.0,16.0,11.383562,15.0,91.0
