## Omoju Miller Dissertation Data Analysis 
### Factors that lead students to choose CS major at Cal

In [1]:
# inline graphs
# %pylab inline
from pylab import *

In [2]:
import sys
sys.path.append('helperFunctions/')
from IPython.display import display # Allows the use of display() for DataFrames
import seaborn as sns # Seaborn for nice visualizations


In [3]:
import numpy as np
import pandas as pd 
import HelperFunctionDataAnalysis as myHelper # helper functions to display images

### Read in csv files and define dataframe object.

In [4]:
# Root of the dissertation submission folder; the survey exports sit in its Data/ subfolder.
dataPath = '/Users/omojumiller/Dropbox/Research/DissertationSubmission/'

# Read the three survey exports in one pass: CS10 pre-survey, CS10 post-survey, CS61A.
cs10Spring_df, cs10Spring_pdf, cs61a = (
    pd.read_csv(dataPath + 'Data/' + csvName)
    for csvName in ('CS10_Pre_Responses_Spring2015.csv',
                    'CS10_Post_Responses_Spring2015.csv',
                    'CS61A_Responses_Fall2014.csv')
)


### Data Description

In [5]:
# Load the description of every survey item and index it by item key
dd = pd.read_csv(dataPath+'Data/Data_Describe.csv')
dd.columns = ['dataDecription', 'dataKeys']
dataDescription = {}

rule = "-" * 32
print(' '.join((rule, 'Data Description', rule)))
for i, row in dd.iterrows():
    itemKey, itemText = row.dataKeys, row.dataDecription
    if len(itemText) > 80:
        # Long descriptions are printed on two lines, split after 81 characters
        lineOneEnd = 81
        print("{:15}{:20}".format(itemKey, itemText[:lineOneEnd]))
        print("{:15}{:20}".format('', itemText[lineOneEnd:]))
    else:
        print("{:15}{:20}".format(itemKey, itemText))

    # Keep the full, unwrapped text for later lookups by item key
    dataDescription[itemKey] = itemText
    


-------------------------------- Data Description --------------------------------
atcs_1         I like to use computer science to solve problems.
atcs_2         I can learn to understand computing concepts.
atcs_3         I can achieve good grades (C or better) in computing courses.
atcs_4         I do not like using computer science to solve problems.
atcs_5         I am confident that I can solve problems by using computation
atcs_6         The challenge of solving problems using computer science appeals to me.
atcs_7         I am comfortable with learning computing concepts.
atcs_8         I am confident about my abilities with regards to computer science.
atcs_9         I do think I can learn to understand computing concepts.
atcsgender_1   Women are less capable of success in CS than men.
atcsgender_2   Women are smarter than men.
atcsgender_3   Men have better math and science abilities than women.
atcsjob_1      Knowledge of computing will allow me to secure a good job.
atcsjo

In [6]:
print('\n')

# Entries removed from the description lookup.
# pop(key, None) is used instead of `del` so the cell can be re-run safely:
# on a second execution the keys are already gone and nothing is raised.
for droppedKey in ('name_2', 'name_1', 'major', 'morecs', 'timestamp', 'consent',
                   'mtr_2', 'mtr_3', 'mtr_1',
                   'prcs_3', 'prcs_5', 'prcs_4', 'prcs_2', 'prcs_1',
                   'reason_class', 'grade', 'gender', 'prepared'):
    dataDescription.pop(droppedKey, None)

# Replace three descriptions with versions that carry an explicit line break.
dataDescription['atct_1'] = 'I am good at solving a problem by thinking \nabout similar problems I have solved before.'
dataDescription['atct_7'] = 'I am good at ignoring irrelevant details to solve a problem.'
tmp = '\n with people from diverse backgrounds.'
# NOTE(review): 'interact effective' may be meant as 'interact effectively' — confirm against the survey text.
dataDescription['cltrcmp_2'] = 'I have good cultural competence, or the ability to interact effective' + tmp







### Update column names

In [7]:
# Short analysis-friendly names for the CS10 pre-survey columns, listed in file order.
cs10Spring_df.columns = [
    'timestamp', 'consent', 'gender', 'reason_class', 'major',
    'atcs_1', 'atcsjob_1', 'atcs_2', 'atcsjob_2', 'atcsgender_1',
    'atcs_3', 'atcs_4', 'atcs_5', 'atcsgender_2',
    'atcs_6', 'atcs_7', 'atcs_8', 'atcs_9', 'atcsgender_3',
    'atct_1', 'atct_2', 'atct_3', 'atct_4', 'atct_5', 'atct_6', 'atct_7', 'atct_8',
    'clet_1', 'clet_2', 'grade',
    'mtr_1', 'mtr_2', 'mtr_3',
    'blg_1', 'blg_2', 'blg_3', 'blg_4',
    'cltrcmp_1', 'cltrcmp_2', 'classmtr',
    'prcs_1', 'prcs_2', 'prcs_3', 'prcs_4', 'prcs_5',
    'prepared', 'morecs', 'name_1', 'name_2',
]


In [8]:
# Short analysis-friendly names for the CS10 post-survey columns, listed in file order.
# The final four names are extra columns that the pre-survey list does not have.
cs10Spring_pdf.columns = [
    'timestamp', 'consent', 'gender', 'reason_class', 'major',
    'atcs_1', 'atcsjob_1', 'atcs_2', 'atcsjob_2', 'atcsgender_1',
    'atcs_3', 'atcs_4', 'atcs_5', 'atcsgender_2',
    'atcs_6', 'atcs_7', 'atcs_8', 'atcs_9', 'atcsgender_3',
    'atct_1', 'atct_2', 'atct_3', 'atct_4', 'atct_5', 'atct_6', 'atct_7', 'atct_8',
    'clet_1', 'clet_2', 'grade',
    'mtr_1', 'mtr_2', 'mtr_3',
    'blg_1', 'blg_2', 'blg_3', 'blg_4',
    'cltrcmp_1', 'cltrcmp_2', 'classmtr',
    'prcs_1', 'prcs_2', 'prcs_3', 'prcs_4', 'prcs_5',
    'prepared', 'morecs', 'name_1',
    'snap_python', 'hiphop_d1', 'hiphop_d2', 'song_ct',
]


In [9]:
# Short analysis-friendly names for the CS61A survey columns, listed in file order.
# 'priorcs10' exists only in this survey.
cs61a.columns = [
    'timestamp', 'consent', 'gender', 'reason_class', 'major',
    'atcs_1', 'atcsjob_1', 'atcs_2', 'atcsjob_2', 'atcsgender_1',
    'atcs_3', 'atcs_4', 'atcs_5', 'atcsgender_2',
    'atcs_6', 'atcs_7', 'atcs_8', 'atcs_9', 'atcsgender_3',
    'atct_1', 'atct_2', 'atct_3', 'atct_4', 'atct_5', 'atct_6', 'atct_7', 'atct_8',
    'clet_1', 'clet_2', 'grade',
    'mtr_1', 'mtr_2', 'mtr_3',
    'blg_1', 'blg_2', 'blg_3', 'blg_4',
    'cltrcmp_1', 'cltrcmp_2', 'classmtr',
    'prcs_1', 'prcs_2', 'prcs_3', 'prcs_4', 'prcs_5',
    'prepared', 'morecs', 'priorcs10', 'name',
]


### Data Exploration
We will begin exploring the data through visualizations and code to understand how each feature is related to the others. We will segment the data into separate data frames.

#### Top frame
- cs10
- cs61a

#### By Gender
- cs10_female
- cs10_male
- cs61a_female
- cs61a_male

#### By previous CS exposure before College
- cs10_CS
- cs10_NO_CS
- cs61a_CS
- cs61a_NO_CS


Let's go ahead and filter out missing data.

Only rows whose consent value is **"I agree"** are kept; rows marked **"I disagree"**, or with no consent answer, are dropped from the data set.

In [10]:
# Retain only the respondents who answered 'I agree' on the consent item
cs61a = cs61a.loc[cs61a['consent'] == 'I agree']
cs10Spring_df = cs10Spring_df.loc[cs10Spring_df['consent'] == 'I agree']
cs10Spring_pdf = cs10Spring_pdf.loc[cs10Spring_pdf['consent'] == 'I agree']


In [11]:
# Pool the CS10 pre- and post-survey responses into one frame, since no
# before/after comparison is made. The outer index level ('pre' / 'post')
# records which survey each row came from.

frames = [cs10Spring_df, cs10Spring_pdf]
cs10 = pd.concat(frames, keys=['pre', 'post'])


Let's split the data by gender.

In [12]:
# Split each course's respondents by reported gender; rows are renumbered from 0
cs10_female = cs10[cs10.gender == 'Female'].reset_index(drop=True)
cs10_male = cs10[cs10.gender == 'Male'].reset_index(drop=True)

cs61a_female = cs61a[cs61a.gender == 'Female'].reset_index(drop=True)
cs61a_male = cs61a[cs61a.gender == 'Male'].reset_index(drop=True)

# Head counts per course
print('%d male students, and %d female students consented to participate in the research CS10'
      % (len(cs10_male), len(cs10_female)))
print('%d male students, and %d female students consented to participate in the research CS61A'
      % (len(cs61a_male), len(cs61a_female)))



170 male students, and 217 female students consented to participate in the research CS10
324 male students, and 171 female students consented to participate in the research CS61A


Create the dimensions that we are interested in investigating.

In [13]:
# Column groups, one list per survey dimension
mtr = ['mtr_1', 'mtr_2', 'mtr_3']  # CS mentors
prcs = ['prcs_1', 'prcs_2', 'prcs_3', 'prcs_4', 'prcs_5']  # prior CS exposure
atcs = [  # self-reported attitude about CS competency
    'atcs_1', 'atcs_2', 'atcs_3', 'atcs_4', 'atcs_5',
    'atcs_6', 'atcs_7', 'atcs_8', 'atcs_9',
]
atct = [  # self-reported attitudes about computational thinking
    'atct_1', 'atct_2', 'atct_3', 'atct_4',
    'atct_5', 'atct_6', 'atct_7', 'atct_8',
]
blg = ['blg_1', 'blg_2', 'blg_3', 'blg_4']  # sense of belonging in the classroom
clet = ['clet_1', 'clet_2']  # social implications and ethics
atcsgender = ['atcsgender_1', 'atcsgender_2', 'atcsgender_3']
atcsjob = ['atcsjob_1', 'atcsjob_2']
cltrcmp = ['cltrcmp_1', 'cltrcmp_2']  # cultural competency

# Single-column fields
major = 'major'
gender = 'gender'
priorcs10 = 'priorcs10'  # had taken CS10 prior

# Lookup from dimension name to its column list
itemDimensions = {
    'atcs': atcs,
    'atcsgender': atcsgender,
    'atcsjob': atcsjob,
    'atct': atct,
    'blg': blg,
    'clet': clet,
    'cltrcmp': cltrcmp,
}



In [14]:
# Segment the respondents who reported NO exposure to computer science before UC Berkeley.
# prcs_2: Did you have any exposure to Computer Science before UC Berkeley?
cs61a_NO_CS = cs61a[cs61a.prcs_2 == 'No']
cs10_NO_CS = cs10[cs10.prcs_2 == 'No']

cs61a_NO_CS_female = cs61a_female[cs61a_female.prcs_2 == 'No']
cs61a_NO_CS_male = cs61a_male[cs61a_male.prcs_2 == 'No']
cs10_NO_CS_female = cs10_female[cs10_female.prcs_2 == 'No']
cs10_NO_CS_male = cs10_male[cs10_male.prcs_2 == 'No']

# Every row of these frames has prcs_2 == 'No', so the frame length is the
# head count. Plain ints are used because formatting a value_counts() Series
# with %d depends on implicit Series-to-int conversion and fails outright
# when a segment is empty.
noCsWomen61a = len(cs61a_NO_CS_female)
noCsMen61a = len(cs61a_NO_CS_male)
noCsWomen10 = len(cs10_NO_CS_female)
noCsMen10 = len(cs10_NO_CS_male)

print('There are %d women WITHOUT prior CS exposure who are taking 61a' % noCsWomen61a)
print('There are %d men WITHOUT prior CS exposure who are taking 61a\n' % noCsMen61a)
print('There are %d women WITHOUT prior CS exposure who are taking cs10' % noCsWomen10)
print('There are %d men WITHOUT prior CS exposure who are taking cs10\n' % noCsMen10)

print('There are %d students WITHOUT prior CS exposure in this study\n'
      % (noCsMen61a + noCsMen10 + noCsWomen61a + noCsWomen10))

print('Total Women: %d' % (noCsWomen61a + noCsWomen10))
print('Total Men: %d' % (noCsMen61a + noCsMen10))


There are 85 women WITHOUT prior CS exposure who are taking 61a
There are 101 men WITHOUT prior CS exposure who are taking 61a

There are 172 women WITHOUT prior CS exposure who are taking cs10
There are 122 men WITHOUT prior CS exposure who are taking cs10

There are 480 students WITHOUT prior CS exposure in this study

Total Women: 257
Total Men: 223


In [15]:
# Segment the respondents who reported exposure to computer science before UC Berkeley.
# prcs_2: Did you have any exposure to Computer Science before UC Berkeley?
cs61a_CS = cs61a[cs61a.prcs_2 == 'Yes']
cs10_CS = cs10[cs10.prcs_2 == 'Yes']

cs10_CS_female = cs10_female[cs10_female.prcs_2 == 'Yes']
cs10_CS_male = cs10_male[cs10_male.prcs_2 == 'Yes']
cs61a_CS_female = cs61a_female[cs61a_female.prcs_2 == 'Yes']
cs61a_CS_male = cs61a_male[cs61a_male.prcs_2 == 'Yes']

# Every row of these frames has prcs_2 == 'Yes', so the frame length is the
# head count. Plain ints are used because formatting a value_counts() Series
# with %d depends on implicit Series-to-int conversion and fails outright
# when a segment is empty.
csWomen10 = len(cs10_CS_female)
csMen10 = len(cs10_CS_male)
csWomen61a = len(cs61a_CS_female)
csMen61a = len(cs61a_CS_male)

print('There are %d women WITH prior CS exposure who are taking cs10' % csWomen10)
print('There are %d men WITH prior CS exposure who are taking cs10\n' % csMen10)
print('There are %d women WITH prior CS exposure who are taking 61a' % csWomen61a)
print('There are %d men WITH prior CS exposure who are taking 61a\n' % csMen61a)

print('There are %d students WITH prior CS exposure in this study\n'
      % (csMen61a + csMen10 + csWomen61a + csWomen10))

print('Total Women: %d' % (csWomen61a + csWomen10))
print('Total Men: %d' % (csMen61a + csMen10))


There are 45 women WITH prior CS exposure who are taking cs10
There are 48 men WITH prior CS exposure who are taking cs10

There are 86 women WITH prior CS exposure who are taking 61a
There are 223 men WITH prior CS exposure who are taking 61a

There are 402 students WITH prior CS exposure in this study

Total Women: 131
Total Men: 271


In [16]:
# Registry of every segment frame, keyed by the name of the variable holding it
dfSegments = {
    # by prior CS exposure
    'cs10_CS': cs10_CS,
    'cs10_NO_CS': cs10_NO_CS,
    'cs61a_CS': cs61a_CS,
    'cs61a_NO_CS': cs61a_NO_CS,
    # CS10 by gender, then gender x prior exposure
    'cs10_female': cs10_female,
    'cs10_CS_female': cs10_CS_female,
    'cs10_NO_CS_female': cs10_NO_CS_female,
    'cs10_male': cs10_male,
    'cs10_CS_male': cs10_CS_male,
    'cs10_NO_CS_male': cs10_NO_CS_male,
    # CS61A by gender, then gender x prior exposure
    'cs61a_female': cs61a_female,
    'cs61a_NO_CS_female': cs61a_NO_CS_female,
    'cs61a_male': cs61a_male,
    'cs61a_NO_CS_male': cs61a_NO_CS_male,
    'cs61a_CS_female': cs61a_CS_female,
    'cs61a_CS_male': cs61a_CS_male,
}

## Test for statistical significance

In [43]:
# Test for statistical significance between CS10 and CS61A.
# The Mann-Whitney U test, a non-parametric alternative to the t-test, is used because the data are ordinal:
# Likert-scale responses are ordinal and not normally distributed,
# and Mann-Whitney can compare samples of unequal size.

# Runs for the other dimensions, kept for reference (older call form without the myHelper arguments):
#genPValues(atcs, '')
#genPValues(atcsgender, '')
#genPValues(atct,'')
#genPValues(clet,'')
#genPValues(cltrcmp,'')
myHelper.genPValues(blg,'', 'cs10', 'cs61a', cs10, cs61a, dataDescription)

Test for significance between CS10 and CS61A

blg_1 	In this class, I feel I belong.
Female 	0.02153060 	Statistically Significant: *


Test for significance between CS10 and CS61A

blg_2 	In this class, I feel awkward and out of place.


Test for significance between CS10 and CS61A

blg_3 	In this class, I feel like my ideas count.
Class 	0.00001109 	Statistically Significant: ****
Female 	0.00000106 	Statistically Significant: ****
Male 	0.00600122 	Statistically Significant: **


Test for significance between CS10 and CS61A

blg_4 	In this class, I feel like I matter.
Class 	0.00000073 	Statistically Significant: ****
Female 	0.00000141 	Statistically Significant: ****
Male 	0.00074444 	Statistically Significant: ***




In [19]:
# len() gives the head count as a plain int; formatting a value_counts() Series
# with %d relies on implicit Series-to-int conversion and fails on an empty frame.
print('There are %d women WITH prior CS exposure who are taking cs10' % len(cs10_CS_female))

There are 45 women WITH prior CS exposure who are taking cs10


In [20]:
# Test for statistical significance between CS10 and CS61A, broken down by
# whether students had prior CS exposure (see the labels in the printed output).

# Runs for the other dimensions, kept for reference (older call form without the myHelper arguments):
#genPValues_2(atcs)
#genPValues_2(atcsgender)
#genPValues_2(atct)
#genPValues_2(clet)
#genPValues_2(cltrcmp)
myHelper.genPValues_2(blg, 'cs10', 'cs61a', cs10, cs61a, dataDescription)

Test for significance between CS10 and CS61A
blg_1   :   In this class, I feel I belong.
NoPrior CS Class    0.000460  Statistically Significant:    ***       
No Prior CS Female  0.002089  Statistically Significant:    **        
No Prior CS Male    0.015097  Statistically Significant:    **        


Test for significance between CS10 and CS61A
blg_2   :   In this class, I feel awkward and out of place.
NoPrior CS Class    0.014721  Statistically Significant:    **        
No Prior CS Female  0.025501  Statistically Significant:    *         


Test for significance between CS10 and CS61A
blg_3   :   In this class, I feel like my ideas count.
Prior CS Class      0.007862  Statistically Significant:    **        
NoPrior CS Class    0.000012  Statistically Significant:    ****      
Prior CS Female     0.000485  Statistically Significant:    ***       
No Prior CS Female  0.000566  Statistically Significant:    ***       
No Prior CS Male    0.000992  Statistically Significant:    ***

### Investigate Correlations among items

In [21]:
# Item groups that go into the correlation matrices.
# 'atcsgender', 'clet' and 'cltrcmp' are left out.
list_of_lists = [itemDimensions[groupName] for groupName in ('atcs', 'blg', 'atct')]

# Flatten the groups into one list of column names
flattened = []
for sublist in list_of_lists:
    flattened.extend(sublist)

In [42]:
# Draw a correlation heat map for every segment and save it as Figures/<segment>_corr.pdf.

# Loop-invariant setup: the column list and the figure size do not change per segment.
flattened = [val for sublist in list_of_lists for val in sublist]
rcParams['figure.figsize'] = 12, 8

for key in dfSegments:
    print(key)
    # Take the frame straight from the registry instead of eval()-ing its variable name
    df = dfSegments[key][flattened]
    corr = df.corr()
    # Mask the strict upper triangle so each item pair is shown only once
    mask = np.zeros_like(corr)
    mask[np.triu_indices_from(mask, 1)] = True

    myFileName = 'Figures' + '/' + key + '_corr.pdf'
    fig = figure(dpi=80)

    with sns.axes_style("white"):
        ax = sns.heatmap(corr, mask=mask, square=False, annot=True, vmin=-1, vmax=1, cmap='RdBu', fmt='+.1f')
        plt.xticks(rotation=70, ha='right')
        plt.yticks(rotation=2)

    fig.savefig(myFileName)

cs61a_CS_female
cs61a_CS_male
cs10_female
cs61a_female
cs10_CS_male
cs10_NO_CS
cs61a_male
cs10_NO_CS_female
cs10_NO_CS_male
cs61a_NO_CS_male
cs10_CS
cs61a_NO_CS_female
cs61a_NO_CS
cs61a_CS
cs10_male
cs10_CS_female


###  Correlation matrix of CS61A students who had previously taken CS10


In [61]:

# CS61A respondents who answered 'Yes' to having taken CS10 before
cs61a_priorCS10 = cs61a[cs61a.priorcs10 == 'Yes']

# Split by reported gender; rows are renumbered from 0
cs61a_priorCS10_female = cs61a_priorCS10[cs61a_priorCS10.gender == 'Female'].reset_index(drop=True)
cs61a_priorCS10_male = cs61a_priorCS10[cs61a_priorCS10.gender == 'Male'].reset_index(drop=True)

# Registry of the prior-CS10 segments, keyed by the name of the variable holding each frame
dfSegmentsPriorCS10 = {}
dfSegmentsPriorCS10['cs61a_priorCS10_female'] = cs61a_priorCS10_female
# Fix: the male key used to be assigned the female frame
dfSegmentsPriorCS10['cs61a_priorCS10_male'] = cs61a_priorCS10_male

In [65]:
# {figures/priorCS10_female_corr}
# {figures/priorCS10_male_corr}

# Draw a correlation heat map for each prior-CS10 segment and save it as
# Figures/<segment>_corr.pdf.

# Loop-invariant setup: the column list and the figure size do not change per segment.
flattened = [val for sublist in list_of_lists for val in sublist]
rcParams['figure.figsize'] = 12, 8

for key in dfSegmentsPriorCS10:
    print(key)
    # Resolve the frame by its variable name, exactly as the former eval() did,
    # but through a plain namespace lookup rather than evaluating built-up code.
    df = globals()[key][flattened]
    corr = df.corr()
    # Mask the strict upper triangle so each item pair is shown only once
    mask = np.zeros_like(corr)
    mask[np.triu_indices_from(mask, 1)] = True

    myFileName = 'Figures' + '/' + key + '_corr.pdf'
    fig = figure(dpi=80)

    with sns.axes_style("white"):
        ax = sns.heatmap(corr, mask=mask, square=False, annot=True, vmin=-1, vmax=1, cmap='RdBu', fmt='+.1f')
        plt.xticks(rotation=70, ha='right')
        plt.yticks(rotation=2)

    fig.savefig(myFileName)

cs61a_priorCS10_male
cs61a_priorCS10_female


### Plot data dimensions 


In [24]:
# Default plot settings; later cells assign these names again before plotting.
theCategory = '_NO_CS'  # segment suffix: '_NO_CS', '_CS', or '' (see getPlotDimensions)
theXLabel = '   '
itemDimension = ['blg_1', 'blg_2', 'blg_3', 'blg_4']  # the four belonging items


def getPlotDimensions(key, attr, theCategory):
    '''Return the plot data of one survey item for the CS10 and CS61A segments.

    The segment frames are looked up by variable name in the notebook
    namespace: 'cs10' + theCategory + attr and 'cs61a' + theCategory + attr,
    e.g. cs10_NO_CS_female and cs61a_NO_CS_female.

    Input:
        key: a string for an instrument like 'blg_1'
        attr: '_male', '_female', or ''
        theCategory: a string that takes the following values ['_NO_CS', '_CS', '']

    Output:
        x0: datapoints to be plotted, representing cs10 information for the
            dimension and category
        x1: ditto cs61a

    Raises:
        KeyError: if no segment frame with the composed name exists, or the
            frame has no column `key`.
    '''

    def segmentData(course):
        # Direct namespace lookup replaces eval() on a concatenated string.
        counts = globals()[course + theCategory + attr][key].value_counts()
        # The return value is not used; presumably scaleData works in place — TODO confirm.
        myHelper.scaleData(counts)
        return myHelper.turnArray(counts)

    x0 = segmentData('cs10')
    x1 = segmentData('cs61a')
    return x0, x1



In [25]:
# For the No prior CS Experience cohort

theCategory = '_NO_CS'  # segment suffix for the no-prior-CS cohort
theXLabel = '   '
itemDimension = ['blg_1', 'blg_2', 'blg_3', 'blg_4']  # the four belonging items
attr = '_female' # the other value _male


def createThePlots(itemDimension, myFileName, theCategory, attr, theXLabel):
    '''Generate one figure per survey item in itemDimension.

    For each item, the CS10 and CS61A plot data come from getPlotDimensions
    and are handed to myHelper.genFigure together with the item's description.

    Input:
        itemDimension: list of item keys such as 'blg_1'
        myFileName: passed unchanged to genFigure for every item
            (NOTE(review): how genFigure uses it is not visible here — confirm)
        theCategory: '_NO_CS', '_CS', or ''
        attr: '_male', '_female', or ''
        theXLabel: passed unchanged to genFigure
    '''
    for item in itemDimension:
        x0, x1 = getPlotDimensions(item, attr, theCategory)
        myHelper.genFigure(item, myFileName, theCategory, theXLabel,
                           x0, x1, dataDescription[item])
        

def createTheGenderPlots(itemDimension, myFileName, theCategory, genderAttr, theXLabel):
    '''Generate one gender-specific figure per survey item in itemDimension.

    For each item, the CS10 and CS61A plot data for the chosen gender come
    from getPlotDimensions and are handed to myHelper.genGenderFigure together
    with the item's description.

    Input:
        itemDimension: list of item keys such as 'blg_1'
        myFileName: passed unchanged to genGenderFigure for every item
            (NOTE(review): how genGenderFigure uses it is not visible here — confirm)
        theCategory: '_NO_CS', '_CS', or ''
        genderAttr: '_male' or '_female'
        theXLabel: passed unchanged to genGenderFigure
    '''
    for item in itemDimension:
        x0, x1 = getPlotDimensions(item, genderAttr, theCategory)
        myHelper.genGenderFigure(item, myFileName, theCategory, genderAttr, theXLabel,
                                 x0, x1, dataDescription[item])


In [26]:
# Preview the output path for one belonging item / gender / category combination
i, genderAttr = 1, '_female'
dimension = blg[i]
myFileName = ''.join(['Figures', '/', dimension, genderAttr, theCategory, '.pdf'])
print(myFileName)

Figures/blg_2_female_NO_CS.pdf


In [27]:
# Plot the belonging items for the whole no-prior-CS cohort (no gender suffix).
# NOTE(review): theXLabel is 'female' although genderAttr is '' — confirm this is intended.
theCategory, theXLabel = '_NO_CS', 'female'
genderAttr, cohort = '', 'cs10'

i = 1
dimension = blg[i]
myFileName = ''.join(['Figures', '/', dimension, genderAttr, theCategory, '.pdf'])

createThePlots(blg, myFileName, theCategory, genderAttr, theXLabel)
# Same call for the other dimensions:
#createThePlots(clet, myFileName, theCategory, genderAttr, theXLabel)
#createThePlots(cltrcmp, myFileName, theCategory, genderAttr, theXLabel)
#createThePlots(atcs, myFileName, theCategory, genderAttr, theXLabel)
#createThePlots(atcsgender, myFileName, theCategory, genderAttr, theXLabel)
#createThePlots(atct, myFileName, theCategory, genderAttr, theXLabel)




In [28]:
# Plot the belonging items for women in the no-prior-CS cohort
theCategory, theXLabel = '_NO_CS', 'female'
genderAttr, cohort = '_female', 'cs10'

i = 1
dimension = blg[i]
myFileName = ''.join(['Figures', '/', dimension, genderAttr, theCategory, '.pdf'])

createTheGenderPlots(blg, myFileName, theCategory, genderAttr, theXLabel)



In [29]:
# Switch the plot settings to the prior-CS-exposure cohort
theCategory = '_CS'
theXLabel = '   '

# NOTE(review): the calls below use a 3-argument form, while createThePlots as
# defined in this notebook takes (itemDimension, myFileName, theCategory, attr,
# theXLabel). Update the arguments before re-enabling any of them.
#createThePlots(blg, theCategory, theXLabel)
#createThePlots(clet, theCategory, theXLabel)
#createThePlots(cltrcmp, theCategory, theXLabel)
#createThePlots(atcs, theCategory, theXLabel)
#createThePlots(atcsgender, theCategory, theXLabel)
#createThePlots(atct, theCategory, theXLabel)