# Celebrity Multiple Choice - Filtered Version

### import modules

In [1]:
import os
import fnmatch

import numpy as np
import pandas as pd

### get logfiles

In [2]:
def getLogfile(whichfolder, whichexperiment):

    loglist = []
    for fileName in os.listdir(whichfolder):
        if fnmatch.fnmatch(fileName, whichexperiment):
            loglist.append(whichfolder+fileName)
    loglist.sort()
    return loglist

In [3]:
logList = getLogfile('../famousFiltered/experiment/data/','*.csv')

In [4]:
logList[-5:]

['../famousFiltered/experiment/data/famousFilterClean_7.csv',
 '../famousFiltered/experiment/data/famousFilterClean_8.csv',
 '../famousFiltered/experiment/data/famousFilterClean_9.csv',
 '../famousFiltered/experiment/data/famousFilterClean_jb.csv',
 '../famousFiltered/experiment/data/jb_famFaceFilter_2016_Nov_30_1302.csv']

### get content from logfile

In [5]:
logFile = logList[-1]

In [6]:
def makeDf(logFile):
    
    df = pd.read_csv(logFile).drop(0)
    
    d = {}
    for idx in df.index:
        k1 = df['ratingK1.response'].loc[idx]
        k2 = df['ratingK2.response'].loc[idx]
        k3 = df['ratingK3.response'].loc[idx]
        if k1 != 'None':
            d[idx] = 'act'   
        elif k2 != 'None':
            d[idx] = k2
        elif k3 != 'None':
            d[idx] = k3
        else:
            return

    newDf = pd.DataFrame(index=df.index)

    newDf['name'] = [x.split('/')[-1].split('.')[0] for x in df['pic']]
    condDict = {'smoo15':'smoo','edges5':'edges','gray':'gray'}
    newDf['cond'] = [ condDict[x.split('_')[0]] for x in newDf['name'] ]

    newDf['familiar'] = [int(x=='1.ja') for x in df['ratingBekannt.response']]
    
    # make sure there are no missing values
    for x in newDf['familiar']:
        if x == 'None':
            return
        
    newDf['context'] = pd.DataFrame(d,index=['context']).T
    
    newDf.index = newDf['name']
    return newDf

In [7]:
thisDf = makeDf(logFile)

In [8]:
thisDf.tail()

Unnamed: 0_level_0,name,cond,familiar,context
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gray_Ice-T(OdafinFinTutuola)7_1,gray_Ice-T(OdafinFinTutuola)7_1,gray,1,act
gray_OmarEpps(DrEricForeman)3_1,gray_OmarEpps(DrEricForeman)3_1,gray,1,act
gray_WillSmith5_1,gray_WillSmith5_1,gray,1,act
gray_BritneySpears7_1,gray_BritneySpears7_1,gray,1,music
gray_WillSmith4_1,gray_WillSmith4_1,gray,1,act


### evaluate context responses

reference list:

In [9]:
contextDf = pd.read_csv('../famousCheck/contextList.csv')

In [10]:
def evalResp(df,contextDf=contextDf):
    
    respDict = {}

    for i in df.index:

        resp = df.loc[i]['context']
        bekannt = df.loc[i]['familiar']

        for idx in contextDf.index:
            correct = contextDf.loc[idx]['context']
            thisName = contextDf.loc[idx]['name']
            
            for namePart in thisName.split(' '):
                if namePart in i:

                    respDict[i] = {'corrAns':correct,
                                          'familiarity':bekannt,
                                          'corrContext':resp,
                                          'context': int(resp in correct)}
                    
    respDf = pd.DataFrame( respDict ).T
    
    return respDf

In [11]:
evalDf = evalResp(thisDf)

In [12]:
evalDf.tail(10)

Unnamed: 0,context,corrAns,corrContext,familiarity
smoo15_RobertSeanLeonard(Wilson)8_1,1,act,act,0
smoo15_TomCruise1_1,1,act,act,1
smoo15_TomCruise4_1,1,act,act,0
smoo15_TomCruise5_1,1,act,act,1
smoo15_UrsulavonderLeyen1_1,0,polit,act,0
smoo15_UrsulavonderLeyen4_1,0,polit,act,0
smoo15_UrsulavonderLeyen5_1,0,polit,act,0
smoo15_WillSmith1_1,1,act music,act,1
smoo15_WillSmith4_1,1,act music,act,1
smoo15_WillSmith5_1,1,act music,act,1


### get rid of unknown faces

In [13]:
checkDf = pd.read_csv('../famousCheck/output/famousFamiliarity_%s.csv' % 9)

In [14]:
checkDf.tail()

Unnamed: 0,img,name,bekannt
37,./img/SigmarGabriel1.jpg,Sigmar Gabriel,ja
38,./img/BritneySpears5.jpg,Britney Spears,ja
39,./img/EmilyDeschanel3.jpg,Emily Deschanel,ja
40,./img/AlysonHannigan(Lily)1.jpg,Alyson Hannigan,ja
41,./img/BarackObama2.jpg,Barack Obama,ja


In [15]:
def cleanUp(evalDf,checkDf=checkDf):
    copyDf = evalDf.copy()
    for idx in evalDf.index:
        #print idx
        for j in checkDf.index:
            status = (checkDf.loc[j]['bekannt'] == 'nein')
            name = checkDf.loc[j]['name'].split(' ')
            if status:
                for sub in name:
                    
                    if sub in idx and len(sub)>3:# the >3 is a hack to prevent 'von' and 'der' in von der Leyen to be used for matching
                        # this is a hack to prevent Robert Sean Leonard matching Leonard[o] DiCaprio
                        if sub == 'Leonard' and 'DiCaprio' in idx:
                            pass
                        else:
                            #print sub,idx
                            copyDf.drop(idx,inplace=True)
                            break
                            
    condDict = {'smoo15':'smoo','edges5':'edges','gray':'gray'}
    cond = [ condDict[x.split('_')[0]] for x in copyDf.index]                    
    copyDf.index = [cond,copyDf.index]
    
    return copyDf

In [16]:
cleanDf = cleanUp(evalDf)

In [17]:
cleanDf.tail()

Unnamed: 0,Unnamed: 1,context,corrAns,corrContext,familiarity
smoo,smoo15_UrsulavonderLeyen4_1,0,polit,act,0
smoo,smoo15_UrsulavonderLeyen5_1,0,polit,act,0
smoo,smoo15_WillSmith1_1,1,act music,act,1
smoo,smoo15_WillSmith4_1,1,act music,act,1
smoo,smoo15_WillSmith5_1,1,act music,act,1


### get percent correct for one participant

In [18]:
def makePercent(evalDf,pName):
    
    sumDf = evalDf.drop(['corrAns','corrContext'],axis=1).groupby(level=0).sum()
    countDf = evalDf.drop(['corrAns','corrContext'],axis=1).groupby(level=0).count()
    
    percentDf = sumDf/countDf
    df = pd.DataFrame( percentDf.stack() ).T
    df.index = [pName]
    return df

In [19]:
fuDf = makePercent(cleanDf,'pName')

In [20]:
fuDf

Unnamed: 0_level_0,edges,edges,gray,gray,smoo,smoo
Unnamed: 0_level_1,context,familiarity,context,familiarity,context,familiarity
pName,0.878788,0.80303,0.954545,0.969697,0.893939,0.666667


### jb split

In [21]:
jbRecogDf = pd.read_csv('../famousCheck/recognizableList.csv')

In [22]:
def makeRecogSplit(thisDf,jbRecogDf=jbRecogDf):
    
    df = thisDf.copy()
    
    # for each entry in the response df
    for i in df.index:
        thisName = i[1]
        # for each name in the recoglist
        for j in jbRecogDf.index:
            thisRecog = jbRecogDf.loc[j]['name']
            thisStatus = jbRecogDf.loc[j]['recognizable']
            # if the names match
            if thisRecog in thisName:
                # set the status of the 
                df.set_value(i,'jbRecog',thisStatus)
           
    return df

In [23]:
def makePercentJb(evalDf,pName):
    
    myDf = evalDf.drop(['corrAns','corrContext'],axis=1)
    myDf['type'] = [myDf.index.levels[0][x] for x in myDf.index.labels[0] ]
    sumDf = myDf.groupby(['jbRecog','type']).sum()
    countDf = myDf.groupby(['jbRecog','type']).count()
    
    percentDf = sumDf/countDf
    percentDf = percentDf.unstack(1).reorder_levels((1,0),axis=1).sort_index(axis=1)
    return percentDf

In [24]:
makePercentJb(makeRecogSplit(cleanDf),'pName')

type,edges,edges,gray,gray,smoo,smoo
Unnamed: 0_level_1,context,familiarity,context,familiarity,context,familiarity
jbRecog,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
JB,0.8,0.666667,0.933333,0.933333,0.8,0.566667
JB*,0.969697,0.909091,1.0,1.0,0.969697,0.727273


In [25]:
def makeParticipant(logFile,contextDf=contextDf):
    
    pName = logFile.split('/')[-1].split('_')[0]

    thisDf = makeDf(logFile)
    evalDf = evalResp(thisDf)
    checkDf = pd.read_csv('../famousCheck/output/famousFamiliarity_%s.csv' % pName)
    cleanDf = cleanUp(evalDf,checkDf)
    
    if pName == 'jb':
        thisDf = makePercentJb(makeRecogSplit(cleanDf),pName)
    else:
        thisDf = makePercent(cleanDf,pName)
    
    return thisDf

In [26]:
makeParticipant(logList[1])

Unnamed: 0_level_0,edges,edges,gray,gray,smoo,smoo
Unnamed: 0_level_1,context,familiarity,context,familiarity,context,familiarity
11,0.666667,0.520833,0.729167,0.666667,0.604167,0.520833


In [27]:
makeParticipant(logList[-1])

type,edges,edges,gray,gray,smoo,smoo
Unnamed: 0_level_1,context,familiarity,context,familiarity,context,familiarity
jbRecog,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
JB,0.818182,0.69697,0.939394,0.939394,0.818182,0.606061
JB*,0.972222,0.916667,1.0,1.0,0.972222,0.694444


### do this for all participants

In [28]:
def makeBigDf(logList,contextDf=contextDf):
    bigDf = pd.DataFrame()
    for logFile in logList:
        #print '\n',logFile
        try:
            thisDf = makeParticipant(logFile,contextDf=contextDf)
            bigDf = pd.concat([bigDf,thisDf])
        except:
            print '!!!!',logFile
            
    return bigDf*100

In [29]:
bigDf = makeBigDf(logList)

!!!! ../famousFiltered/experiment/data/10_famFaceFilter_2017_Feb_06_1253.csv
!!!! ../famousFiltered/experiment/data/18_famFaceFilter_2017_Feb_16_1655.csv
!!!! ../famousFiltered/experiment/data/19_famFaceFilter_2017_Feb_20_1110.csv
!!!! ../famousFiltered/experiment/data/23_famFaceFilter_2017_Feb_23_1717.csv
!!!! ../famousFiltered/experiment/data/24_famFaceFilter_2017_Feb_24_1121.csv
!!!! ../famousFiltered/experiment/data/26_famFaceFilter_2017_Feb_28_1246.csv
!!!! ../famousFiltered/experiment/data/27_famFaceFilter_2017_Mrz_01_1642.csv
!!!! ../famousFiltered/experiment/data/3_famFaceFilter_2017_Jan_25_1257.csv
!!!! ../famousFiltered/experiment/data/5_famFaceFilter_2017_Jan_26_1056.csv
!!!! ../famousFiltered/experiment/data/6_famFaceFilter_2017_Jan_27_1049.csv
!!!! ../famousFiltered/experiment/data/7_famFaceFilter_2017_Jan_30_1503.csv
!!!! ../famousFiltered/experiment/data/8_famFaceFilter_2017_Feb_01_1709.csv
!!!! ../famousFiltered/experiment/data/famousFilterClean_1.csv
!!!! ../famousFilt

In [30]:
bigDf.tail()

Unnamed: 0_level_0,edges,edges,gray,gray,smoo,smoo
Unnamed: 0_level_1,context,familiarity,context,familiarity,context,familiarity
2,100.0,92.982456,100.0,92.982456,98.245614,91.22807
4,100.0,100.0,98.611111,100.0,100.0,100.0
9,100.0,98.484848,96.969697,98.484848,89.393939,86.363636
JB,81.818182,69.69697,93.939394,93.939394,81.818182,60.606061
JB*,97.222222,91.666667,100.0,100.0,97.222222,69.444444


### now we merge with the logfile version

In [31]:
altDf = pd.read_csv('../famousFiltered/output/filteredChoiceLog.csv',index_col=[0],header=[0,1])
altDf.index = [str(x) for x in altDf.index]

In [32]:
makeParticipant(logList[-1])

type,edges,edges,gray,gray,smoo,smoo
Unnamed: 0_level_1,context,familiarity,context,familiarity,context,familiarity
jbRecog,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
JB,0.818182,0.69697,0.939394,0.939394,0.818182,0.606061
JB*,0.972222,0.916667,1.0,1.0,0.972222,0.694444


In [33]:
altDf.tail()

Unnamed: 0_level_0,smoo,smoo,edges,edges,gray,gray
Unnamed: 0_level_1,context,familiarity,context,familiarity,context,familiarity
5,100.0,100.0,98.611111,100.0,98.611111,100.0
6,92.592593,94.444444,92.592593,94.444444,98.148148,100.0
7,94.444444,98.148148,98.148148,100.0,96.296296,100.0
8,93.939394,87.878788,90.909091,89.393939,95.454545,90.909091
9,89.393939,86.363636,100.0,98.484848,96.969697,98.484848


### checking if the two versions give identical results  

In [34]:
for entry in bigDf.index:
    try:
        print entry
        print bigDf.loc[entry].round(10) == altDf.loc[entry].round(10)
    except:
        pass

11
12
13
14
15
16
17
1
20
21
22
25
2
4
9
JB
JB*


### merging the two

In [35]:
mergeDf = pd.concat([bigDf,altDf]).groupby(level=0).last().sort_index()

In [36]:
mergeDf.tail()

Unnamed: 0_level_0,edges,edges,gray,gray,smoo,smoo
Unnamed: 0_level_1,context,familiarity,context,familiarity,context,familiarity
7,98.148148,100.0,96.296296,100.0,94.444444,98.148148
8,90.909091,89.393939,95.454545,90.909091,93.939394,87.878788
9,100.0,98.484848,96.969697,98.484848,89.393939,86.363636
JB,81.818182,69.69697,93.939394,93.939394,81.818182,60.606061
JB*,97.222222,91.666667,100.0,100.0,97.222222,69.444444


In [37]:
mergeDf.to_csv('../famousFiltered/output/filteredChoice.csv')

In [38]:
pd.read_csv('../famousFiltered/output/filteredChoice.csv',index_col=[0],header=[0,1]).tail()

Unnamed: 0_level_0,edges,edges,gray,gray,smoo,smoo
Unnamed: 0_level_1,context,familiarity,context,familiarity,context,familiarity
7,98.148148,100.0,96.296296,100.0,94.444444,98.148148
8,90.909091,89.393939,95.454545,90.909091,93.939394,87.878788
9,100.0,98.484848,96.969697,98.484848,89.393939,86.363636
JB,81.818182,69.69697,93.939394,93.939394,81.818182,60.606061
JB*,97.222222,91.666667,100.0,100.0,97.222222,69.444444
