In [3]:
import re
import json
import datetime

import pandas as pd
import requests
import itertools

from geolite2 import geolite2
import matplotlib.pyplot as plt

In [4]:
from config import USERNAME, PASSWORD

In [5]:
% pylab inline

Populating the interactive namespace from numpy and matplotlib


## Read in the Raw Data

In [6]:
# url = 'https://%s:%s@svo.world/responses2/_all_docs/?include_docs=true' % (USERNAME, PASSWORD)
# res = requests.get(url)

In [7]:
# docs = [d['doc'] for d in res.json()['rows']]
# df = pd.DataFrame(docs)

In [8]:
# The raw data was stored locally so we can just 
with open('../data/raw/raw2.json') as data_file:    
    data = json.load(data_file)

In [9]:
responses = [r['doc'] for r in data['rows']]
raw_df = pd.DataFrame(responses)

### Remove Responses Missing Critical Fields
Review the records that aren't complete. If they're missing critical data that can't be derived, then they'll have to be dropped

- Those with 92% are missing Secondary Measures
- Those with 78 - 86% are missing ip, browser
- Those with 64% are missing events, completedAt, SVO - they got to secondary measures and quit
- Those with 42% didn't finish the initial survey

In [10]:
# Remove records with more than 50% empty fields
(raw_df.count(1) / raw_df.shape[1]).value_counts().sort_index(ascending=False)

1.000000    485
0.928571    294
0.857143     48
0.785714     11
0.642857      4
0.428571      8
dtype: int64

In [11]:
df = raw_df[(raw_df.count(1) / raw_df.shape[1]) > .7].copy()

In [12]:
df.reset_index(inplace=True, drop=True)

## Create Features

### Survey Duration

In [13]:
df.loc[:, 'startedAt'] = pd.to_datetime(df.startedAt)
df.loc[:, 'completedAt'] = pd.to_datetime(df.completedAt)
df.loc[:, 'duration'] = (df.completedAt - df.startedAt)
df.loc[:, 'durationSeconds'] = df.duration.apply(lambda d: d.total_seconds())

### Location from IP

In [14]:
# Fill in any responses without ip addresses
df.ip.fillna(False, inplace=True)

In [15]:
reader = geolite2.reader()

In [16]:
null_location = {
    'city': None, 
    'continent': None, 
    'country': None, 
    'lat': None, 
    'lon': None, 
    'accuracy': None, 
    'postal': None
}

In [17]:
def add_location_features(ip):
    if ip:
        loc = reader.get(ip)
        if loc:
            keys = loc.keys()
            return {
                'city': loc['city']['names']['en'] if 'city' in keys else None,
                'continent': loc['continent']['names']['en'] if 'continent' in keys else None,
                'country': loc['country']['names']['en'] if 'country' in keys else None,
                'lat': loc['location']['latitude'] if 'location' in keys else None,
                'lon': loc['location']['longitude'] if 'location' in keys else None,
                'accuracy': loc['location']['accuracy_radius'] if 'location' in keys else None,
                'postal': loc['postal']['code'] if 'postal' in keys else None
            }
        else:
            return null_location
    else:
        return null_location

In [18]:
location_records = list(df.ip.apply(add_location_features))

In [19]:
ldf = pd.DataFrame(location_records)

### State and Region from IP

In [20]:
import time
from geopy.geocoders import Nominatim, GoogleV3

In [21]:
#geolocator = Nominatim()
geolocator = GoogleV3(api_key="AIzaSyDM6PIhCXkhbSQu0jFHQDyDjlK-CFh8Mao")

In [22]:
def extract_state_nominatum(row):
    row['state'] = None
    if row['country'] == 'United States':
        geolocation = "%s, %s" % (row['lat'], row['lon'])
        location = geolocator.reverse(geolocation)
        if (location.raw and 
            'address' in location.raw.keys() and
            'state' in location.raw['address']
           ):
            row['state'] = location.raw.get('address', {}).get('state')
    return row

In [23]:
def reverse_geocode_google(row):
    geolocation = "{}, {}".format(row.get('lat'), row.get('lon'))
    if "nan" not in geolocation:
        time.sleep(.1)
        locations = geolocator.reverse(geolocation)
        if locations:
            location = locations[0]
            data = {'geolocation': geolocation, 'raw': location.raw}
        else:
            data = {'geolocation': geolocation, 'raw': None}
        return data

In [24]:
def process_google_geolocation(row):
    if not row:
        return None
    components = row.get('raw', {}).get('address_components', [])
    state = filter_components(components, 'administrative_area_level_1')
    country = filter_components(components, 'country')
    if state and country.get('long_name') == 'United States':
        state = state.get('long_name')
    else:
        state = None
    return state

In [25]:
def filter_components(components, type_):
    for component in components:
        if type_ in component.get('types', []):
            return component

In [26]:
# Reverse geocode the ip geolocations
# geolocations = ldf.apply(reverse_geocode_google, axis=1)

In [27]:
# Write the locations to a file for future use
# with open('../data/raw/ip_geolocations.json', 'w') as outfile:
#     outfile.write(geolocations.to_json(orient='records'))

In [28]:
with open('../data/raw/ip_geolocations.json') as infile:
    geolocations = list(json.load(infile))

In [29]:
# Extract the state information
states = [process_google_geolocation(loc) for loc in geolocations]

In [30]:
# Lowercase the states - Done below because standard case states are used for region lookup
# states_lower = [state.lower().replace(' ', '_') for state in states if state]

In [31]:
# Create the State Data Frame
sdf = pd.DataFrame(states, columns=['state'])

In [32]:
# Add the state feature to the location dataframe
# ldf['state'] = states

In [33]:
# Create a dataframe with census regions
# https://www2.census.gov/geo/docs/maps-data/maps/reg_div.txt
rdf = pd.read_csv('../data/regions.csv')

In [34]:
# Create a pair of dictionaries keyed by state and valued by region and division
regions = {r['state']:r['region'] for r in rdf.to_dict(orient='records')}
divisions = {r['state']:r['division'] for r in rdf.to_dict(orient='records')}

In [35]:
# Add the region and division features to the location dataframe
sdf.loc[:, "region"] = sdf.state.replace(regions)
sdf.loc[:, "region_division"] = sdf.state.replace(divisions)

In [36]:
# Create a dataframe of region dummies
region_dummies = pd.get_dummies(sdf[["region", "region_division"]])

In [37]:
region_dummies.columns = [c.replace(" ", "_").lower() for c in region_dummies.columns]

In [38]:
# Create a dataframe of state dummies
state_dummies = pd.get_dummies(sdf.state)

In [39]:
# Lowercase the state dummy columns
state_dummies.columns = [c.replace(" ", "_").lower() for c in state_dummies.columns]

In [40]:
# with_region_df = pd.concat([nldf, region_dummies], axis=1)

In [41]:
# Test that the location df is the same length as the state dummy df
# assert ldf.shape[0] == state_dummies.shape[0]

In [42]:
# Merge the location df with the state dummy df
# with_state_df = pd.merge(ldf, state_dummies, left_index=True, right_index=True)

In [43]:
# Test that the location df is the same length as the region dummy df
# assert ldf.shape[0] == region_dummies.shape[0]

In [44]:
# Merge the state df with the region dummy df
# with_region_df = pd.merge(with_state_df, region_dummies, left_index=True, right_index=True)

In [45]:
# Test that the main df is the same length as the location df
# assert df.shape[0] == ldf.shape[0]

In [46]:
# Merge the main df with the region df
# with_location = pd.merge(df, with_region_df, left_index=True, right_index=True)

In [47]:
# with_location_no_dummies = pd.merge(df, ldf, left_index=True, right_index=True)

In [48]:
# Replace spaces with underscores and lowercase
sdf = sdf.apply(lambda r: r.str.replace(" ", "_").str.lower())

In [49]:
assert sdf.shape[0] == df.shape[0]

### Browser Information Features

In [50]:
null_browser = {
    'appCodeName': None,
    'appName': None,
    'appVersion': None,
    'cookieEnabled': None,
    'hardwareConcurrency': None,
    'language': None,
    'maxTouchPoints': None,
    'onLine': None,
    'platform': None,
    'product': None,
    'productSub': None,
    'userAgent': None,
    'vendor': None,
    'vendorSub': None
}

In [51]:
browser = [null_browser if not v else v for v in list(df.browser.fillna(0))]
bdf = pd.DataFrame(browser)

In [52]:
assert bdf.shape[0] == df.shape[0]

In [53]:
# with_browser = pd.merge(with_location, bdf, left_index=True, right_index=True)

### Demographics Survey

In [54]:
demo_records = list(df.demoSurvey)

In [55]:
ddf = pd.DataFrame(demo_records)

In [56]:
# Change age to birthyear
ddf['birthyear'] = ddf.age

In [57]:
# Compute age from birthyear
current_year = datetime.date.today().year
ddf.loc[:, 'age'] = ddf.age.apply(lambda a: current_year - a)

In [58]:
# change the religiosity to a 0-4 scale
ddf['religiosity'] = ddf.religiosity.replace({
    'Not at all religious or spiritual': 0,
    'Slightly religious or spiritual': 1,
    'Somewhat religious or spiritual': 2,
    'Quite religious or spiritual': 3,
    'Very religious or spiritual': 4
})

In [59]:
# change the politics to a -2 to 2 scale
ddf['politics'] = ddf.politics.replace({
    'Very liberal or left-leaning': -2,
    'Somewhat liberal or left-leaning': -1,
    'Centrist': 0,
    'Somewhat conservative or right-leaning': 1,
    'Very convervative or right-leaning': 2
})

In [60]:
ddf['income'] = ddf.income.replace({
    'Less than $10,000': 0,
    '$10,000 to $19,999': 0,
    '$20,000 to $29,999': 1,
    '$30,000 to $39,999': 1,
    '$40,000 to $49,999': 2,
    '$50,000 to $59,999': 2,
    '$60,000 to $69,999': 2,
    '$70,000 to $79,999': 2,
    '$80,000 to $89,999': 3,
    '$90,000 to $99,999': 3,
    '$100,000 to $149,999': 3,
    'More than $150,000': 4,
    'Prefer not to answer': -1 # Note: this should be removed!  Just for testing...
})

In [61]:
# TODO: Add census categories / aggregations
# https://www.census.gov/library/visualizations/time-series/demo/cps-historical-time-series.html
ddf['education'] = ddf.education.replace({
    'Less than a high school degree': 0,
    'High school degree or equivalent (e.g. GED)': 1,
    'Some college but no degree': 2,
    'Associate degree': 3,
    'Bachelor degree': 3,
    "Master's degree": 4,
    'Ph.D.': 4
})

In [62]:
# Attachment Style
ddf['relationship-development'] = ddf['relationship-development'].astype(int)
ddf['trust-development'] = ddf['trust-development'].astype(int)
ddf['dependence-development'] = ddf['dependence-development'].astype(int)

In [63]:
# Add a feature for whether or not the respondant is Female
ddf['female'] = ddf.gender.replace({
    'Female': 1,
    'Male': 0,
    'other': 0
})

In [64]:
# Add a feature for whether or not the respondent is Other Gendered
ddf['other_gender'] = ddf.gender.replace({
    'Female': 0,
    'Male': 0,
    'other': 1
})

In [65]:
# One-hot encode the ethnicities
eth_dummies = pd.get_dummies(ddf.race)

In [66]:
# Update the columns of the ethnicities
eth_dummies.columns = [
    'american_indian', 'asian', 
    'black', 'pacific_islander', 
    'white', 'other'
]

In [67]:
ddf.loc[ddf.race == 'other', 'ethnicity_other'] = 1
ddf.loc[ddf.race != 'other', 'ethnicity_other'] = 0

In [68]:
assert ddf.shape[0] == eth_dummies.shape[0]

In [69]:
demodf = pd.concat([ddf, eth_dummies], axis=1)

In [70]:
demodf.columns = [c.replace("-", "_").lower() for c in demodf.columns]

In [72]:
# demodf.head()

### Interests

In [73]:
# Generate a list of interest Dummies
interests = list(itertools.chain(*list(ddf.interests)))

In [74]:
# Helper function to create interest dummies
def interest_dummies(interest_list):
    dummies = {}
    for interest in interests:
        if interest in interest_list:
            dummies[interest] = 1
        else:
            dummies[interest] = 0
    return dummies

In [75]:
# One-hot encode interests
interest_dummies_records = list(ddf.interests.apply(interest_dummies))

In [76]:
# Create a dataframe of the interest dummies
idf = pd.DataFrame(interest_dummies_records)

In [77]:
# Clean up the interest columns
interest_cols = [
    'arts', 'computers_tech', 'health_fitness', 
    'indoor', 'life_style', 'nature', 'outdoor',
    'performance_arts', 'sports', 'other'
]
idf.columns = interest_cols

In [78]:
assert idf.shape[0] == df.shape[0]

In [79]:
#interest_cols = [c.lower().replace(' ', '_').replace('&','_').replace('-','_') for c in with_interest_dummies.columns]

In [80]:
# add the interest dummies to the deographics dataframe
# with_interest_dummies = pd.concat([ddf, idf], axis=1)

In [81]:
# Create interest cols

#re.sub(r'(\s|&|-)', '_', "tac&o-time")
# with_interest_dummies.columns = interest_cols

In [82]:
# Drop the 'interests' column
# with_interest_dummies.drop('interests', axis=1, inplace=True)

In [83]:
# Test that the interest dummies is the same shape as the main dataframe
# assert with_interest_dummies.shape[0] == df.shape[0]

In [84]:
# with_demo = pd.concat([with_browser, with_interest_dummies], axis=1)

### Slider Events
This creates features relating to events that took place while the survey was being taken, like checking the instructions, moving the slider, etc.

In [85]:
events = list(df[df.events.notnull()].events)

In [86]:
combined_events = list(itertools.chain(*events))

In [87]:
## Event counts for each question
edf = pd.DataFrame(combined_events)

In [88]:
# Moved slider event count
survey_events = edf[edf.type == 'Moved Slider']
event_counts = (survey_events
    .groupby(['sessionId', 'question'])
    ._id
    .agg('count')
    .unstack()
    .reset_index()
    .fillna(0))

In [89]:
event_counts.columns = ['sessionId', 'ev0', 'ev1', 'ev2', 'ev3', 'ev4', 'ev5']

In [90]:
# with_event_counts = pd.merge(with_secondary_measures, event_counts, left_on='_id', right_on='sessionId', how='left')

### Instruction Events

In [91]:
# Instruction count
instruction = edf[edf.type != 'Moved Slider']
instruction_count = (instruction
    .groupby(['sessionId', 'question'])
    ._id
    .agg('count')
    .unstack()
    .reset_index()
    .fillna(0))

In [92]:
instruction_count.columns = ['sessionId', 'i0', 'i1', 'i2', 'i3', 'i4', 'i5']

In [93]:
# with_instruction_count = pd.merge(
#     with_event_counts,
#     instruction_count, 
#     left_on='_id',
#     right_on='sessionId',
#     how='left'
# )

In [94]:
# event_cols = [
#     'ev0', 'ev1', 'ev2',
#     'ev3', 'ev4', 'ev5',
#     'i0', 'i1', 'i2', 'i4', 'i5'
# ]

In [95]:
# with_instruction_count.loc[:, event_cols] = with_instruction_count.loc[:, event_cols].fillna(0)

### Self / other Answer Distances

In [96]:
answers = list(df.answers)
answers_list = list(itertools.chain(*answers))
adf = pd.DataFrame(answers_list)

In [97]:
adf['sodiff'] = adf.self - adf.other

In [98]:
answer_diffs = (adf[adf.question < 6]
    .groupby(['sessionId', 'question'])
    .sodiff.agg('sum')
    .unstack()
    .reset_index()
    .fillna(0))

In [99]:
answer_diffs.columns = ["sessionId", "ad0", "ad1", "ad2", "ad3", "ad4", "ad5"]

In [100]:
# with_demo = with_demo.merge(answer_diffs, left_on='_id', right_on='sessionId')

### Answer Times

In [101]:
### Todo, add the answer time for each question

### Interactions

In [102]:
region_cols = [
    'region_midwest', 'region_northeast', 
    'region_south', 'region_west',
]

In [103]:
with_instruction_count['rel_pol'] = with_instruction_count.religiosity * with_instruction_count.politics 

NameError: name 'with_instruction_count' is not defined

In [757]:
with_instruction_count['ed_pol'] = with_instruction_count.education * with_instruction_count.politics 

In [758]:
reg_pol = with_instruction_count[region_cols].multiply(with_instruction_count.politics, axis=0)
reg_pol.columns = [c + '_pol' for c in reg_pol.columns]

In [777]:
# reg_pol.columns

In [760]:
with_instruction_count = pd.concat([with_instruction_count, reg_pol], axis=1)

In [763]:
reg_rel = with_instruction_count[region_cols].multiply(with_instruction_count.relationship_development, axis=0)
reg_rel.columns = [c + '_rel' for c in reg_rel.columns]

In [766]:
with_instruction_count = pd.concat([with_instruction_count, reg_rel], axis=1)

In [779]:
# with_instruction_count.head()

In [769]:
#with_instruction_count['ed_income'] = with_instruction_count.education * with_instruction_count.income 

### Secondary Measures
Creates features for secondary measures on the main dataset. (This just moves the dictionary fro the source record up to the main record)

In [104]:
def secondary_measures(row):
    if not row:
        return {
            'dal': None,
            'dia': None,
            'dic': None,
            'djg': None,
            'ia': None
        }
    else:
        return row

In [105]:
secondary_records = list(df.secondaryMeasures.fillna(False).apply(secondary_measures))

In [106]:
secdf = pd.DataFrame(secondary_records)

In [107]:
# with_secondary_measures = pd.concat([with_demo, sdf], axis=1)

In [108]:
# sdf[(sdf.ia.notnull()) & (sdf.ia != 'does not fit criteria')].ia.astype(float).hist(bins=50)

### Prosocial

In [109]:
df.loc[:, "prosocial"] = df.type.replace({
        "prosocial": 1,
        "altruistic": 1,
        "individualistic": 0,
        "competitive": 0
    })

In [110]:
# with_instruction_count[["prosocial", "type"]]

### TODO: possible additional features:
 - distance to urban center
 - urban or rural classification

## Concatenate Data Frames

In [111]:
# Survery Duration: df
# location: ldf
# state and region: sdf
# state dummies: state_dummies
# region dummies: region_dummies
# browser: bdf
# Demographics: demodf
# Interests: idf
# Distance between self and other:
# Secondary Measures: secdf
# Interactions
# Prosocial
dfs = [df, demodf, ldf, sdf, idf, secdf]

In [112]:
full_df = pd.concat(dfs, axis=1)

In [113]:
full_df.shape

(838, 66)

In [115]:
# full_df.head()

In [117]:
# for df in dfs:
#     print(df.index)

In [118]:
# Events - Needs to be merged on sessionId
# Answers - Needs to be merged on sessionId
full_df = full_df.merge(event_counts, left_on='_id', right_on='sessionId', how='left')

In [119]:
full_df = full_df.merge(instruction_count, left_on='_id', right_on='sessionId', how='left')

In [120]:
print(df.shape)
print(demodf.shape)
print(ldf.shape)
print(sdf.shape)
print(secdf.shape)
print(full_df.shape)

(838, 5)
(838, 24)
(838, 7)
(838, 3)
(838, 5)
(838, 80)


## Drop Bad Records

### Incorrect Age

In [121]:
# Filter out the test repsonesed
filtered_df = full_df[full_df.birthyear != 1234].copy()

In [122]:
# Convert negative years to positive
filtered_df['birthyear'] = filtered_df.birthyear.apply(lambda b: abs(b))

In [123]:
# Add century to short-hand years
filtered_df.loc[filtered_df.birthyear < 1900, "birthyear"] = (filtered_df[filtered_df.birthyear < 1900]
     .birthyear
     .apply(lambda b: np.int("19{}".format(str(b)))))

In [124]:
# Compute age from birthyear
current_year = datetime.date.today().year
filtered_df.loc[:, 'age'] = filtered_df.birthyear.apply(lambda a: current_year - a)

In [125]:
filtered_df.shape

(831, 80)

### Outside US

In [126]:
filtered_df = filtered_df[
    (full_df.country == 'United States') |
    (full_df.country.isnull())
].copy()

  app.launch_new_instance()


In [129]:
# filtered_df.head()

### Duplicate IP addresses

In [None]:
duplicate_ips = filtered_df.ip.value_counts()[filtered_df.ip.value_counts() > 1].index

## Drop unused columns & prep output

In [130]:
# final_df = with_instruction_count.loc[with_instruction_count.svo.notnull(), :]

In [131]:
final_df = filtered_df.copy()

In [132]:
final_df = final_df.drop([
    '_rev', 'answers', 'browser', 'events', 
    'secondaryMeasures', 'demoSurvey', 
    'sessionId', 'sessionId_x', 'sessionId_y',
    'buildID', 'cookieEnabled', 'language',
    'pointerEnabled', 'product', 'onLine'], axis=1)

In [133]:
final_df.columns

Index(['_id', 'completedAt', 'ip', 'otherTotal', 'selfTotal', 'startedAt',
       'svo', 'type', 'duration', 'durationSeconds', 'prosocial', 'age',
       'dependence_development', 'education', 'gender', 'gender_comment',
       'income', 'interests', 'interests_comment', 'politics', 'race',
       'race_comment', 'relationship_development', 'religiosity',
       'trust_development', 'birthyear', 'female', 'other_gender',
       'ethnicity_other', 'american_indian', 'asian', 'black',
       'pacific_islander', 'white', 'other', 'accuracy', 'city', 'continent',
       'country', 'lat', 'lon', 'postal', 'state', 'region', 'region_division',
       'arts', 'computers_tech', 'health_fitness', 'indoor', 'life_style',
       'nature', 'outdoor', 'performance_arts', 'sports', 'other', 'dal',
       'dia', 'dic', 'djg', 'ia', 'ev0', 'ev1', 'ev2', 'ev3', 'ev4', 'ev5',
       'i0', 'i1', 'i2', 'i3', 'i4', 'i5'],
      dtype='object')

In [134]:
final_df.to_csv('../data/clean/surveys2.csv', encoding='utf-8', index=False)
# final_df.to_csv('../data/clean/surveys.csv', encoding='utf-8', index=False)

### Ideas to get additional features
 - [Browser history](http://bhavin.directi.com/using-javascript-to-read-a-users-browser-history/)

## Secondary Feature Analysis

### Rank order of preferences

### Transativity of preferences

### Secondary Item Analysis 

In [281]:
equality_points = {
    6: 81,
    7: 95,
    8: 81,
    9: 93,
    10: 85,
    11: 92,
    12: 75,
    13: 93,
    14: 92
}

# zeros at 2, 5, 7
maximizing_points = {
    6: 70,
    7: None,
    8: 100,
    9: 90,
    10: None,
    11: 100,
    12: None,
    13: 100,
    14: 90
}

others_gain = {
    6: 70,
    7: 90,
    8: 50,
    9: 90,
    10: 70,
    11: 50,
    12: 50,
    13: 100,
    14: 90
}

own_gain = {
    6: 100,
    7: 100,
    8: 100,
    9: 100,
    10: 100,
    11: 100,
    12: 100,
    13: 100,
    14: 100    
}

max_distances = {
    6: 30,
    7: 10,
    8: 50,
    9: 10,
    10: 30,
    11: 50,
    12: 50,
    13: 30,
    14: 10    
}

In [282]:
sessions = adf.groupby('sessionId')

In [283]:
test_session = sessions.get_group('0136909466a936daa3b8b26a0796d21f')

In [295]:
adf['range'] = adf.question.replace(max_distances)
adf['equality_point'] = adf.question.replace(equality_points)
adf['jg_point'] = adf.question.replace(maximizing_points)
adf['other_gain'] = adf.question.replace(others_gain)
adf['self_gain'] = adf.question.replace(own_gain)

In [298]:
secondary_df = adf.loc[adf.question > 5,:]

In [299]:
secondary_df['dia'] = secondary_df.apply(lambda r: abs(r['self'] - r['equality_point'])/r['range'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [300]:
secondary_df['jg_point'] = secondary_df.question.replace(maximizing_points)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [301]:
secondary_df['djg'] = secondary_df.apply(
    lambda r: (abs(r['self'] - r['jg_point']) / r['range']), axis=1).replace(np.nan, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [306]:
secondary_df['dal'] = secondary_df.apply(lambda r: abs(r['self'] - r['other_gain']) / r['range'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [308]:
secondary_df['dic'] = secondary_df.apply(lambda r: abs(r['self'] - r['self_gain']) / r['range'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [314]:
sessions = secondary_df.groupby('sessionId')

In [317]:
test_group = sessions.get_group('0136909466a936daa3b8b26a0796d21f')

In [321]:
dia = test_group.dia.sum() / 9
djg = test_group.djg.sum() / 9
dal = test_group.dal.sum() / 9
dic = test_group.dic.sum() / 9

In [329]:
if ((dia <= dal and dia <= dic) and (djg <= dal and djg <= dic)):
    print("yup")

yup


In [339]:
# dia
# djg
# dal
# dic

In [337]:
ia_index = dia / (dia + djg)

In [338]:
ia_index

0.068377060599024805

In [340]:
# Compute this for all of the groups

In [345]:
ias = sessions.agg({
    'dia': lambda x: np.sum(x) / 9,
    'djg': lambda x: np.sum(x) / 9,
    'dal': lambda x: np.sum(x) / 9,
    'dic': lambda x: np.sum(x) / 9,    
}).reset_index()

In [354]:
ias.apply(lambda r: r['dia'] / (r['dia'] + r['djg']), axis=1)

0     0.068377
1     0.124443
2     0.402020
3     0.417192
4     0.565637
5     0.399630
6     0.364910
7     0.292621
8     0.353120
9     0.463054
10    0.483945
11    0.337424
12    0.730287
13    0.124175
14    0.069293
15    0.311927
16    0.462843
17    0.170971
18    0.658799
19    0.595643
20    0.533263
21    0.071723
22    0.311927
23    0.311927
24    0.322889
25    0.311927
26    0.151864
27    0.292435
28    0.594959
29    0.311927
30    0.594959
31    0.429414
32    0.462008
33    0.422433
34    0.402291
35    0.349183
36    0.411434
37    0.370712
38    0.372252
dtype: float64

In [356]:
ias['ia'] = ias.apply(
    lambda r: r['dia'] / (r['dia'] + r['djg']) if \
    ((r['dia'] <= r['dal'] and r['dia'] <= r['dic'])\
    and (r['djg'] <= r['dal'] and r['djg'] <= r['dic'])) else None, axis=1) 

In [171]:
# sns.distplot(ias.ia[ias.ia.notnull()])