In [2]:
import os
import pandas as pd
import numpy as np
import csv
from itertools import islice
from scipy import stats

In [1]:
def getIPDict(ip_file):
    """
    Build the device -> IPs and IP -> cookies lookups from the IP file.

    Using code provided by Sudalai :)

    The first line of the file is treated as a header and skipped. Every
    other line is read as ``id, indicator, ip_info...`` where the indicator
    is '0' for a device and '1' for a cookie. The ip_info part is a
    brace-wrapped list of parenthesised tuples; only the first element of
    each tuple (the IP identifier) is kept.

    Reasoning:
    We need to find the cookies that are associated with a given device.
    Given a device we look up its IPs in the device dictionary, then use the
    IP dictionary to find the cookies seen on those IPs and link them back
    to the device.

    Parameters
    ----------
    ip_file : iterable of str
        An open text file (or any iterable of CSV lines) including the header.

    Returns
    -------
    (device_dict, ip_dict) : tuple of dict
        device_dict maps a device id to the list of its IPs.
        ip_dict maps an IP to the list of cookie ids seen on that IP.

    Raises
    ------
    ValueError
        If a row's indicator column is neither '0' nor '1'.
    """
    reader = csv.reader(ip_file)
    # Skip the header. The default keeps an empty file from raising
    # StopIteration; it simply yields two empty dictionaries.
    next(reader, None)

    device_dict = {}
    ip_dict = {}

    for counter, row in enumerate(reader, 1):
        # csv.reader yields [] for blank lines (e.g. a trailing newline).
        if not row:
            continue

        # The csv reader splits the tuple list on its inner commas, so glue
        # the pieces back together before stripping the braces/parentheses.
        ip_all_str = ','.join(row[2:])
        ip_tuples = (ip_all_str.replace("{", "")
                               .replace("}", "")
                               .replace("),(", " ")
                               .replace("(", "")
                               .replace(")", "")
                               .split(" "))
        # First field of each tuple is the IP identifier.
        ip_list = [ip_tuple.split(",")[0] for ip_tuple in ip_tuples]

        if row[1] == '0':
            # Device row: remember every IP this device was seen on.
            device_dict[row[0]] = ip_list
        elif row[1] == '1':
            # Cookie row: register the cookie under each of its IPs.
            for ip in ip_list:
                ip_dict.setdefault(ip, []).append(row[0])
        else:
            # A bare `raise` here had no active exception to re-raise;
            # raise an explicit, descriptive error instead.
            message = "Device or Cookie has unacceptable value.. Value :  %s" % row[1]
            print(message)
            raise ValueError(message)

        # Progress indicator for large files.
        if counter % 50000 == 0:
            print("Processed :  %d" % counter)

    return device_dict, ip_dict

In [3]:
# Notes: either edit the paths below to point at the data files, or (the lazy way) place this notebook in the
# same folder as the data.

# Load the files that lend themselves to a DataFrame; (un)comment the data frames below as you need them.

# TODO: we could probably build sets and intersect them to quickly collapse the data frames.
# Not done yet.

df_cookie_all_basic = pd.read_csv('cookie_all_basic.csv')
#df_dev_test_basic = pd.read_csv('dev_test_basic.csv')
#df_dev_train_basic = pd.read_csv('dev_train_basic.csv')
#df_ipagg_all = pd.read_csv('ipagg_all.csv')
#df_sample_submission = pd.read_csv('sampleSubmission.csv')

In [4]:
# Preview the first rows of the cookie table (rendered as this cell's output).
df_cookie_all_basic.head()

Unnamed: 0,drawbridge_handle,cookie_id,computer_os_type,computer_browser_version,country,anonymous_c0,anonymous_c1,anonymous_c2,anonymous_5,anonymous_6,anonymous_7
0,-1,id_10,computer_os_type_203,computer_browser_version_1158,country_146,1,anonymous_c1_1307,anonymous_c2_18778,15,1,204
1,handle_609762,id_100,computer_os_type_133,computer_browser_version_875,country_146,1,anonymous_c1_753,-1,41,1,204
2,handle_1622320,id_1000,computer_os_type_203,computer_browser_version_36,country_146,0,anonymous_c1_906,anonymous_c2_751,40,140,204
3,-1,id_10000,computer_os_type_203,computer_browser_version_36,country_146,1,anonymous_c1_255,anonymous_c2_2725,34,140,204
4,handle_1286628,id_100000,computer_os_type_203,computer_browser_version_875,country_146,0,anonymous_c1_255,anonymous_c2_235,52,134,204


In [5]:
# file config #
# NOTE(review): data_path is not used below; the IP file is opened relative to the
# working directory, matching the "notebook next to the data" setup described above.
data_path = "../Data/"

# Open the IP file in a context manager so the handle is closed as soon as parsing
# finishes (previously it was left open for the life of the kernel).
with open("id_all_ip.csv") as ip_file:
    print("Getting device and IP dict..")
    device_dict, ip_dict = getIPDict(ip_file)

# One row per device: the device id and the list of IPs it was seen on.
df_device_dict = pd.DataFrame(list(device_dict.items()), columns=['device_dict_key', 'device_dict_values'])

# One row per IP: the IP and the list of cookie ids seen on it.
df_ip_dict = pd.DataFrame(list(ip_dict.items()), columns=['ip_dict_key', 'ip_dict_values'])


Getting device and IP dict..
Processed :  50000
Processed :  100000
Processed :  150000
Processed :  200000
Processed :  250000
Processed :  300000
Processed :  350000
Processed :  400000
Processed :  450000
Processed :  500000
Processed :  550000
Processed :  600000
Processed :  650000
Processed :  700000
Processed :  750000
Processed :  800000
Processed :  850000
Processed :  900000
Processed :  950000
Processed :  1000000
Processed :  1050000
Processed :  1100000
Processed :  1150000
Processed :  1200000
Processed :  1250000
Processed :  1300000
Processed :  1350000
Processed :  1400000
Processed :  1450000
Processed :  1500000
Processed :  1550000
Processed :  1600000
Processed :  1650000
Processed :  1700000
Processed :  1750000
Processed :  1800000
Processed :  1850000
Processed :  1900000
Processed :  1950000
Processed :  2000000
Processed :  2050000
Processed :  2100000
Processed :  2150000
Processed :  2200000
Processed :  2250000
Processed :  2300000
Processed :  2350000


In [6]:
# Peek at the first three rows of the frames built from the two dictionaries.
# NOTE(review): a cell only renders its last expression, so the result of
# df_device_dict.head(3) is discarded and only the IP frame is shown. Split this
# into two cells (or display both explicitly) if the device preview is wanted too.
df_device_dict.head(3)
df_ip_dict.head(3)

Unnamed: 0,ip_dict_key,ip_dict_values
0,ip11880904,[id_1824690]
1,ip19010203,[id_4161533]
2,ip2117664,"[id_2826439, id_3355111]"
