In [20]:
import graphlab as gl

########### initialize and get data ############

gl.canvas.set_target('browser')

sms_fn = '/assignment/input/SMSLog.csv'
calls_fn = '/assignment/input/CallLog.csv'
meetings_fn = '/assignment/input/BluetoothProximity.csv'
gender_fn = '/assignment/input/gender_labels.csv'

calls = gl.SFrame.read_csv(calls_fn, column_type_hints={'participantID.A': str,
                                                        'participantID.B': str,
                                                        'local_time' : str,
                                                        'type': str,
                                                        'duration': int,
                                                        'number.hash' : str})

sms = gl.SFrame.read_csv(sms_fn, column_type_hints={'participantID.A': str,
                                                    'participantID.B': str,
                                                    'local_time' : str,
                                                    'type': str,
                                                    'number.hash' : str})

meetings_directed = gl.SFrame.read_csv(meetings_fn, column_type_hints={'participantID': str,
                                                                        'date' : str,
                                                                        'participantID.B': str,
                                                                        'address' : str})

##### filter the transactions containing connection with another participant from the experiment #####
calls = calls[calls['participantID.B']!='']
sms = sms[sms['participantID.B']!='']
meetings_directed = meetings_directed[meetings_directed['participantID.B']!='']

##### extract the transaction's date #####
from datetime import datetime
calls['date'] = calls['local_time'].apply(lambda d: str(datetime.strptime(d,'%Y-%m-%d %H:%M:%S').date()))
sms['date'] = sms['local_time'].apply(lambda d: str(datetime.strptime(d,'%Y-%m-%d %H:%M:%S').date()))
meetings_directed['date'] = meetings_directed['date'].apply(lambda d: str(datetime.strptime(d,'%Y-%m-%d %H:%M:%S').date()))

##### arrange the initial transactions as source-destination according to the type- incoming/outgoing #####
calls['source'] = calls.apply(lambda l: l['participantID.A'] if l['type'] in ["outgoing","outgoing+"] else l['participantID.B'] )
calls['destination'] = calls.apply(lambda l: l['participantID.B'] if l['type'] in ["outgoing","outgoing+"] else l['participantID.A'] )
sms['source'] = sms.apply(lambda l: l['participantID.A'] if l['type'] in ["outgoing","outgoing+"] else l['participantID.B'] )
sms['destination'] = sms.apply(lambda l: l['participantID.B'] if l['type'] in ["outgoing","outgoing+"] else l['participantID.A'] )
meetings_directed['source'] = meetings_directed['participantID']  
meetings_directed['destination'] = meetings_directed['participantID.B']

PROGRESS: Finished parsing file /assignment/input/CallLog.csv
PROGRESS: Parsing completed. Parsed 164905 lines in 0.261995 secs.
PROGRESS: Finished parsing file /assignment/input/SMSLog.csv
PROGRESS: Parsing completed. Parsed 88655 lines in 0.12947 secs.
PROGRESS: Read 1247148 lines. Lines per second: 833987
PROGRESS: Read 9620365 lines. Lines per second: 1.4719e+06
PROGRESS: Finished parsing file /assignment/input/BluetoothProximity.csv
PROGRESS: Parsing completed. Parsed 13226970 lines in 8.61267 secs.


In [21]:
##### AVG_events_per_day #####
agg_calls = calls.groupby(['source','destination','date'], {'calls_per_day': gl.aggregate.COUNT()})
calls_edges = agg_calls.groupby(['source','destination'],{'AVG_per_day':  gl.aggregate.AVG('calls_per_day')}, gl.aggregate.COUNT())
agg_sms = sms.groupby(['source','destination','date'], {'sms_per_day': gl.aggregate.COUNT()})
sms_edges = agg_sms.groupby(['source','destination'],{'AVG_per_day':  gl.aggregate.AVG('sms_per_day')}, gl.aggregate.COUNT())
agg_meetings = meetings_directed.groupby(['source','destination','date'], {'meetings_per_day': gl.aggregate.COUNT()})
directed_meetings_edges = agg_meetings.groupby(['source','destination'],{'AVG_per_day':  gl.aggregate.AVG('meetings_per_day')}, gl.aggregate.COUNT())

min_num_of_days = 5
calls_edges = calls_edges[calls_edges['Count']>min_num_of_days]
sms_edges = sms_edges[sms_edges['Count']>min_num_of_days]
directed_meetings_edges = directed_meetings_edges[directed_meetings_edges['Count']>min_num_of_days]

calls_edges = calls_edges[['source','destination','AVG_per_day']]
sms_edges = sms_edges[['source','destination','AVG_per_day']]
directed_meetings_edges = directed_meetings_edges[['source','destination','AVG_per_day']]

##### create graphs #####
gender = gl.SFrame.read_csv('/assignment/input/gender_labels.csv', 
                                  column_type_hints={'user': str,
                                                     'gender': str})

Calls_graph = gl.SGraph()
Calls_graph = Calls_graph.add_vertices(gender,vid_field = 'user')
Calls_graph = Calls_graph.add_edges(calls_edges, src_field='source', dst_field='destination')

sms_graph = gl.SGraph()
sms_graph = sms_graph.add_vertices(gender,vid_field = 'user')
sms_graph = sms_graph.add_edges(sms_edges, src_field='source', dst_field='destination')

meetings_graph = gl.SGraph()
meetings_graph = meetings_graph.add_vertices(gender,vid_field = 'user')
meetings_graph = meetings_graph.add_edges(directed_meetings_edges, src_field='source', dst_field='destination')
meetings_graph = meetings_graph.add_edges(directed_meetings_edges, src_field='destination', dst_field='source')

PROGRESS: Finished parsing file /assignment/input/gender_labels.csv
PROGRESS: Parsing completed. Parsed 140 lines in 0.010651 secs.


In [22]:
##### pagerank #####
pr_calls = gl.pagerank.create(Calls_graph)
features = pr_calls['pagerank'].remove_columns(['delta'])

pr_sms = gl.pagerank.create(sms_graph)
tmp = pr_sms['pagerank'].remove_columns(['delta'])
tmp.rename({'pagerank': 'sms_pagerank'})
features = features.join(tmp,'__id',how='outer')

pr_meetings = gl.pagerank.create(meetings_graph)
tmp = pr_meetings['pagerank'].remove_columns(['delta'])
tmp['meetings_pagerank']=tmp['pagerank']
tmp.remove_columns(['pagerank'])
features = features.join(tmp,'__id',how='outer')

## remove outliers ##
# we have two centers that have a very large values and will affect the entire model
num_of_centers = 2
features = features.sort('pagerank',ascending=False)[num_of_centers:]

PROGRESS: Counting out degree
PROGRESS: Done counting out degree
PROGRESS: +-----------+-----------------------+
PROGRESS: | Iteration | L1 change in pagerank |
PROGRESS: +-----------+-----------------------+
PROGRESS: | 1         | 88.3717               |
PROGRESS: | 2         | 26.1226               |
PROGRESS: | 3         | 13.8743               |
PROGRESS: | 4         | 8.56995               |
PROGRESS: | 5         | 4.02769               |
PROGRESS: | 6         | 3.54613               |
PROGRESS: | 7         | 2.3888                |
PROGRESS: | 8         | 1.64477               |
PROGRESS: | 9         | 1.09385               |
PROGRESS: | 10        | 0.783415              |
PROGRESS: | 11        | 0.549024              |
PROGRESS: | 12        | 0.410203              |
PROGRESS: | 13        | 0.318068              |
PROGRESS: | 14        | 0.24516               |
PROGRESS: | 15        | 0.0952772             |
PROGRESS: | 16        | 0.164269              |
PROGRESS: | 17        |

In [23]:
##### in/out degree #####

def count_in_degree(src, edge, dst):
    dst['in_degree'] += edge['AVG_per_day']
    return (src, edge, dst)

def count_out_degree(src, edge, dst):
    src['out_degree'] += edge['AVG_per_day']
    return (src, edge, dst)

def get_degree(g):
    new_g = gl.SGraph(g.vertices, g.edges)
    new_g.vertices['in_degree'] = 0
    new_g.vertices['out_degree'] = 0
    new_g = new_g.triple_apply(count_in_degree, ['in_degree'])
    return new_g.triple_apply(count_out_degree, ['out_degree']).get_vertices()

features = get_degree(Calls_graph)[['__id','in_degree','out_degree']].join(features,'__id')
features.rename({'in_degree': 'calls_in_degree', 'out_degree':'calls_out_degree'})
features = get_degree(sms_graph)[['__id','in_degree','out_degree']].join(features,'__id')
features.rename({'in_degree': 'sms_in_degree', 'out_degree':'sms_out_degree'})
features = get_degree(meetings_graph)[['__id','in_degree']].join(features,'__id')
features.rename({'in_degree': 'meetings_degree'})

__id,meetings_degree,sms_in_degree,sms_out_degree,calls_in_degree,calls_out_degree,pagerank
fa10-01-14,909,10,10,9,7,3.57116116408
sp10-01-38,1865,28,22,10,16,2.90102020381
sp10-01-45,1625,16,19,14,14,2.85754832866
sp10-01-41,1990,13,16,12,13,2.1392302624
fa10-01-74,814,6,6,4,4,2.06182967695
fa10-01-73,611,5,5,4,1,1.89228409156
sp10-01-39,1237,27,33,16,16,1.69672918106
sp10-01-49,1849,21,26,11,15,1.52625027687
fa10-01-35,862,0,0,6,5,1.48875
fa10-01-36,1813,0,0,6,4,1.36125

sms_pagerank,meetings_pagerank
1.09481881721,1.10262948825
1.34388554723,1.78467293048
1.11826006217,1.32492059447
1.08095564414,1.7179144092
0.799170918741,0.454370612871
0.489824036745,0.361358585383
1.24937088019,1.17323271994
1.18835949025,1.74255837035
0.15,0.666061898471
0.15,0.76974517688


In [24]:
##### normalization #####

features_names = ['calls_in_degree','calls_out_degree','meetings_degree','meetings_pagerank','pagerank'\
                 ,'sms_in_degree','sms_out_degree','sms_pagerank']
maximum = gl.SFrame()
minimum = gl.SFrame()

for f in features_names:
    maximum[f] = gl.SArray([max(features[f])])
    minimum[f] = gl.SArray([min(features[f])])

for f in features_names:
    mn = minimum[f][0]
    mx = maximum[f][0]
    features[f] = features[f].apply(lambda val: (1.0*val-mn)/(mx-mn))

In [25]:
############ export to file ############
gender.rename({'user': 'id'})
features.rename({'__id': 'id'})
features = features.join(gender,'id')
features.rename({'gender': 'label'})
features.save('/assignment/features/features_network.csv', format='csv')

In [26]:
############ features selection ############
new_features = gl.SFrame.read_csv('/assignment/features/features_network.csv')
old_features = gl.SFrame.read_csv('/assignment/features/features_initial.csv')
features = new_features.join(old_features,'id')
features.remove_columns(['label','average_duration','portion_of_incoming_calls','portion_of_incoming_sms', \
                         'average_distinct_sms_contacts_per_day','average_number_of_calls_per_day','sms_pagerank'])

features.save('/assignment/features/features_full.csv', format='csv')

PROGRESS: Finished parsing file /assignment/features/features_network.csv
PROGRESS: Parsing completed. Parsed 100 lines in 0.01148 secs.
------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[str,float,float,float,float,float,float,float,float,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
PROGRESS: Finished parsing file /assignment/features/features_network.csv
PROGRESS: Parsing completed. Parsed 138 lines in 0.011136 secs.
PROGRESS: Finished parsing file /assignment/features/features_initial.csv
PROGRESS: Parsing completed. Parsed 100 lines in 0.011811 secs.
------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[str,float,float,float,float,float,float,float,float,float,float]
If parsing fails due to incorrec

In [27]:
############ export to file ############
def export_list_to_file(l,fn,features_names=None):
    import csv
    with open(fn, 'wb') as csvfile:
        f = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        if features_names: f.writerow(['id']+features_names)
        for i in l:
            f.writerow(i)
    return

In [28]:
'''

folds = gl.toolkits.cross_validation.KFold(data, 3)
params = dict([('target', 'gender'), ('features', features_names)])
job = gl.toolkits.cross_validation.cross_val_score(folds,
                                              gl.logistic_classifier.create,
                                              params)
print job.get_results()

'''


data = gl.SFrame.read_csv('/assignment/features/features_full.csv')
features_names = ['calls_in_degree','calls_out_degree','meetings_degree','pagerank'\
                   ,'sms_in_degree','sms_out_degree','meetings_pagerank', \
                        'average_distinct_meetings_contacts_per_day' \
                         ,'average_distinct_calls_contacts_per_day','average_number_of_sms_per_day', \
                         'average_number_of_meetings_per_day']

#svm_classifier
#logistic_classifier
#boosted_trees_classifier
#neuralnet_classifier
#nearest_neighbor_classifier

(train_set, test_set) = data.random_split(0.7)
train_set.save('/assignment/features/training_full.csv', format='csv')
test_set.save('/assignment/features/testing_full.csv', format='csv')

model = gl.logistic_classifier.create(train_set, target='gender', features=features_names, validation_set=test_set)
predictions = model.predict(test_set)

test_set['predicted'] = predictions
test_set = test_set[['id','gender','predicted']]

accuracy = gl.evaluation.accuracy(test_set['gender'],test_set['predicted'])

test_set.save('/assignment/output/tested_full.csv', format='csv')
export_list_to_file([['Model Accuracy'],['The model\'s accuracy is:'],[str(accuracy)]],'/assignment/output/accuracy_full.txt')

accuracy

PROGRESS: Finished parsing file /assignment/features/features_full.csv
PROGRESS: Parsing completed. Parsed 100 lines in 0.014044 secs.
------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[str,float,float,float,float,float,float,float,float,float,float,float,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
PROGRESS: Finished parsing file /assignment/features/features_full.csv
PROGRESS: Parsing completed. Parsed 120 lines in 0.010993 secs.
PROGRESS: Logistic regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 73
PROGRESS: Number of classes           : 2
PROGRESS: Number of feature columns   : 11
PROGRESS: Number of unpacked features : 11
PROGRESS: Number of coefficients    : 12
PROGRESS: Starting Newton Met

0.574468085106383