## Imports and database connection

In [3]:
# Database Interaction
import json
import ast
import pandas as pd
import csv
import requests
import time
import traceback
from sqlalchemy import *
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base

# Analytics packages
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn import metrics

from scipy.stats import beta
from scipy.stats import poisson

# Geospatial packages
import datetime
import instaconfig
from pytz import timezone
from geopy.distance import vincenty


# Pull from database
instagram, database, search_tags = instaconfig.config()

engine = create_engine('mysql://%(user)s:%(pass)s@%(host)s' % database)
engine.execute('use instagram')

q = '''
    SELECT *
    FROM posts
    WHERE searched_tag IN 
    %s ;
    ''' % ("('" + "','".join(search_tags[:6]) + "')")  # :7

#print q

df = pd.read_sql_query(q,con = engine)

# IPython plotting magic
#%matplotlib inline
#%pylab inline

## Identify spatial clusters

In [4]:
Xtrain = np.vstack((df.longitude, df.lat)).T
Xtrain *= np.pi/180

# Compute DBSCAN
db = DBSCAN(eps=4e-5, 
            min_samples=5,
            metric='haversine'
           ).fit(Xtrain)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = list(db.labels_)

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print('Estimated number of clusters: %d' % n_clusters_)

df['cluster_label'] = pd.Series(labels)

Estimated number of clusters: 1144


## Compute Cluster Radii

In [5]:
# Remove entries not belonging to a cluster
cluster_df = df[df['cluster_label'] != -1]

cluster_centroids = cluster_df[['cluster_label','lat','longitude']].groupby(['cluster_label']).mean().reset_index()
cluster_point_centroids = pd.merge(left = cluster_df[['cluster_label','lat','longitude']], right = cluster_centroids, how = 'inner', on = 'cluster_label', suffixes = ('_point','_centroid')).reset_index()

# Distance measure
def dist(x1,x2,y1,y2):
    return vincenty((x1,x2),(y1,y2)).miles
    #return np.sqrt(np.square(x1-y1)+np.square(x2-y2))

# Calculate distance of each point from centroid
cluster_point_centroids['distance_from_centroid'] = cluster_point_centroids.apply(lambda x: dist(x['lat_point'], x['longitude_point'], x['lat_centroid'], x['longitude_centroid']),axis = 1)

# Max over all distances from centroid
cluster_radius = cluster_point_centroids.groupby('cluster_label').max().reset_index()
cluster_radius = cluster_radius[['cluster_label','lat_centroid','longitude_centroid','distance_from_centroid']]

In [6]:
# Check if point should be assigned to cluster
import random

def check_spatial_membership(point):
    matches = cluster_radius[cluster_radius.apply(lambda x: vincenty(point,(x['lat_centroid'],x['longitude_centroid'])) <= x['distance_from_centroid'], axis = 1)]['cluster_label'].reset_index()
    if len(matches) < 1:
        return -1
    elif len(matches) == 1:
        return int(matches['cluster_label'])
    else:
        print "Something odd may happen"
        return int(matches.ix[random.sample(matches.index,1)]['cluster_label'])

## Identify Temporal Trends

In [7]:
# Identify time scales for binning
df1 = df[['created_time','cluster_label']]
df1['day_of_week'] = df['created_time'].apply(lambda x: int(datetime.datetime.fromtimestamp(x).weekday()))
df1['day_of_month'] = df['created_time'].apply(lambda x: datetime.datetime.fromtimestamp(x).day)
df1['hour_of_day'] = df['created_time'].apply(lambda x: datetime.datetime.fromtimestamp(x).hour)
df1['month'] = df['created_time'].apply(lambda x: datetime.datetime.fromtimestamp(x).month)
df1['year'] = df['created_time'].apply(lambda x: datetime.datetime.fromtimestamp(x).year)

# Bin by some number of hours
time_bin_hours = 4
df1['hour_of_day'] = df1['hour_of_day'].apply(lambda x : x / time_bin_hours)

arrival_times = df1.groupby(['year','month','day_of_month','hour_of_day','day_of_week','cluster_label']).count().reset_index()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.htm

In [8]:
# Compute occupancy probability PER CLUSTER
bins_in_day = 24/time_bin_hours
bins_in_week = 7*bins_in_day

occupancy = (arrival_times.reset_index().groupby(['day_of_week','hour_of_day','cluster_label']).count()['year']).reset_index()
day_bins = np.zeros([7,bins_in_day,n_clusters_])

# unpack occupancy
for ii in occupancy.index:
     day_bins[int(occupancy.loc[ii]['day_of_week'])][int(occupancy.loc[ii]['hour_of_day'])][int(occupancy.loc[ii]['cluster_label'])] = occupancy.loc[ii]['year']

l, u, md = np.zeros(bins_in_week*n_clusters_), np.zeros(bins_in_week*n_clusters_), np.zeros(bins_in_week*n_clusters_)
day_bins = day_bins.reshape([bins_in_week*n_clusters_])

# average
md = np.divide(day_bins, 243)

# MAP estimates per bin
for ii in xrange(0,bins_in_week*n_clusters_):
    a, b = day_bins[ii], 243-day_bins[ii]             # Default = 243
    alpha = .05                                       # leftover probability
    l[ii] = beta.ppf(alpha / 2.0, a=a, b=b)           # lower threshhold
    u[ii] = beta.ppf(1.0 - alpha / 2.0, a=a, b=b)     # upper threshhold
    md[ii] = np.divide(a - 1.0, a + b - 2.0)          # mode

In [9]:
# Bin by cluster and by time period down to x hour blocks
temp_bin = df1.groupby(['cluster_label','day_of_week','day_of_month','hour_of_day','month','year']).count().reset_index()

# Remove outliers and relabel column
temp_bin_no_spatial_outliers = temp_bin[temp_bin['cluster_label'] != -1]
temp_bin_no_spatial_outliers.rename(columns = {'created_time' : 'num_posts_per_time_slot'}, inplace = True)

posts_per_time_period = temp_bin_no_spatial_outliers.groupby(['num_posts_per_time_slot','hour_of_day']).count()['cluster_label'].reset_index()

def arrival_statistics(time_bin):
    dist_posts_per_time_period = posts_per_time_period[posts_per_time_period['hour_of_day'] == time_bin]['cluster_label'].reset_index()['cluster_label']
    emp_dist = (dist_posts_per_time_period/dist_posts_per_time_period.sum()).values
    mu = dot(np.array(range(0,len(emp_dist))),emp_dist)
    return mu

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)
