<a href="https://colab.research.google.com/github/manjitullal/foursquare/blob/master/FourSquare_Temporal_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Temporal and Spatial analysis of event data using RNN**



**Dataset:** foursquare

**Aim:** to predict a user's future location given their current location.
This is analogous to predicting the next word in a sentence.


**Contents:**

1. Data pre processing
2. Modeling
3. Training

### **1. Data pre processing**


In [0]:
import pandas as pd
import numpy as np
import torch
from google.colab import drive
import warnings
warnings.filterwarnings('ignore')

In [2]:
# The dataset is stored on Google Drive; mount the drive at /content/drive so the files can be read.

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
#my google drive path

!ls "/content/drive/My Drive/dataset/foursquare"
path = "/content/drive/My Drive/dataset/foursquare"

Checkin.txt	 UserFriends.txt    VenueRating.txt
Description.txt  VenueCategory.txt  Venue.txt


In [4]:
%%time
# Column names applied to Checkin.txt; the first line of the file is skipped and these names are used instead.
Checkin_columns = ['UserID', 'VenueID', 'Year', 'Month', 'Date', 'Hour']
Checkin = pd.read_csv(
    f"{path}/Checkin.txt",
    sep=',',
    skiprows=1,
    names=Checkin_columns,
)

CPU times: user 678 ms, sys: 146 ms, total: 824 ms
Wall time: 2.29 s


In [5]:
%%time

# NOTE: `error_bad_lines=False` was deprecated in pandas 1.3 and removed in pandas 2.0.
# `on_bad_lines='warn'` (pandas >= 1.3) keeps the same behaviour: malformed rows are
# reported and skipped instead of aborting the whole load.
Venue_columns = ['VenueID','VenueName','Latitude','Longitude','CategoryID']
Venue = pd.read_csv(path+'/Venue.txt', sep=',', on_bad_lines='warn', skiprows=1, names=Venue_columns)

VenueCategory_columns = ['CategoryID','CategoryName','ParentCategoryID']
VenueCategory = pd.read_csv(path+'/VenueCategory.txt', sep=',', on_bad_lines='warn', skiprows=1, names=VenueCategory_columns)

VenueRating_columns = ['VenueID','Rating']
VenueRating = pd.read_csv(path+'/VenueRating.txt', sep=',', on_bad_lines='warn', skiprows=1, names=VenueRating_columns)

# UserFriends.txt is read with its own first line as the header.
UserFriends = pd.read_csv(path+'/UserFriends.txt', sep=',')

# Parallel lists: each table and the name used when reporting on it.
all_tables = [Checkin,Venue,VenueCategory,VenueRating,UserFriends]
all_tables_string = ['Checkin','Venue','VenueCategory','VenueRating','UserFriends']

CPU times: user 949 ms, sys: 71.2 ms, total: 1.02 s
Wall time: 3.91 s


In [6]:
Checkin.head()

Unnamed: 0,UserID,VenueID,Year,Month,Date,Hour
0,u1302,v47,2012,2,24,11
1,u45,v132,2012,2,24,11
2,u24844,v86,2012,2,24,11
3,u896,v248,2012,2,24,11
4,u5020,v29,2012,2,24,11


In [7]:
#stats of the data 

def _describe(data):
    print(f" Number of rows: {data.shape[0]}")
    print(f" Number of columns: {data.shape[1]}")
    print(f" Number of null values: {np.sum(data.isnull().sum())}")
    print("The columns that have null values")
    print(pd.DataFrame(data.isnull().sum()).T)
    
# Report on every loaded table, labelling each report with the table's name.
for table_name, table in zip(all_tables_string, all_tables):
    print(f"Details of table {table_name}")
    print("")
    _describe(table)
    print("")

Details of table Checkin

 Number of rows: 1276988
 Number of columns: 6
 Number of null values: 0
The columns that have null values
   UserID  VenueID  Year  Month  Date  Hour
0       0        0     0      0     0     0

Details of table Venue

 Number of rows: 85928
 Number of columns: 5
 Number of null values: 14
The columns that have null values
   VenueID  VenueName  Latitude  Longitude  CategoryID
0        0         12         2          0           0

Details of table VenueCategory

 Number of rows: 394
 Number of columns: 3
 Number of null values: 0
The columns that have null values
   CategoryID  CategoryName  ParentCategoryID
0           0             0                 0

Details of table VenueRating

 Number of rows: 68178
 Number of columns: 2
 Number of null values: 96
The columns that have null values
   VenueID  Rating
0       96       0

Details of table UserFriends

 Number of rows: 1366388
 Number of columns: 2
 Number of null values: 0
The columns that have null valu

We will initially test the model on a single user, so we filter the data for that user; eventually this will be extended to all users.

In [8]:
# Filter the check-ins down to a single user.
# `.copy()` makes the result an independent frame. Later cells drop, rename and add
# columns on it; doing that on a bare boolean-mask slice triggers pandas'
# SettingWithCopyWarning (hidden here by warnings.filterwarnings('ignore')).

Checkin_u1205 = Checkin[Checkin.UserID == 'u1205'].copy()
Checkin_u1205.head()

Unnamed: 0,UserID,VenueID,Year,Month,Date,Hour
2723,u1205,v73805,2012,2,25,9
3817,u1205,v9884,2012,2,25,11
4739,u1205,v3906,2012,2,25,13
5904,u1205,v10373,2012,2,25,15
6840,u1205,v9884,2012,2,25,17


In [9]:
# UserID is constant after filtering to a single user, so it is dropped.
# The `Date` column is renamed to `Day`.
# Reassigning (instead of inplace=True) avoids mutating a frame that an earlier cell
# displayed, and errors='ignore' lets this cell be re-run without a KeyError once
# UserID has already been removed.
Checkin_u1205 = (
    Checkin_u1205
    .drop(columns=['UserID'], errors='ignore')
    .rename(columns={"Date": "Day"})
)
Checkin_u1205.head()

Unnamed: 0,VenueID,Year,Month,Day,Hour
2723,v73805,2012,2,25,9
3817,v9884,2012,2,25,11
4739,v3906,2012,2,25,13
5904,v10373,2012,2,25,15
6840,v9884,2012,2,25,17


In [10]:
# create a new column, datetime to sort the events 

%%time
# Build one timestamp per row from the Year/Month/Day/Hour columns
# (presumably the reason `Date` was renamed to `Day` beforehand — confirm).
Checkin_u1205['Datetime'] = pd.to_datetime(Checkin_u1205[['Year', 'Month', 'Day', 'Hour']])
Checkin_u1205.head()

CPU times: user 15.9 ms, sys: 1.1 ms, total: 17 ms
Wall time: 34.4 ms


In [11]:
# Sort chronologically.
# A stable sort (mergesort) keeps rows that share a timestamp in their original order,
# so which of several same-hour check-ins comes first (and therefore survives the later
# drop_duplicates('Datetime')) is well defined rather than an artefact of the sort algorithm.
# Reassigning instead of inplace=True keeps the cell free of in-place side effects.
Checkin_u1205 = Checkin_u1205.sort_values(by='Datetime', kind='mergesort')

Checkin_u1205.head()

Unnamed: 0,VenueID,Year,Month,Day,Hour,Datetime
2723,v73805,2012,2,25,9,2012-02-25 09:00:00
9154,v40561,2012,2,25,9,2012-02-25 09:00:00
3817,v9884,2012,2,25,11,2012-02-25 11:00:00
10664,v1743,2012,2,25,11,2012-02-25 11:00:00
4739,v3906,2012,2,25,13,2012-02-25 13:00:00


In [0]:
# The sorted rows above show that some timestamps occur more than once, each time with a different venue.
# A person cannot be at two locations at the same time, so only the first row for each timestamp is kept.
# Venue categories are hierarchical, but there appears to be no link between the venues themselves.

Checkin_u1205_nodup = Checkin_u1205.drop_duplicates(subset='Datetime', keep='first')

In [13]:
# Row counts before and after removing duplicate timestamps.
print("Rows in Checkin_u1205: ", len(Checkin_u1205))
print("Rows in Checkin_u1205_nodup: ", len(Checkin_u1205_nodup))

Rows in Checkin_u1205:  1303
Rows in Checkin_u1205_nodup:  1227


In [14]:
Checkin_u1205_nodup.head()

Unnamed: 0,VenueID,Year,Month,Day,Hour,Datetime
2723,v73805,2012,2,25,9,2012-02-25 09:00:00
3817,v9884,2012,2,25,11,2012-02-25 11:00:00
4739,v3906,2012,2,25,13,2012-02-25 13:00:00
5904,v10373,2012,2,25,15,2012-02-25 15:00:00
6840,v9884,2012,2,25,17,2012-02-25 17:00:00


In [15]:
Checkin_u1205_nodup.iloc[:12]

Unnamed: 0,VenueID,Year,Month,Day,Hour,Datetime
2723,v73805,2012,2,25,9,2012-02-25 09:00:00
3817,v9884,2012,2,25,11,2012-02-25 11:00:00
4739,v3906,2012,2,25,13,2012-02-25 13:00:00
5904,v10373,2012,2,25,15,2012-02-25 15:00:00
6840,v9884,2012,2,25,17,2012-02-25 17:00:00
18507,v9885,2012,2,26,9,2012-02-26 09:00:00
12801,v10373,2012,2,26,11,2012-02-26 11:00:00
11674,v9885,2012,2,26,13,2012-02-26 13:00:00
15263,v2927,2012,2,26,15,2012-02-26 15:00:00
20745,v6013,2012,2,27,9,2012-02-27 09:00:00


For the time being we are not using the hierarchical information about the venues, to keep the baseline model simple.

Now, we need to create a time-series of events. 

`Example:`

User goes to gym, grocery and home in that order or shopping, movies, restaurant and home in that order. 

Here, we need to create the longest time-series.


In [18]:
# Gather the longest time-series by walking through the user's events: consecutive events less than 8 hours apart are added to the same series.
# The idea is that a gap of 8 hours or more indicates the user's day has ended, hence no more travel.

# For testing purposes, only the first 5 rows of the input are used.

%%time

import datetime

def _generate_events(data):
  previous_time = datetime.datetime(2020, 12, 31)
  all_events = []
  current_events = []
  for index, row in data.iterrows():
    current_time = row['Datetime']  
    current_hour = row['Hour']
    venue = row['VenueID']
    if( (current_time - previous_time).total_seconds()/60/60 < 8):
      current_events.append([venue, current_hour])
      previous_time = current_time
    else:
      all_events.append(current_events)
      current_events = []
      current_events.append([venue, current_hour])
      previous_time = current_time
  if len(current_events)>0:
    all_events.append(current_events)
  #print(all_events)
  return all_events

# Smoke test: build sessions from the first five check-ins only.
events = _generate_events(Checkin_u1205_nodup.iloc[:5])
print(events)

[[['v73805', 9], ['v9884', 11], ['v3906', 13], ['v10373', 15], ['v9884', 17]]]
CPU times: user 2.13 ms, sys: 3 µs, total: 2.13 ms
Wall time: 2.12 ms


In [21]:
# create all possible subsequences from the above sequence (maintaining the order)
# we need a length of at least 2 (the features plus the label), so all sequences of events shorter than 2 are ignored
# example
# for a sequence (v1,t1) (v2,t2) (v3,t3) (v4,t4)
# there are
# (v1,t1) (v2,t2)
# (v1,t1) (v2,t2) (v3,t3)
# (v1,t1) (v2,t2) (v3,t3) (v4,t4)
# (v2,t2) (v3,t3)
# (v2,t2) (v3,t3) (v4,t4)
# (v3,t3) (v4,t4)


%%time

def _generate_subsequence(data):
  all_sequences = []
  for sequence in data:  
    if len(sequence) > 2:
      for i in range(0,len(sequence)-2):
        sequences = []
        for j in range(i+2,len(sequence)+1):      
          sequences.append(sequence[i:j])          
        all_sequences.append(sequences)
  #print(all_sequences)
  return all_sequences

# Expand the sessions in `events` into subsequences and inspect the result.
sequences = _generate_subsequence(events)
print(sequences)

[[[['v73805', 9], ['v9884', 11]], [['v73805', 9], ['v9884', 11], ['v3906', 13]], [['v73805', 9], ['v9884', 11], ['v3906', 13], ['v10373', 15]], [['v73805', 9], ['v9884', 11], ['v3906', 13], ['v10373', 15], ['v9884', 17]]], [[['v9884', 11], ['v3906', 13]], [['v9884', 11], ['v3906', 13], ['v10373', 15]], [['v9884', 11], ['v3906', 13], ['v10373', 15], ['v9884', 17]]], [[['v3906', 13], ['v10373', 15]], [['v3906', 13], ['v10373', 15], ['v9884', 17]]]]
CPU times: user 267 µs, sys: 46 µs, total: 313 µs
Wall time: 253 µs
