In [22]:
import csv
import itertools
import os
import pickle
import random

from tqdm import tqdm
import networkx as nx
import statistics
from models import Rating, Friendship, User


class Graph:
    """User-similarity graph built from venue ratings, plus influence statistics.

    Nodes are user ids. An edge joins two users who rated at least one common
    venue; its weight is the mean rating similarity over those venues, scaled
    by how many venues they share (see ``__get_judgement_validity``).
    Each node carries its venue ratings, its coordinates and the set of users
    it follows (or ``None`` when it follows nobody listed in the input).
    """

    DATA_ADDRESS = "data"
    GRAPH_FILE_NAME = "graph.txt"
    # With at least this many common venues, judgement_validity is 1;
    # below it the validity scales linearly with the number of common venues.
    JUDGEMENT_VALIDITY_LIMIT = 3
    VENUE_METADATA_FIELD = "venue_ratings"
    FOLLOWING_METADATA_FIELD = "followings"
    LONGITUDE_FIELD = "longitude"
    LATITUDE_FIELD = "latitude"

    def __init__(self):
        self.graph = nx.Graph()

    def get_average_influence_for_top_influential_users(
            self,
            top_influencers_percentage_start,
            top_influencers_percentage_end
    ):
        """Return the mean influence of a slice of the most "valuable" users.

        Users are ranked ascending by the value from ``__get_users_values``.
        Both arguments are multiplied directly by the user count, so they are
        fractions (e.g. ``0.0`` and ``0.1`` select the top 10%).

        Raises:
            statistics.StatisticsError: if the selected slice is empty.
        """
        ranked_users = sorted(self.__get_users_values().items(), key=lambda item: item[1])
        user_count = len(ranked_users)
        end = user_count - int(user_count * top_influencers_percentage_start)
        start = user_count - int(user_count * top_influencers_percentage_end)
        top_influential_users = [user_id for user_id, _ in ranked_users[start:end]]
        influences = [self.__calculate_user_influence(user_id) for user_id in top_influential_users]
        return statistics.mean(influences)

    def __calculate_user_influence(self, user_id):
        """Return the share of (venue, neighbour) pairs where the neighbour gave the same rate.

        Only venues rated by both the user and the neighbour are counted.
        Returns ``0.0`` when the user shares no venue with any neighbour
        (previously this raised ``ZeroDivisionError``).
        """
        user_ratings = self.graph.nodes[user_id][self.VENUE_METADATA_FIELD]
        neighbors = list(self.graph.neighbors(user_id))
        total_common_venues_with_neighbors = 0
        total_influences_on_neighbors = 0
        for venue_id, user_rate in user_ratings.items():
            for node in neighbors:
                neighbor_rate = self.graph.nodes[node][self.VENUE_METADATA_FIELD].get(venue_id)
                if not neighbor_rate:
                    continue
                total_common_venues_with_neighbors += 1
                # The original strict bounds (rate - 1 < x < rate + 1) on integers
                # are exactly an equality test.
                # NOTE(review): if a +/-1 tolerance was intended, this needs `<=` bounds.
                if int(neighbor_rate) == int(user_rate):
                    total_influences_on_neighbors += 1
        if total_common_venues_with_neighbors == 0:
            return 0.0
        return total_influences_on_neighbors / total_common_venues_with_neighbors

    def __get_users_values(self):
        """Map each node to the mean popularity of the rates it gave.

        For every venue a user rated, look up the fraction of all raters of
        that venue who gave the same rate, then average those fractions.
        """
        venue_ratings_percentage = self.__venue_ratings_percentage()
        values = {}
        for node in self.graph.nodes:
            rates = [
                venue_ratings_percentage[venue_id][rate]
                for venue_id, rate in self.graph.nodes[node][self.VENUE_METADATA_FIELD].items()
            ]
            values[node] = sum(rates) / len(rates)
        return values

    def __venue_ratings_percentage(self):
        """Return ``{venue_id: {"1".."5": fraction of that venue's ratings}}`` from the ratings file."""
        ratings = Rating.read_ratings(f'{self.DATA_ADDRESS}/ratings.txt')
        venue_ratings_percentage = {}
        for rate in ratings:
            per_venue = venue_ratings_percentage.setdefault(
                rate.venue_id, {"1": 0, "2": 0, "3": 0, "4": 0, "5": 0}
            )
            per_venue[rate.rate] += 1

        # Turn the raw counts into fractions of the venue's total ratings.
        for per_venue in venue_ratings_percentage.values():
            total = sum(per_venue.values())
            for key in per_venue:
                per_venue[key] = per_venue[key] / total

        return venue_ratings_percentage

    def get_average_friends_influence_on_users_rate(self):
        """Return the mean, over users, of how often a user's rate equals a following's rate.

        Users without followings, or without any venue in common with their
        followings, are excluded. Users whose score is exactly ``0.0`` are
        kept: the previous ``filter(None, ...)`` silently dropped them and
        inflated the average.

        Raises:
            statistics.StatisticsError: if no user has a usable score.
        """
        friends_influence_on_users = []
        for node in self.graph.nodes:
            if self.graph.nodes[node][self.FOLLOWING_METADATA_FIELD] is None:
                continue
            influence = self.__get_friends_influence_on_user(node)
            if influence is not None:
                friends_influence_on_users.append(influence)
        return statistics.mean(friends_influence_on_users)

    def __get_friends_influence_on_user(self, node):
        """Return the fraction of shared (venue, following) pairs with identical rates.

        Returns ``None`` when the user shares no rated venue with any following.
        """
        total_records = 0
        total_influences = 0
        followings = self.graph.nodes[node][self.FOLLOWING_METADATA_FIELD]
        for venue_id, user_rate_for_venue in self.graph.nodes[node][self.VENUE_METADATA_FIELD].items():
            for following in followings:
                following_rate_for_venue = self.graph.nodes[following][self.VENUE_METADATA_FIELD].get(venue_id, None)
                if following_rate_for_venue is None:
                    continue
                total_records += 1
                if following_rate_for_venue == user_rate_for_venue:
                    total_influences += 1
        if total_records == 0:
            return None
        return total_influences / total_records

    def set_nodes_and_edges(self):
        """Populate ``self.graph`` from the ratings, users and friendships files.

        Every pair of users with at least one commonly rated venue gets a
        weighted edge; both endpoints receive their ratings and coordinates.
        Afterwards each node gets its followings, restricted to users that are
        present in the graph (``None`` if the user has no friendship rows).
        """
        user_venue_ratings = self.__get_user_venue_ratings()
        friendships = self.__get_friendships()
        users = self.__get_users()
        user_count = len(user_venue_ratings)
        print("calculating edges ...")
        # Stream the pairs instead of materialising all n*(n-1)/2 tuples in a list;
        # `total` keeps the progress bar informative.
        user_pairs = itertools.combinations(user_venue_ratings, 2)
        for first, second in tqdm(user_pairs, total=user_count * (user_count - 1) // 2):
            second_ratings = user_venue_ratings[second]
            temp_rating_diffs = []
            for vid, first_rate in user_venue_ratings[first].items():
                second_rate = second_ratings.get(vid)
                if second_rate:
                    diff = int(second_rate) - int(first_rate)
                    temp_rating_diffs.append((5 - abs(diff)) / 5)

            if not temp_rating_diffs:
                continue

            judgement_validity = self.__get_judgement_validity(len(temp_rating_diffs))
            self.graph.add_edge(first, second, weight=statistics.mean(temp_rating_diffs) * judgement_validity)

            for user_id in (first, second):
                attributes = self.graph.nodes[user_id]
                attributes[self.VENUE_METADATA_FIELD] = user_venue_ratings[user_id]
                longitude, latitude = users[user_id]
                attributes[self.LONGITUDE_FIELD] = longitude
                attributes[self.LATITUDE_FIELD] = latitude

        for node in tqdm(self.graph.nodes):
            followings = friendships.get(node)
            if followings is None:
                self.graph.nodes[node][self.FOLLOWING_METADATA_FIELD] = None
                continue
            # Drop followings that never made it into the graph (no edge was created for them).
            missing = [following for following in followings if following not in self.graph.nodes]
            followings.difference_update(missing)
            self.graph.nodes[node][self.FOLLOWING_METADATA_FIELD] = followings

    def __get_user_venue_ratings(self):
        """Return ``{user_id: {venue_id: rate}}`` read from the ratings file."""
        ratings = Rating.read_ratings(f'{self.DATA_ADDRESS}/ratings.txt')
        user_venue_ratings = {}
        for rate in ratings:
            user_venue_ratings.setdefault(rate.user_id, {})[rate.venue_id] = rate.rate

        return user_venue_ratings

    def __get_friendships(self):
        """Return ``{first: {second, ...}}`` read from the friendships file."""
        friendships = Friendship.read_friendships(f'{self.DATA_ADDRESS}/friendships.txt')
        followings = {}
        for friendship in friendships:
            followings.setdefault(friendship.first, set()).add(friendship.second)
        return followings

    def __get_users(self):
        """Return ``{user_id: (longitude, latitude)}`` read from the users file."""
        return {user.identifier: (user.long, user.lat) for user in User.read_users(f"{self.DATA_ADDRESS}/users.txt")}

    def __get_judgement_validity(self, amount):
        """Return a confidence factor in (0, 1] for a similarity based on ``amount`` common venues."""
        if amount >= self.JUDGEMENT_VALIDITY_LIMIT:
            return 1
        return amount / self.JUDGEMENT_VALIDITY_LIMIT

    def create_graph_from_inputs(self):
        """Build the graph from the input files and pickle it to the graph file."""
        self.set_nodes_and_edges()
        # `with` guarantees the file is flushed and closed even if pickling fails.
        with open(f'{self.DATA_ADDRESS}/{self.GRAPH_FILE_NAME}', 'wb') as file:
            pickle.dump(self.graph, file)

    def export_graph_to_csv(self, graph, prefix=""):
        """Write ``<prefix>nodes.csv`` and ``<prefix>edges.csv`` for ``graph`` into the data directory."""
        with open(f"{self.DATA_ADDRESS}/{prefix}nodes.csv", 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(["ID", "longitude", "latitude"])
            for node in graph.nodes:
                writer.writerow([
                    node,
                    graph.nodes[node][self.LONGITUDE_FIELD],
                    graph.nodes[node][self.LATITUDE_FIELD]
                ])
        with open(f"{self.DATA_ADDRESS}/{prefix}edges.csv", 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(["Source", "Target", "weight"])
            for edge in graph.edges:
                writer.writerow([edge[0], edge[1], graph.get_edge_data(*edge)["weight"]])

    def get_limited_random_graph(self, num_of_nodes, seed=None):
        """Return a new graph induced by ``num_of_nodes`` randomly sampled nodes.

        Only edges whose two endpoints were both sampled are copied, so a
        sampled node with no such edge does not appear in the result.
        Passing ``seed`` reseeds the module-level ``random`` generator.
        """
        assert num_of_nodes < self.graph.number_of_nodes()
        if seed is not None:
            random.seed(seed)
        # random.sample needs a sequence: a NodeView is set-like, which is
        # deprecated since Python 3.9 and a TypeError from 3.11.
        nodes = set(random.sample(list(self.graph.nodes), num_of_nodes))
        copied_fields = (
            self.VENUE_METADATA_FIELD,
            self.LONGITUDE_FIELD,
            self.LATITUDE_FIELD,
            self.FOLLOWING_METADATA_FIELD,
        )
        new_graph = nx.Graph()
        for node1, node2, edge_data in self.graph.edges(data=True):
            if node1 not in nodes or node2 not in nodes:
                continue
            new_graph.add_edge(node1, node2, weight=edge_data["weight"])
            for node in (node1, node2):
                for field in copied_fields:
                    # NOTE(review): followings are copied unchanged and may name users
                    # that are absent from the sampled graph.
                    new_graph.nodes[node][field] = self.graph.nodes[node][field]
        return new_graph

    def read_graph(self):
        """Load the pickled graph written by ``create_graph_from_inputs`` into ``self.graph``."""
        # SECURITY: unpickling executes code embedded in the file; only load graph
        # files produced by this project.
        with open(f'{self.DATA_ADDRESS}/{self.GRAPH_FILE_NAME}', 'rb') as file:
            self.graph = pickle.load(file)


In [23]:
    def get_average_friends_per_vanue(self):
        """Unfinished stub: loads ratings, friendships and users, then returns nothing.

        NOTE(review): this cell is only a method fragment; the values it loads
        are not used yet.
        """
        ratings_by_user = self.__get_user_venue_ratings()
        followings_by_user = self.__get_friendships()
        coordinates_by_user = self.__get_users()

In [24]:
import networkx as nx

# from graph import Graph
from models import clean_data


# One-off preparation steps, left disabled: the pickled graph is loaded instead.
# clean_data()
graph = Graph()
# graph.create_graph_from_inputs()
graph.read_graph()
# nx.info() was removed in NetworkX 3.0; printing the graph yields the same
# one-line "Graph with N nodes and M edges" summary.
print(graph.graph)


Graph with 5153 nodes and 133784 edges


In [25]:
# List the wrapper's attributes; the `_Graph__*` entries are the name-mangled private helpers.
dir(graph)

['DATA_ADDRESS',
 'FOLLOWING_METADATA_FIELD',
 'GRAPH_FILE_NAME',
 'JUDGEMENT_VALIDITY_LIMIT',
 'LATITUDE_FIELD',
 'LONGITUDE_FIELD',
 'VENUE_METADATA_FIELD',
 '_Graph__calculate_user_influence',
 '_Graph__get_friends_influence_on_user',
 '_Graph__get_friendships',
 '_Graph__get_judgement_validity',
 '_Graph__get_user_venue_ratings',
 '_Graph__get_users',
 '_Graph__get_users_values',
 '_Graph__venue_ratings_percentage',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'create_graph_from_inputs',
 'export_graph_to_csv',
 'get_average_friends_influence_on_users_rate',
 'get_average_influence_for_top_influential_users',
 'get_limited_random_graph',
 'graph',
 'read_graph',

In [30]:
# Re-load the raw inputs through Graph's private loaders, reached via their name-mangled attributes.
user_venue_ratings = graph._Graph__get_user_venue_ratings()
friendships = graph._Graph__get_friendships()
users = graph._Graph__get_users()
# Flat list of Rating records, one per line of the ratings file.
ratings = Rating.read_ratings(f'data/ratings.txt')

In [71]:
# Map each venue to the users who rated it (one entry per rating row).
venue_users = {}
for rate in ratings:
    venue_users.setdefault(rate.venue_id, []).append(rate.user_id)

# For each venue: average number of followings among its raters that exist in
# the graph. Raters missing from the graph are ignored; raters without
# followings count with zero followings.
venue_avg_frndshp = {}
for venue, raters in venue_users.items():
    count = 0
    total = 0
    for user in raters:
        attributes = graph.graph.nodes.get(user)
        if attributes is None:
            continue
        count += 1
        followings = attributes.get("followings")
        if followings is not None:
            total += len(followings)
    # Computed once per venue; it used to be recomputed for every rater because
    # the branch was indented inside the loop.
    venue_avg_frndshp[venue] = total / count if count != 0 else 0

for w in sorted(venue_avg_frndshp, key=venue_avg_frndshp.get, reverse=True):
    print(f"venue {w}: {venue_avg_frndshp[w]}")

venue 132282: 238.0
venue 132283: 238.0
venue 132291: 238.0
venue 1487: 157.0
venue 1489: 157.0
venue 161453: 136.0
venue 161458: 136.0
venue 161461: 136.0
venue 161462: 136.0
venue 255169: 130.33333333333334
venue 255173: 130.33333333333334
venue 132290: 119.0
venue 286889: 106.0
venue 301060: 106.0
venue 301068: 106.0
venue 255166: 98.75
venue 255167: 98.5
venue 255177: 98.0
venue 284197: 95.0
venue 284199: 95.0
venue 284200: 95.0
venue 210492: 92.0
venue 210494: 92.0
venue 210495: 92.0
venue 210496: 92.0
venue 210497: 92.0
venue 210498: 92.0
venue 301073: 89.0
venue 151352: 87.0
venue 151353: 87.0
venue 151355: 87.0
venue 151357: 87.0
venue 151358: 87.0
venue 151359: 87.0
venue 151361: 87.0
venue 151363: 87.0
venue 151364: 87.0
venue 151365: 87.0
venue 151366: 87.0
venue 151367: 87.0
venue 151368: 87.0
venue 151369: 87.0
venue 151373: 87.0
venue 151374: 87.0
venue 151375: 87.0
venue 151376: 87.0
venue 151377: 87.0
venue 56797: 86.66666666666667
venue 1488: 82.75
venue 80414: 82.6666

venue 33375: 8.703703703703704
venue 1059436: 8.666666666666666
venue 256003: 8.666666666666666
venue 19036: 8.666666666666666
venue 32398: 8.666666666666666
venue 96229: 8.666666666666666
venue 52307: 8.666666666666666
venue 472076: 8.666666666666666
venue 792437: 8.666666666666666
venue 329856: 8.666666666666666
venue 98831: 8.666666666666666
venue 218304: 8.666666666666666
venue 25334: 8.625
venue 220816: 8.6
venue 192296: 8.6
venue 13881: 8.571428571428571
venue 29773: 8.555555555555555
venue 49249: 8.5
venue 562560: 8.5
venue 330420: 8.5
venue 92846: 8.5
venue 276751: 8.5
venue 55924: 8.5
venue 14053: 8.5
venue 182838: 8.5
venue 80968: 8.5
venue 32394: 8.5
venue 846991: 8.5
venue 433341: 8.5
venue 94923: 8.5
venue 14110: 8.416666666666666
venue 118025: 8.384615384615385
venue 149909: 8.333333333333334
venue 180221: 8.333333333333334
venue 65922: 8.333333333333334
venue 329845: 8.333333333333334
venue 461032: 8.333333333333334
venue 344318: 8.333333333333334
venue 136053: 8.3333333

venue 72181: 4.0
venue 72184: 4.0
venue 72187: 4.0
venue 72189: 4.0
venue 72196: 4.0
venue 152690: 4.0
venue 670210: 4.0
venue 111660: 4.0
venue 679600: 4.0
venue 853129: 4.0
venue 853130: 4.0
venue 853131: 4.0
venue 853135: 4.0
venue 853137: 4.0
venue 853138: 4.0
venue 853139: 4.0
venue 853140: 4.0
venue 853143: 4.0
venue 853148: 4.0
venue 112866: 4.0
venue 405342: 4.0
venue 621484: 4.0
venue 621488: 4.0
venue 621490: 4.0
venue 621491: 4.0
venue 621494: 4.0
venue 621496: 4.0
venue 109843: 4.0
venue 109175: 4.0
venue 109177: 4.0
venue 265113: 4.0
venue 614355: 4.0
venue 679590: 4.0
venue 679592: 4.0
venue 679594: 4.0
venue 679603: 4.0
venue 679604: 4.0
venue 679605: 4.0
venue 679606: 4.0
venue 679609: 4.0
venue 679611: 4.0
venue 679612: 4.0
venue 679614: 4.0
venue 679615: 4.0
venue 679616: 4.0
venue 679617: 4.0
venue 183387: 4.0
venue 795647: 4.0
venue 795649: 4.0
venue 795650: 4.0
venue 314958: 4.0
venue 674163: 4.0
venue 674164: 4.0
venue 674165: 4.0
venue 100335: 4.0
venue 240063: 4

venue 285017: 1.5
venue 465835: 1.5
venue 140313: 1.5
venue 37052: 1.5
venue 64517: 1.5
venue 89705: 1.5
venue 735609: 1.5
venue 152701: 1.5
venue 330414: 1.5
venue 415942: 1.5
venue 849464: 1.5
venue 513719: 1.5
venue 582114: 1.5
venue 582157: 1.5
venue 558236: 1.5
venue 468665: 1.5
venue 300865: 1.5
venue 1021: 1.5
venue 283578: 1.5
venue 942082: 1.5
venue 595129: 1.5
venue 617697: 1.5
venue 281760: 1.5
venue 156982: 1.5
venue 327047: 1.5
venue 149911: 1.5
venue 18930: 1.5
venue 183332: 1.5
venue 828758: 1.5
venue 446218: 1.5
venue 921707: 1.5
venue 921716: 1.5
venue 921729: 1.5
venue 921768: 1.5
venue 692481: 1.5
venue 992413: 1.5
venue 693618: 1.5
venue 100420: 1.5
venue 100348: 1.5
venue 29258: 1.5
venue 209570: 1.5
venue 36100: 1.5
venue 112310: 1.5
venue 14532: 1.5
venue 14561: 1.5
venue 666978: 1.5
venue 1036954: 1.5
venue 639466: 1.5
venue 1058924: 1.5
venue 734869: 1.5
venue 736247: 1.5
venue 186791: 1.4166666666666667
venue 267613: 1.4
venue 207562: 1.4
venue 113539: 1.4
ven

venue 769065: 0.0
venue 668328: 0.0
venue 1067270: 0.0
venue 1067277: 0.0
venue 1098921: 0
venue 286321: 0
venue 969122: 0
venue 549506: 0.0
venue 571219: 0
venue 730329: 0
venue 730334: 0
venue 730338: 0
venue 730342: 0
venue 730687: 0
venue 730690: 0
venue 1092178: 0
venue 736398: 0
venue 753760: 0
venue 753761: 0
venue 753762: 0
venue 220639: 0.0
venue 774884: 0
venue 282144: 0.0
venue 114359: 0
venue 388811: 0
venue 1127288: 0
venue 1070038: 0.0
venue 100327: 0
venue 1137119: 0.0
venue 937112: 0
venue 187833: 0.0
venue 1101550: 0.0
venue 1101552: 0.0
venue 1015226: 0
venue 433324: 0
venue 527842: 0
venue 699636: 0
venue 478394: 0
venue 724399: 0
venue 1121590: 0
venue 1084226: 0.0
venue 1113128: 0.0
venue 1082524: 0
venue 1136910: 0
venue 767218: 0
venue 767219: 0
venue 959398: 0
venue 1076100: 0
venue 573002: 0.0
venue 975025: 0
venue 603963: 0.0
venue 657721: 0.0
venue 872440: 0
venue 956512: 0
venue 956523: 0
venue 956540: 0
venue 739665: 0
venue 767953: 0
venue 767954: 0
venue 

In [77]:
from typing import List, Tuple

from turfpy.measurement import boolean_point_in_polygon
from geojson import Point, Polygon, Feature


class User:
    """A user with an identifier and a (longitude, latitude) position.

    Text rows are laid out as ``<id> <lat> <long>`` (see the ``*_INDEX`` constants).
    """

    ID_INDEX = 0
    LAT_INDEX = 1
    LONG_INDEX = 2

    def __init__(self, identifier, long, lat):
        self.identifier = identifier
        self.long = long
        self.lat = lat

    def get_string(self):
        """Return the record as an ``<id> <lat> <long>`` row (no trailing newline)."""
        return f'{self.identifier} {self.lat} {self.long}'

    @staticmethod
    def create_from_raw_inputs(inputs):
        """Build a User from raw string columns, converting the id to int and coordinates to float."""
        return User(
            identifier=int(inputs[User.ID_INDEX]),
            lat=float(inputs[User.LAT_INDEX]),
            long=float(inputs[User.LONG_INDEX]),
        )

    @staticmethod
    def read_users(file_addr):
        """Read whitespace-separated user rows from ``file_addr``.

        Fields are kept as the strings found in the file (no numeric
        conversion). Blank lines are skipped instead of raising ``IndexError``.
        """
        result = []
        with open(file_addr, 'r') as file:
            for line in file:
                inputs = line.split()
                if not inputs:
                    continue
                result.append(User(
                    identifier=inputs[User.ID_INDEX],
                    long=inputs[User.LONG_INDEX],
                    lat=inputs[User.LAT_INDEX]
                ))
        return result


class Venue:
    """A venue with an identifier and a (longitude, latitude) position.

    Text rows are laid out as ``<id> <lat> <long>`` (see the ``*_INDEX`` constants).
    """

    ID_INDEX = 0
    LAT_INDEX = 1
    LONG_INDEX = 2

    def __init__(self, identifier, long, lat):
        self.identifier = identifier
        self.long = long
        self.lat = lat

    def get_string(self):
        """Return the record as an ``<id> <lat> <long>`` row (no trailing newline)."""
        return f'{self.identifier} {self.lat} {self.long}'

    @staticmethod
    def create_from_raw_inputs(inputs):
        """Build a Venue from raw string columns, converting the id to int and coordinates to float."""
        return Venue(
            identifier=int(inputs[Venue.ID_INDEX]),
            lat=float(inputs[Venue.LAT_INDEX]),
            long=float(inputs[Venue.LONG_INDEX]),
        )

    @staticmethod
    def read_venues(file_addr):
        """Read whitespace-separated venue rows from ``file_addr``.

        Fields are kept as the strings found in the file (no numeric
        conversion). Blank lines are skipped instead of raising ``IndexError``.
        """
        result = []
        with open(file_addr, 'r') as file:
            for line in file:
                inputs = line.split()
                if not inputs:
                    continue
                result.append(Venue(
                    identifier=inputs[Venue.ID_INDEX],
                    long=inputs[Venue.LONG_INDEX],
                    lat=inputs[Venue.LAT_INDEX]
                ))
        return result


class Rating:
    """A single rating: which user gave which rate to which venue.

    Text rows are laid out as ``<user_id> <venue_id> <rate>``.
    """

    USER_ID_INDEX = 0
    VENUE_ID_INDEX = 1
    RATE_INDEX = 2

    def __init__(self, user_id, venue_id, rate):
        self.user_id = user_id
        self.venue_id = venue_id
        self.rate = rate

    def get_string(self):
        """Return the record as a ``<user_id> <venue_id> <rate>`` row (no trailing newline)."""
        return f'{self.user_id} {self.venue_id} {self.rate}'

    @staticmethod
    def create_from_raw_inputs(inputs):
        """Build a Rating from raw string columns, converting all three fields to int."""
        return Rating(
            user_id=int(inputs[Rating.USER_ID_INDEX]),
            venue_id=int(inputs[Rating.VENUE_ID_INDEX]),
            rate=int(inputs[Rating.RATE_INDEX]),
        )

    @staticmethod
    def read_ratings(file_addr):
        """Read whitespace-separated rating rows from ``file_addr``.

        Fields are kept as the strings found in the file (no numeric
        conversion). Blank lines are skipped instead of raising ``IndexError``.
        """
        result = []
        with open(file_addr, 'r') as file:
            for line in file:
                inputs = line.split()
                if not inputs:
                    continue
                result.append(Rating(
                    user_id=inputs[Rating.USER_ID_INDEX],
                    venue_id=inputs[Rating.VENUE_ID_INDEX],
                    rate=inputs[Rating.RATE_INDEX]
                ))
        return result


class Checkin:
    """A check-in of a user at a venue, with position and creation date.

    Raw rows are laid out as
    ``<id> <user_id> <venue_id> <lat> <long> <created_at>``.
    """

    ID_INDEX = 0
    USER_ID_INDEX = 1
    VENUE_ID_INDEX = 2
    LAT_INDEX = 3
    LONG_INDEX = 4
    CREATED_AT_INDEX = 5

    def __init__(self, identifier, user_id, venue_id, long, lat, created_at):
        self.identifier, self.user_id, self.venue_id = identifier, user_id, venue_id
        self.long, self.lat = long, lat
        self.created_at = created_at

    def get_string(self):
        """Return the record as a space-separated row in the raw column order."""
        return '{} {} {} {} {} {}'.format(
            self.identifier, self.user_id, self.venue_id, self.lat, self.long, self.created_at
        )

    @staticmethod
    def create_from_raw_inputs(inputs):
        """Build a Checkin from raw string columns; ids become int, coordinates float."""
        checkin_id = int(inputs[Checkin.ID_INDEX])
        user = int(inputs[Checkin.USER_ID_INDEX])
        venue = int(inputs[Checkin.VENUE_ID_INDEX])
        latitude = float(inputs[Checkin.LAT_INDEX])
        longitude = float(inputs[Checkin.LONG_INDEX])
        # The creation stamp is stored as-is, without parsing.
        stamp = inputs[Checkin.CREATED_AT_INDEX]
        return Checkin(checkin_id, user, venue, longitude, latitude, stamp)


class Friendship:
    """A directed pair of user ids read from a ``<first> <second>`` row."""

    FIRST_INDEX = 0
    SECOND_INDEX = 1

    def __init__(self, first, second):
        self.first = first
        self.second = second

    def get_string(self):
        """Return the record as a ``<first> <second>`` row (no trailing newline)."""
        return f'{self.first} {self.second}'

    @staticmethod
    def create_from_raw_inputs(inputs):
        """Build a Friendship from raw columns; both ids are kept as given (no conversion)."""
        return Friendship(
            first=inputs[Friendship.FIRST_INDEX],
            second=inputs[Friendship.SECOND_INDEX],
        )

    @staticmethod
    def read_friendships(file_addr):
        """Read whitespace-separated friendship rows from ``file_addr``.

        Ids are kept as strings. Blank lines are skipped instead of raising
        ``IndexError``.
        """
        result = []
        with open(file_addr, 'r') as file:
            for line in file:
                inputs = line.split()
                if not inputs:
                    continue
                result.append(Friendship(
                    first=inputs[Friendship.FIRST_INDEX],
                    second=inputs[Friendship.SECOND_INDEX]
                ))
        return result


def read_data(file_address, model_class):
    """Parse a single-space-separated file into ``model_class`` records.

    Each line is split on single spaces and every column is stripped. Lines
    that contain an empty column (blank lines, doubled spaces) are skipped;
    the rest are passed to ``model_class.create_from_raw_inputs``.
    """
    records = []
    with open(file_address, 'r') as source:
        for raw_line in source:
            columns = [piece.strip() for piece in raw_line.split(" ")]
            if any(not column for column in columns):
                continue
            records.append(model_class.create_from_raw_inputs(inputs=columns))
    return records


def limit_data_by_location(polygon_coords: List[List[Tuple]], model_records):
    """Return the records whose (long, lat) point lies inside the given polygon.

    ``polygon_coords`` is handed to ``geojson.Polygon`` unchanged; each record
    must expose ``long`` and ``lat`` attributes.
    """
    region = Polygon(polygon_coords)

    def _is_inside(record):
        location = Feature(geometry=Point((record.long, record.lat)))
        return boolean_point_in_polygon(location, region)

    return [record for record in model_records if _is_inside(record)]


def limit_records_by_user_venue(records, users, venues):
    """Keep only the records whose user and venue both appear in the given collections.

    Matching is done on ``record.user_id`` / ``record.venue_id`` against the
    ``identifier`` of each user and venue; input order is preserved.
    """
    allowed_users = {user.identifier for user in users}
    allowed_venues = {venue.identifier for venue in venues}
    return [
        record
        for record in records
        if record.user_id in allowed_users and record.venue_id in allowed_venues
    ]


def store_limited_friendships(friendships):
    """Write to ``data/friendships.txt`` the friendships whose two users are both known.

    Known users are the first column of ``data/users.txt``. Blank lines in
    that file are skipped instead of raising ``IndexError``.
    """
    users = set()
    with open("data/users.txt", 'r') as file:
        for line in file:
            columns = line.split()
            if columns:
                users.add(columns[0])

    with open("data/friendships.txt", 'w') as file:
        for friendship in friendships:
            if friendship.first in users and friendship.second in users:
                file.write(f"{friendship.get_string()}\n")


def store_records(records, file_address):
    """Overwrite ``file_address`` with one ``get_string()`` row per record."""
    with open(file_address, 'w') as sink:
        sink.writelines(f'{record.get_string()}\n' for record in records)


def clean_data():
    """Cut the raw ``*.dat`` dumps down to San Francisco and store them under ``data/``.

    Users and venues are filtered by position; ratings and check-ins are kept
    only when both their user and venue survived. Friendships are filtered
    last because ``store_limited_friendships`` reads the freshly written
    ``data/users.txt``.
    """
    place_name = "San Francisco"
    # Polygon vertices as (longitude, latitude) pairs.
    san_francisco_coords = [[
        (-122.553454, 37.812965),
        (-122.359602, 37.817252),
        (-122.346337, 37.708571),
        (-122.523607, 37.708332)
    ]]

    users = read_data("users.dat", User)
    venues = read_data("venues.dat", Venue)
    ratings = read_data("ratings.dat", Rating)
    checkins = read_data("checkins.dat", Checkin)

    local_users = limit_data_by_location(san_francisco_coords, users)
    local_venues = limit_data_by_location(san_francisco_coords, venues)
    local_ratings = limit_records_by_user_venue(ratings, local_users, local_venues)
    local_checkins = limit_records_by_user_venue(checkins, local_users, local_venues)

    print(f"total number of users: {len(users)}, users in {place_name}: {len(local_users)}")
    print(f"total number of venues: {len(venues)}, venues in {place_name}: {len(local_venues)}")
    print(f"total number of ratings: {len(ratings)}, ratings in {place_name}: {len(local_ratings)}")
    print(f"total number of checkins: {len(checkins)}, checkins in {place_name}: {len(local_checkins)}")

    outputs = (
        (local_users, "data/users.txt"),
        (local_venues, "data/venues.txt"),
        (local_ratings, "data/ratings.txt"),
        (local_checkins, "data/checkins.txt"),
    )
    for records, target in outputs:
        store_records(records, target)

    store_limited_friendships(read_data("socialgraph.dat", Friendship))


# calgary_coords = [[
#     (-114.325419, 51.214159),  # long, lat
#     (-113.865366, 51.214159),
#     (-113.865366, 50.847729),
#     (-114.325419, 50.847729),
# ]]

# alberta_coords = [[
#     (-119.921953, 53.212140),  # long, lat
#     (-114.548265, 49.017250),
#     (-110.021898, 49.002837),
#     (-110.021898, 59.994249),
#     (-119.997483, 60.005236),
# ]]

# california_coords = [[
#     (-124.371877, 41.990849),
#     (-119.975840, 41.999506),
#     (-120.002350, 38.986355),
#     (-114.594288, 34.996728),
#     (-114.700328, 32.729648),
#     (-117.205534, 32.551061),
#     (-125.953869, 35.580972)
# ]]


In [None]:
# NOTE(review): this cell repeats the limit_data_by_location definition from the
# models cell earlier in the notebook; running it rebinds the same name.
def limit_data_by_location(polygon_coords: List[List[Tuple]], model_records):
    """Return the records whose (long, lat) point falls inside ``polygon_coords``."""
    polygon = Polygon(polygon_coords)
    return [
        record
        for record in model_records
        if boolean_point_in_polygon(Feature(geometry=Point((record.long, record.lat))), polygon)
    ]

In [97]:
# Parse the stored check-ins into Checkin records.
checkins = read_data("./data/checkins.txt", Checkin)

from datetime import date

# d0: the second check-in's "-"-separated created_at turned into a date; it is not used again in this cell.
d0 = date(int(checkins[1].created_at.split("-")[0]), int(checkins[1].created_at.split("-")[1]), int(checkins[1].created_at.split("-")[2]))
d1 = date(2008, 9, 26)
d2 = date(2018, 9, 26)

# Date-arithmetic check: number of days between the two fixed dates above.
delta = d2 - d1
print(delta.days)

3652


In [96]:
# Peek at the venue id of the second parsed check-in.
checkins[1].venue_id

18425

In [112]:
# Day 0 of the timeline: check-in dates are turned into day offsets from here.
d0 = date(2008, 1, 1)

# vnu_checkins[venue_id][day_offset] -> number of check-ins at that venue on that day.
vnu_checkins = {}
users = []
for checkin in checkins:
    date_parts = checkin.created_at.split("-")
    d1 = date(int(date_parts[0]), int(date_parts[1]), int(date_parts[2]))
    delta = d1 - d0
    per_day = vnu_checkins.setdefault(checkin.venue_id, {})
    per_day[delta.days] = per_day.get(delta.days, 0) + 1
    users.append(checkin.user_id)

# De-duplicate the user ids while keeping first-seen order.
users = list(dict.fromkeys(users))


In [133]:
# For every check-in, record how the venue's average daily check-in count
# changes: the p days starting on the check-in day versus the p days before it.
user_paghadam = {}
d0 = date(2008, 1, 1)

for checkin in checkins:
    user = checkin.user_id
    venue = checkin.venue_id
    user_paghadam.setdefault(user, [])
    venue_days = vnu_checkins[venue]
    if not venue_days:
        continue
    date_parts = checkin.created_at.split("-")
    d1 = date(int(date_parts[0]), int(date_parts[1]), int(date_parts[2]))
    delta = d1 - d0
    deltadays = delta.days

    p = 60
    # Days without any check-in are absent from venue_days and count as zero.
    total_before = sum(venue_days.get(day, 0) for day in range(deltadays - p, deltadays))
    total_after = sum(venue_days.get(day, 0) for day in range(deltadays, deltadays + p))

    user_paghadam[user].append((total_after / p) - (total_before / p))

In [134]:
# Average the per-check-in changes for each user; users with no entries get 0.
user_paghadam2 = {}
for user_id, changes in user_paghadam.items():
    running_total = 0
    for change in changes:
        running_total += change
    sample_count = len(changes)
    if sample_count != 0:
        user_paghadam2[user_id] = float(running_total) / float(sample_count)
    else:
        user_paghadam2[user_id] = 0
    


In [136]:
# Number of users that received an averaged score in user_paghadam2.
len(user_paghadam2)
# for w in sorted(user_paghadam2, key=user_paghadam2.get, reverse=True):
#     print(f"{w}: {user_paghadam2[w]}")

1491