Pkl Save Files: <br>
1 = includes tfidf <br>
1b = includes cosine similarity <br>
1c = includes official neighbourhoods dictionary

In [37]:
filename3 = "YELP-Test-10000-1.pkl"
dill.dump_session(filename3)

In [1]:
# %load "YELP-Functions.py"
#!/usr/bin/env python

# # YELP Project 2019: Reviews Analysis for Classifying Businesses into Neighbourhoods in Toronto
# ### Overall Question(s): Can language distinguish groups of businesses/people? What in the language? Why?

# In[3]:


import pandas as pd
import numpy as np
from nltk.corpus import stopwords 
from tqdm import tqdm
from collections import *
import operator
import itertools
import dill
import geopandas as gpd
import matplotlib.pyplot as plt
import descartes
from shapely.geometry import Point, Polygon
import math
from scipy import sparse
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', 500)

def everyWord(file, stop_words):
    everyWord_counter = Counter()
    for comment in tqdm(file['1'].values):
        comment = [t.lower() for t in comment.replace('.',' ').replace(',',' ').split(" ") if ((len(t) > 1) and (t.lower() not in stop_words))]
        everyWord_counter.update(comment)
    return everyWord_counter

def topktermFreq(everyWord, stop_words, k):
    AllWords_stop = Counter(everyWord)
    sorted_dict = OrderedDict(sorted(AllWords_stop.items(), key=operator.itemgetter(1), reverse=True))
    topktermFreq = dict(itertools.islice(sorted_dict.items(), k))
    return topktermFreq

def topkdocFreq(file, stop_words, k):
    allWordsinFile = everyWord(file, stop_words)
    topkTF = topktermFreq(allWordsinFile, stop_words, k)
    topkdocFreq = defaultdict(int)
    for comment in tqdm(file['1'].values):
        comment = set(t.lower() for t in comment.replace('.',' ').replace(',',' ').split(" ") if len(t) > 1 and (t.lower() not in stop_words))
        for word in comment:
            if word in topkTF.keys():
                topkdocFreq[word] += 1
            else:
                continue
    return topkdocFreq

def TFIDF_xtrain(xtrain_file, topkDF, stop_words):
    row = []
    col = []
    data = []
    # for each of the comments (rows)
    row_index = 0
    # loop through each of the comments in the 
    for comment in tqdm(xtrain_file['1'].values):
        #comment contains all the words in the comments, but we are only interested in the 15
        comment = [t.lower() for t in comment.replace('.',' ').replace(',',' ').split(" ") if ((len(t) > 1) and (t.lower() not in stop_words))]
        # create a dictionary for all words
        c_counter = Counter(comment)        
        col_index = 0
        # loop through the top 10k words
        for word in topkDF.keys():
            if word in c_counter.keys():
                row.append(row_index)
                col.append(col_index)
                data.append(round(((c_counter[word]/len(comment))*math.log10(len(xtrain_file)/topkDF[word])),5))
                col_index += 1
            else:
                col_index += 1
        row_index += 1
    return sparse.coo_matrix((data,(row,col)), shape = (len(xtrain_file),len(topkDF))).toarray()

def TFIDF_test(xtrain_file, test_file, topkDF, stop_words):
    row = []
    col = []
    data = []
    # for each of the comments (rows)
    row_index = 0
    # loop through each of the comments in the 
    for comment in tqdm(test_file['1'].values):      
        #comment contains all the words in the comments, but we are only interested in the 15
        comment = [t.lower() for t in comment.replace('.',' ').replace(',',' ').split(" ") if ((len(t) > 1) and (t.lower() not in stop_words))]
        # create a dictionary for all words
        c_counter = Counter(comment)  
        col_index = 0
        # loop through the top 10k words
        for word in topkDF.keys():
            if word in c_counter.keys():
                row.append(row_index)
                col.append(col_index)
                data.append(round(((c_counter[word]/len(comment))*math.log10(len(xtrain_file)/topkDF[word])),5))
                col_index += 1
            else:
                col_index += 1
        row_index += 1
    return sparse.coo_matrix((data,(row,col)), shape = (len(test_file),len(topkDF))).toarray()


def existing_neighbourhood_dictionary(businesses_file, neighbourhoods_file):
    id=[]
    latitude=[]
    longitude=[]
    for x in range(businesses_file.shape[0]):
        id.append(businesses_file.iloc[x,13])
        longitude.append(businesses_file.iloc[x,30])
        latitude.append(businesses_file.iloc[x,31])  
    df = pd.DataFrame(
        {'ID': id,
         'Latitude': latitude,
         'Longitude': longitude})
    gdf = gpd.GeoDataFrame(df, geometry= gpd.points_from_xy(df.Longitude, df.Latitude))    
    business_neighbourhood = {}
    b_names = businesses_file.iloc[:,13]
    for key in b_names:
        business_neighbourhood[key] = float('nan')      
    for i in range(gdf.shape[0]): #for businesses
        for j in range(neighbourhoods_file.shape[0]): # for neighbourhood
            if (neighbourhoods_file.loc[j, 'geometry']).contains(gdf.iloc[i,3]) == True:
                business_neighbourhood[businesses_file.iloc[i,13]] = neighbourhoods_file.iloc[j,6]
    return business_neighbourhood

def official_neighbourhoods(businesses_file):
    neighbourhood_official = {}
    for x in tqdm(range(businesses_file.shape[0])):
        neighbourhood_official[(businesses_file.iloc[x,13])] = businesses_file.iloc[x,29]
    return neighbourhood_official

def cosineSimilarity(tfidf_xtrain, tfidf_test):
    similarities = np.zeros((len(tfidf_test),len(tfidf_xtrain)))
    for i in tqdm(range(tfidf_test.shape[0])):
        a = tfidf_test[i]
        for j in range(tfidf_xtrain.shape[0]):
                b = tfidf_xtrain[j]
                cosinesimilarity = round(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)), 1)
                if cosinesimilarity == 0:
                    continue
                else:
                    similarities[i][j] = cosinesimilarity
    return similarities

def kNearestNeighbours(test_file, xtrain_comments_file, existing_neighbourhoods_xtrain, cos_sim, k): #want to specify the size  
    kNearest = {}   
    # Have a dictionary stating the business and the neighbourhood, for ex. {2: 'A', 3: 'C', 6: 'C'}
    for row in tqdm(range(len(cos_sim))):
        possible_neighbours = []
        for col in range(len(cos_sim[0])):
            possible_neighbours.append((cos_sim[row][col], existing_neighbourhoods_xtrain[xtrain_comments_file.iloc[col,1]]))
        possible_neighbours.sort(key=lambda x: x[0], reverse=True)
  
        votes = Counter()
        for index in range(k):
            votes[possible_neighbours[index][1]] += 1
    
        kNearest[test_file.iloc[row,1]] = votes.most_common(1)[0][0]
    return kNearest
        
def Accuracy(kNearest_, existing_neighbourhood_dictionary):
    total = len(kNearest_)
    sum = 0
    for key in kNearest_.keys():
        if kNearest_[key] == existing_neighbourhood_dictionary[key]:
            sum += 1
    average = round(sum/total,5)
    return average

def official_neighbourhoods2(businesses_file):
    neighbourhood_official = {}
    for x in tqdm(range(businesses_file.shape[0])):
        neighbourhood_official[(businesses_file.iloc[x,0])] = businesses_file.iloc[x,29]
    return neighbourhood_official

def kNearestData(test_file, xtrain_comments_file, existing_neighbourhoods_xtrain, cos_sim, k): #want to specify the size  
    kNearest = {}   
    hit_ratio_list = []
    results_sf = []
    # Have a dictionary stating the business and the neighbourhood, for ex. {2: 'A', 3: 'C', 6: 'C'}
    for row in tqdm(range(len(cos_sim))):
        possible_neighbours = []
        for col in range(len(cos_sim[0])):
             possible_neighbours.append((cos_sim[row][col], existing_neighbourhoods_xtrain[xtrain_comments_file.iloc[col,1]]))
        possible_neighbours.sort(key=lambda x: x[0], reverse=True)
        votes = Counter()
        for index in range(k):
            votes[possible_neighbours[index][1]] += 1         
        kNearest[test_file.iloc[row,1]] = votes.most_common(1)[0][0]
        if existing_neighbourhoods_xtrain[test_file.iloc[row,1]] in votes.keys():        
            hit_ratio_list.append(round(votes[existing_neighbourhoods_xtrain[test_file.iloc[row,1]]]/k,4))
            if existing_neighbourhoods_xtrain[test_file.iloc[row,1]] == votes.most_common(1)[0][0]:
                results_sf.append("Success")
            else:
                results_sf.append("Fail")
        else: 
            hit_ratio_list.append(0)
            results_sf.append("Fail")
    return kNearest, votes, hit_ratio_list, results_sf

In [None]:
businesses

In [None]:
stop_words =  list(stopwords.words('english')) 
businesses = pd.read_csv("businesses (1).csv")
data = pd.read_csv('business_text_stripped.csv')
toronto_map = gpd.read_file('Neighbourhoods.geojson')

In [None]:
X_train, X_test = train_test_split(data, test_size=0.33)

In [None]:
topkDF_10000 = topkdocFreq(X_train, stop_words, 10000)

In [None]:
tfidf_xtrain_10000 = TFIDF_xtrain(X_train, topkDF_10000, stop_words)

In [None]:
tfidf_test_10000 = TFIDF_test(X_train, X_test, topkDF_10000, stop_words)

In [None]:
cos_sim_10000 = cosineSimilarity(tfidf_xtrain_10000, tfidf_test_10000)

In [None]:
official_neighbourhoods_byName = official_neighbourhoods(businesses)

In [None]:
official_neighbourhoods_byId = official_neighbourhoods2(businesses)

In [None]:
testGroup_1, testGroup_2, testGroup_3, testGroup_4 = np.array_split(X_test, 4)

In [None]:
cos_sim_1, cos_sim_2, cos_sim_3, cos_sim_4 = np.array_split(cos_sim_10000, 4)

### Group 1, k = 50

In [None]:
kNearest_10000_1 = kNearestNeighbours(testGroup_1, X_train, official_neighbourhoods_byName, cos_sim_1, 50)

In [None]:
accuracy_10000_1 = Accuracy(kNearest_10000_1, official_neighbourhoods_byName)

In [15]:
accuracy_10000_1

0.06964

### Group 2, k = 50

In [None]:
kNearest_650_2 = kNearestNeighbours(testGroup_2, X_train, official_neighbourhoods_byName, cos_sim_2, 50)

In [None]:
kNearest_10000_2 = kNearest_650_2 #renaming

In [None]:
accuracy_10000_2 = Accuracy(kNearest_10000_2, official_neighbourhoods_byName)

In [None]:
accuracy_10000_2

### Group 3, k = 50

In [None]:
kNearest_10000_3 = kNearestNeighbours(testGroup_3, X_train, official_neighbourhoods_byName, cos_sim_3, 50)

In [None]:
accuracy_10000_3 = Accuracy(kNearest_10000_3, official_neighbourhoods_byName)

In [14]:
accuracy_10000_3

0.06132

### Group 4, k = 50

In [None]:
kNearest_10000_4 = kNearestNeighbours(testGroup_4, X_train, official_neighbourhoods_byName, cos_sim_4, 50)

In [None]:
accuracy_10000_4 = Accuracy(kNearest_10000_4, official_neighbourhoods_byName)

In [13]:
accuracy_10000_4

0.06207

# Second Run 
### Group 1, k = 50

In [None]:
kNearest_1_50, votes_1_50, hitRatio_1_50, results_1_50 = kNearestData(testGroup_1, X_train, official_neighbourhoods_byName, cos_sim_1, 50)

In [None]:
accuracy_1_50 = Accuracy(kNearest_1_50, official_neighbourhoods_byName)

In [12]:
accuracy_1_50

0.06964

In [26]:
results_1_50_counter = Counter(results_1_50)

In [27]:
results_1_50_counter

Counter({'Fail': 2458, 'Success': 184})

### Group 1, k = 25

In [None]:
kNearest_1_25, votes_1_25, hitRatio_1_25, results_1_25 = kNearestData(testGroup_1, X_train, official_neighbourhoods_byName, cos_sim_1, 25)

In [None]:
accuracy_1_25 = Accuracy(kNearest_1_25, official_neighbourhoods_byName)

In [3]:
accuracy_1_25

0.06207

In [5]:
results_1_25_counter = Counter(results_1_25)

In [6]:
results_1_25_counter

Counter({'Fail': 2478, 'Success': 164})

In [10]:
hitRatio_1_25_counter = Counter(hitRatio_1_25)

In [11]:
hitRatio_1_25_counter

Counter({0.04: 417,
         0.08: 179,
         0: 1855,
         0.12: 84,
         0.8: 1,
         0.16: 51,
         0.28: 7,
         0.2: 22,
         0.52: 1,
         0.6: 1,
         0.24: 18,
         0.32: 2,
         0.48: 1,
         0.44: 2,
         0.4: 1})

### Group 1, k = 100

In [16]:
kNearest_1_100, votes_1_100, hitRatio_1_100, results_1_100 = kNearestData(testGroup_1, X_train, official_neighbourhoods_byName, cos_sim_1, 100)

100%|██████████| 2642/2642 [33:27<00:00,  1.07it/s]


In [19]:
accuracy_1_100 = Accuracy(kNearest_1_100, official_neighbourhoods_byName)

In [20]:
accuracy_1_100

0.07192

In [28]:
results_1_100_counter = Counter(results_1_100)

In [29]:
results_1_100_counter

Counter({'Fail': 2452, 'Success': 190})

In [17]:
hitRatio_1_100_counter = Counter(hitRatio_1_100)

In [18]:
hitRatio_1_100_counter

Counter({0.01: 445,
         0.04: 123,
         0: 1252,
         0.05: 82,
         0.03: 180,
         0.09: 41,
         0.36: 1,
         0.02: 268,
         0.07: 41,
         0.06: 66,
         0.14: 8,
         0.12: 14,
         0.22: 2,
         0.1: 29,
         0.11: 27,
         0.08: 23,
         0.21: 4,
         0.16: 5,
         0.3: 1,
         0.27: 1,
         0.13: 13,
         0.15: 7,
         0.18: 4,
         0.31: 1,
         0.17: 1,
         0.2: 2,
         0.19: 1})

### Group 1, k = 200

In [21]:
kNearest_1_200, votes_1_200, hitRatio_1_200, results_1_200 = kNearestData(testGroup_1, X_train, official_neighbourhoods_byName, cos_sim_1, 200)

100%|██████████| 2642/2642 [37:28<00:00,  1.56it/s]


In [24]:
accuracy_1_200 = Accuracy(kNearest_1_200, official_neighbourhoods_byName)

In [25]:
accuracy_1_200

0.07419

In [30]:
results_1_200_counter = Counter(results_1_200)

In [31]:
results_1_200_counter

Counter({'Fail': 2446, 'Success': 196})

### Group 1, k = 1500

In [33]:
kNearest_1_1500, votes_1_1500, hitRatio_1_1500, results_1_1500 = kNearestData(testGroup_1, X_train, official_neighbourhoods_byName, cos_sim_1, 1500)

100%|██████████| 2642/2642 [28:24<00:00,  1.69it/s]  


In [35]:
accuracy_1_1500 = Accuracy(kNearest_1_1500, official_neighbourhoods_byName)

In [36]:
accuracy_1_1500

0.06207