In [0]:
#Importing the necessary libraries and loading the data
from pyspark.sql.functions import *
# File location and type
file_location = "FileStore/tables/tourist_attractions09.csv"
file_type = "csv"

# CSV options
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df)

name,type,region,locality,geolocation
"""The """"second"""" home shopping store """"IJ Churin and Co. """"and store the Platonic brothers""",architecture,Amur region,Blagoveshchensk,"(Decimal('127.548877'), Decimal('49.811568'))"
"""""""Town of security officers""""""",architecture,Sverdlovsk region,Ekaterinburg,"(Decimal('60.621271'), Decimal('56.841798'))"
"""""""Palace for the beloved""""""",architecture,Kursk region,Safonovka,"(Decimal('35.139965'), Decimal('51.491293'))"
"""""""The House with The Firebird"""" (manor Zhelyabov LK)""",architecture,Tomsk region,Tomsk,"(Decimal('85.050946'), Decimal('56.469513'))"
"""""""House with the ghosts""""""",architecture,Novosibirsk region,Novosibirsk,"(Decimal('82.958761'), Decimal('55.041787'))"
"""""""House-decanter""""""",architecture,Sverdlovsk region,Verkh-neyvinsky,"(Decimal('60.131644'), Decimal('57.265584'))"
"""""""Emelyanovsky number""""""",architecture,Yaroslavskaya oblast,Rostov,"(Decimal('39.414526'), Decimal('57.185866'))"
"""""""Myasnikovskaya» Hospital""",architecture,Yaroslavskaya oblast,Rostov,"(Decimal('39.414526'), Decimal('57.185866'))"
"""""""The track Vejnbauma""""""",architecture,Belgorod region,Belgorod,"(Decimal('36.587223'), Decimal('50.59566'))"
"""""""Mansion Khomich""""""",architecture,Tomsk region,Tomsk,"(Decimal('84.947649'), Decimal('56.48464'))"


In [0]:
# This table will persist across cluster restarts as well as allow various users across different notebooks to query this data.
permanent_table_name = "tourist_attractions09_csv"
cols = df.columns

In [0]:
#Length of columns in dataframe
len(df.columns)

In [0]:
#Total number of entries in dataframe
df.count()

In [0]:
#Defined datatypes of the columns
df.dtypes

In [0]:
#Displaying the first 5-rows from the dataset
df.show(5)

In [0]:
#Displaying the datatypes of the columns
df.dtypes

In [0]:
#Dispalying count of columns having NULL values
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import isnan, when, count, col

df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

In [0]:
#Display the dataframe
df.show(20)

In [0]:
#Improting spark libraries and dropping the NULL values if present
from pyspark.sql import SparkSession
df.na.drop('any')
df.show()

In [0]:
#Importing the SQL libraries and cleaning the data and storin it to new DF
pyspark.sql.functions 
from pyspark.sql.functions import *
newDf = df.withColumn('geolocation', translate('geolocation', '(Decimal(', ''))
newDf = newDf.withColumn('geolocation', translate('geolocation', ')', ''))
split_col = pyspark.sql.functions.split(newDf['geolocation'], ',')
df2 = newDf.withColumn('lat', split_col.getItem(0)) .withColumn('long', split_col.getItem(1)) 

In [0]:
#Dispalying the new DF
df2.show(10)

In [0]:
#Storing the cleaned data to dataset
df3 = df2.select("lat","long")
df3 = df3.withColumn('lat', translate('lat', "'", ''))
df3 = df3.withColumn('long', translate('lat', "'", ''))
#Performing the correlation of feature
pandas_df = df3.toPandas()
#pandas_df.corr()
display(pandas_df)

lat,long
127.548877,127.548877
60.621271,60.621271
35.139965,35.139965
85.050946,85.050946
82.958761,82.958761
60.131644,60.131644
39.414526,39.414526
39.414526,39.414526
36.587223,36.587223
84.947649,84.947649


In [0]:
#Dispalying the new DF with new columns
df2.show()

In [0]:
#Import the libraries and preprocessing
import pyspark.sql.functions as f
df4 = df2.groupBy('name').count()

In [0]:
#Visualization for the name of place
display(df4)

name,count
The Office of the Kolyvan-Resurrection plants,1
Chamber Makarova in Kaluga,1
"""Monument """"Kadarchy""""""",1
Rotunda in Volgograd,1
Manor Vavilov,2
? Women's Mariinsky gymnasium,1
vedette,2
VV monument Nikiforov-Kyulyumnyuru,1
Monument military communications,1
Monument of deer-vehicle battalions,1


In [0]:
#Use of SQL for preprocessing
df3 = df2.groupBy('locality').count()

In [0]:
#Visualization according to the locality
display(df3)

locality,count
Volgograd,114
Gorki,1
guli,1
Polibin,1
Chaltyr,1
Mamonovo,2
Gnezdilovo,1
Baikalsk,1
SCW,2
Krutets,1


In [0]:
#Use of SQL for preprocessing
df5 = df2.groupBy('region').count()
#Visualization according to the region
display(df5)

region,count
Tyumen region,33
Sevastopol,6
Leningrad region,70
Ryazan Oblast,21
Republic of Crimea,36
Kaliningrad region,35
Chechen Republic,5
Kaluga region,59
Altai region,62
Vologda Region,60


In [0]:
#Use of SQL for preprocessing
df6 = df2.groupBy('type').count()
#Visualization according to the type pf place
display(df6)

type,count
industrial facilities,1
"the labor office""""""",1
monasteries,261
Granges,117
"Monuments, sculptures, memorials",1136
Entertainment,1
Abandoned shrine,24
palaces,35
archaeological sites,99
Obelisk / Stele,33


In [0]:
#Loading the data into new DF and preprocessing the data for the analysis
NEW_DF = df2.toPandas()
visits_by_locality = NEW_DF['locality'].value_counts()
def applyVisitsByLocality(x):
    return visits_by_locality[x]
NEW_DF['visits_by_locality'] = NEW_DF['locality'].apply(applyVisitsByLocality)
print(NEW_DF['visits_by_locality'].unique())

In [0]:
visits_by_region = NEW_DF['region'].value_counts()
def applyVisitsByRegion(x):
    return visits_by_region[x]
NEW_DF['visits_by_region'] = NEW_DF['region'].apply(applyVisitsByRegion)
print(NEW_DF['visits_by_region'].unique())

In [0]:
visits_by_name = NEW_DF['name'].value_counts()
def applyVisitsByName(x):
    return visits_by_name[x]
NEW_DF['visits_by_name'] = NEW_DF['name'].apply(applyVisitsByName)
print(NEW_DF['visits_by_name'].unique())

In [0]:
#Storing the modified and processed data into a new DF
MOD_DF = NEW_DF[['visits_by_name','visits_by_locality','visits_by_region']].corr()
MOD_DF.head()


Unnamed: 0,visits_by_name,visits_by_locality,visits_by_region
visits_by_name,1.0,-0.058932,-0.05249
visits_by_locality,-0.058932,1.0,0.817618
visits_by_region,-0.05249,0.817618,1.0


In [0]:
#Import the libraries required
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize = (10,8))
#determinig the correlation for the each of the column values
ax = sns.heatmap(MOD_DF, annot=True)
#sns.heatmap(MOD_DF[['visits_by_name','visits_by_locality','visits_by_region']].corr(), annot=True)
plt.show()

In [0]:
#The DF to be used for analysis
NEW_DF.head()

Unnamed: 0,name,type,region,locality,geolocation,lat,long,visits_by_locality,visits_by_region,visits_by_name
0,"""The """"second"""" home shopping store """"IJ Churi...",architecture,Amur region,Blagoveshchensk,"'127.548877', '49.811568'",'127.548877','49.811568',49,54,1
1,"""""""Town of security officers""""""",architecture,Sverdlovsk region,Ekaterinburg,"'60.621271', '56.841798'",'60.621271','56.841798',93,135,1
2,"""""""Palace for the beloved""""""",architecture,Kursk region,Safonovka,"'35.139965', '51.491293'",'35.139965','51.491293',1,39,1
3,"""""""The House with The Firebird"""" (manor Zhelya...",architecture,Tomsk region,Tomsk,"'85.050946', '56.469513'",'85.050946','56.469513',23,26,1
4,"""""""House with the ghosts""""""",architecture,Novosibirsk region,Novosibirsk,"'82.958761', '55.041787'",'82.958761','55.041787',97,108,1


In [0]:
df = spark.createDataFrame(NEW_DF)
df.show()

In [0]:
#Importing the libraries and pre-processing the model
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="name", outputCol="nameIndex")
df = indexer.fit(df).transform(df)
df.show()

In [0]:
#Importing and applying the MLlib techniques
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["visits_by_locality", "visits_by_region"],
    outputCol="features")

df = assembler.transform(df)
#print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
df.select("features", "nameIndex").show(truncate=False)

In [0]:
#Applying Naive Bayes Classifier:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


df = df.withColumnRenamed("nameIndex","label")
# Split the data into train and test
splits = df.randomSplit([0.8, 0.2])
train = splits[0]
test = splits[1]

# create the trainer and set its parameters
nb = NaiveBayes(smoothing=10, modelType="multinomial")

# train the model
model = nb.fit(train)

# select example rows to display.
predictions = model.transform(test)
#predictions.show()

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

In [0]:
#Using Decision Tree Classifier:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(df)
# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures").fit(df)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = df.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

treeModel = model
# summary only
print(treeModel)

In [0]:
#implementing k-nearest neighbour classifier:
from scipy.sparse import csr_matrix 
from sklearn.neighbors import NearestNeighbors
tourist_visits = NEW_DF[['name','region','locality','visits_by_locality','visits_by_region']]
tourist_visits.drop_duplicates(subset ="name",keep = False, inplace = True) 
tourist_visits.head(10)

Unnamed: 0,name,region,locality,visits_by_locality,visits_by_region
0,"""The """"second"""" home shopping store """"IJ Churi...",Amur region,Blagoveshchensk,49,54
1,"""""""Town of security officers""""""",Sverdlovsk region,Ekaterinburg,93,135
2,"""""""Palace for the beloved""""""",Kursk region,Safonovka,1,39
3,"""""""The House with The Firebird"""" (manor Zhelya...",Tomsk region,Tomsk,23,26
4,"""""""House with the ghosts""""""",Novosibirsk region,Novosibirsk,97,108
5,"""""""House-decanter""""""",Sverdlovsk region,Verkh-neyvinsky,2,135
6,"""""""Emelyanovsky number""""""",Yaroslavskaya oblast,Rostov,15,96
7,"""""""Myasnikovskaya» Hospital""",Yaroslavskaya oblast,Rostov,15,96
8,"""""""The track Vejnbauma""""""",Belgorod region,Belgorod,9,28
9,"""""""Mansion Khomich""""""",Tomsk region,Tomsk,23,26


In [0]:
#columns = NEW_DF['region','locality']
tourist_visits_pivot = tourist_visits.pivot(index = 'name', 
                                            columns='region', 
                                            values='visits_by_region').fillna(0)

tourist_visits_pivot = tourist_visits.pivot(index = 'name', 
                                            columns='locality', 
                                            values='visits_by_locality').fillna(0)

In [0]:
X = NEW_DF[["visits_by_region","visits_by_locality"]]
y = NEW_DF[["name"]]

In [0]:
#Train and test the dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [0]:
#Import required libraries
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=6, metric='euclidean',algorithm='brute')
classifier.fit(X_train, np.ravel(y_train))
y_pred = classifier.predict(X_test)

In [0]:
#Getting the accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

In [0]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average=None)

In [0]:
model_knn = NearestNeighbors(metric='euclidean',algorithm='brute')
model_knn.fit(tourist_visits_matrix)

print(type(tourist_visits_pivot))
tourist_visits_pivot.head(5)

locality,: VIP,ALABIN,Abakan,Abaza:,Abram,Aginskoe,Ahtynsky,Ahtyrka,Ak-Erik,Aksarka,Aksay,Alatyr,Alekseevka,Alkun,Almazovo,Altun,Alushta,Always,Anapa,Anastasovo,Andrianov,Annino,Anzhero-Sudzhensk,Apatity,Apsheronsk,Aramil,Arkhangelsk,Armavir,Arzamas,Astrakhan,Avchurino,Axtuʙinsk,Azov,Açair,Bagrationovsk,Baikalsk,Bain,Bakhchisaray,Balaam,Balashikha,...,plowing,ponds,powder,red,red Armored car,red Hills,red Mountain,rich,saltwort,sanatorium Monino,searchers,small,spa,specific Uta,splices,suburban Slobidka,toiler,transitions,turistickou,upper Fiagdon,wasp,welcome,whenever,white Beach,white Crosses,your servant,youthful,zadonsk,zelenogradsk,Çaadaevka,İlinsko-Podomskoe,Амга,Золотаревка,Исетское,Качуг,Котельниково,Партизанск,Покча,Сретенск,Шанчы
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
"""""""Emelyanovsky number""""""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""""""Gray Horse""""""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""""""House with the ghosts""""""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""""""House-decanter""""""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""""""Mansion Khomich""""""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
#Specifying the resuts needed and analyzing those by using the KNN for the recommendation of some new places:
import numpy as np
query_index = np.random.choice(tourist_visits_pivot.shape[0])
distances, indices = model_knn.kneighbors(tourist_visits_pivot.iloc[query_index,:].values.reshape(1, -1), n_neighbors=6)
distances_flattern_array = distances.flatten().astype(int)
for i in range(0,len(distances_flattern_array)):
    if i == 0:
        print("Recommendation for",tourist_visits_pivot.index[query_index]," \n")
    else:
        tempDf = NEW_DF[NEW_DF['name'] == tourist_visits_pivot.index[indices.flatten()[i]]][['name','region','locality','type','geolocation','visits_by_locality','visits_by_region']]
        print("Place Name         :",tempDf['name'].values[0],"\n")
        print("Type               :",tempDf['type'].values[0],"\n")
        print("Locality           :",tempDf['locality'].values[0],"\n")
        print("Region             :",tempDf['region'].values[0],"\n")
        print("Geolocation        :",tempDf['geolocation'].values[0],"\n")
        print("Visitor in Locality:",tempDf['visits_by_locality'].values[0],"\n")
        print("Visitor in Region  :",tempDf['visits_by_region'].values[0],"\n")
    print("----------------------------------------------------------------------------\n")

In [0]:
#Thus the results are well analyzed for the recommendation purposed.