In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import udf, col, lower, regexp_replace,countDistinct
from pyspark.ml.feature import Tokenizer, StopWordsRemover, NGram, HashingTF, IDF
from pyspark.ml import Pipeline
from pyspark.sql.types import StructType,StructField
import pandas as pd
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml import PipelineModel
import time
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
spark = SparkSession.builder.getOrCreate()

In [12]:


#telechargement des données de trainning
train_review = "/home/jovyan/work/data/streaming_booking_prepared.csv"

# lecture d'un fichier de manière la plus brute
schema = StructType([
    StructField('c0', IntegerType(),True),
    StructField('hotel_name', StringType(),True),
    StructField('lat', FloatType(),True),
    StructField('long', FloatType(),True),
    StructField('average_score', FloatType(),True),
    StructField('review', StringType(),True),
    StructField('polarity', IntegerType(),True),
    StructField('word_counts', IntegerType(),True),
    StructField('tags', StringType(),True),
])

df = spark.read.format('csv').options(header=True).options(delimiter= ";").schema(schema).load(train_review )
df.dtypes

[('c0', 'int'),
 ('hotel_name', 'string'),
 ('lat', 'float'),
 ('long', 'float'),
 ('average_score', 'float'),
 ('review', 'string'),
 ('polarity', 'int'),
 ('word_counts', 'int'),
 ('tags', 'string')]

In [13]:
# Chargement du pipeline de preprocessing des données
#mypipeline =  Pipeline.load("./myPipeline")
mypipeline =  PipelineModel.load("pipe2")
# Chargement de notre modele
mylrModel =  LogisticRegressionModel.load("./myModel")

In [15]:
df_prepared= mypipeline.transform(df)


In [13]:
df_prepared


DataFrame[c0: int, hotel_name: string, lat: float, long: float, average_score: float, review: string, polarity: int, word_counts: int, tags: string, words_token: array<string>, words_clean: array<string>, bigrams: array<string>, rawFeatures: vector, features: vector]

In [16]:
lrPreds= mylrModel.transform(df_prepared)

In [17]:
lrPreds.select('hotel_name','polarity','prediction').head(50)

[Row(hotel_name='Best Western S vres Montparnasse', polarity=1, prediction=1.0),
 Row(hotel_name='Holiday Inn London Stratford City', polarity=0, prediction=0.0),
 Row(hotel_name='Novotel Suites Paris Montreuil Vincennes', polarity=0, prediction=0.0),
 Row(hotel_name='St James Court A Taj Hotel London', polarity=1, prediction=1.0),
 Row(hotel_name='Hotel Berna', polarity=0, prediction=0.0),
 Row(hotel_name='Ambassadors Bloomsbury', polarity=0, prediction=0.0),
 Row(hotel_name='Holiday Inn London Bloomsbury', polarity=1, prediction=1.0),
 Row(hotel_name='NH Collection Amsterdam Barbizon Palace', polarity=1, prediction=0.0),
 Row(hotel_name='St Pancras Renaissance Hotel London', polarity=1, prediction=1.0),
 Row(hotel_name='Hotel de France Wien', polarity=0, prediction=0.0),
 Row(hotel_name='NH Milano Touring', polarity=0, prediction=0.0),
 Row(hotel_name='Park Plaza Westminster Bridge London', polarity=0, prediction=0.0),
 Row(hotel_name='Hotel Erzherzog Rainer', polarity=1, prediction=

In [18]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

my_eval_lr = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='polarity', metricName='areaUnderROC')
my_eval_lr.evaluate(lrPreds)

0.8781185256561072