In [1]:
# the path for data is data/json/
import json
import time
import pandas as pd # pandas pour avoir un format (DataFrame) confortable pour les données.
import numpy as np # Numpy pour le calcul du taux de bonnes prédictions
from sklearn.feature_extraction.text import TfidfVectorizer # outil pour traiter le texte
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
import os

# Directory containing the per-hotel JSON files; the trailing slash lets
# callers append a file name directly.
path_to_data = "data/json/"

### 1. Apprentissage avec une SVM

#### 1.1 Définition du classifier

In [2]:
# Pipeline definition: TF-IDF features over words (English stop words
# removed, at most 5000 features) feeding a linear classifier trained with
# SGD using hinge loss and an L2 penalty.
# NOTE(review): `n_iter` was deprecated in scikit-learn 0.19 and removed in
# 0.21 in favour of `max_iter` (plus `tol=None` for a fixed number of epochs).
# It is kept here for the environment this notebook was written for --
# confirm the installed version before changing it.
svmPlus_clf = Pipeline([('vect', TfidfVectorizer(analyzer = "word",stop_words='english',max_features = 5000)),
                        ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-4, n_iter=50, random_state=42))])

# Load the labelled training data (tab-separated; quoting=3 is csv.QUOTE_NONE,
# so quote characters inside reviews are read verbatim).
train = pd.read_csv("data_movies/train_data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

# Training, timed with wall-clock time. time.clock() returned CPU time on
# Unix and no longer exists in Python >= 3.8, so time.time() is used instead.
tps1 = time.time()
svmPlus_clf = svmPlus_clf.fit(train["review"], train["sentiment"])
tps2 = time.time()

# Single formatted string: prints the same text under Python 2 and Python 3.
print("Apprentissage réalisé en {} secondes".format(tps2 - tps1))

Apprentissage réalisé en 6.86984189616 secondes


#### 1.2 Analyse des fichiers TripAdvisor
##### 1.2.1 Prédictions

In [3]:
# Score every hotel file with the SVM pipeline and write one CSV of
# (review id, predicted sentiment) per hotel into data/results/.
results_dir = "data/results/"
if not os.path.isdir(results_dir):
    os.makedirs(results_dir)  # to_csv does not create missing directories

# Keep only *.json entries (anything else would make json.load fail) and sort
# them: os.listdir returns entries in arbitrary order.
file_list = sorted(f for f in os.listdir(path_to_data) if f.endswith(".json"))

tps1 = time.time()  # wall-clock; time.clock() is gone in Python >= 3.8
for f in file_list:

    with open(os.path.join(path_to_data, f)) as data:
        dt = json.load(data)

    hotel_id = dt["HotelInfo"]["HotelID"]
    review_id = [review["ReviewID"] for review in dt["Reviews"]]
    review_texts = [review["Content"] for review in dt["Reviews"]]

    if review_texts:
        predictions = svmPlus_clf.predict(review_texts)  # SVM predictions
    else:
        # No review to score: still write a header-only CSV for this hotel.
        print("File {} is empty".format(f))
        predictions = []

    # format() also copes with a numeric HotelID, unlike string concatenation.
    f_out = os.path.join(results_dir, "{}.csv".format(hotel_id))
    hotel_results = pd.DataFrame(data={"id": review_id, "sentiment": predictions},
                                 columns=["id", "sentiment"])
    hotel_results.to_csv(f_out, index=False, quoting=3)

tps2 = time.time()
print("")
print("Done in  {}  seconds.".format(tps2 - tps1))

File 1166845.json is empty
File 119640.json is empty
File 152681.json is empty
File 152886.json is empty
File 152887.json is empty
File 152890.json is empty
File 152892.json is empty
File 152894.json is empty
File 152896.json is empty
File 153138.json is empty
File 154001.json is empty
File 154868.json is empty
File 185324.json is empty
File 186909.json is empty
File 186910.json is empty
File 209428.json is empty
File 219145.json is empty
File 219163.json is empty
File 286104.json is empty
File 303140.json is empty
File 506374.json is empty
File 642781.json is empty
File 668969.json is empty
File 735474.json is empty
File 81222.json is empty
File 81295.json is empty
File 951172.json is empty

Done in  456.781377205  seconds.


##### 1.2.2 Mauvais Hotels

In [4]:
# Report the hotels whose mean predicted sentiment is below 0.5 and the share
# of such hotels among all result files found in data/results/.
results_dir = "data/results/"
res_list = sorted(f for f in os.listdir(results_dir) if f.endswith(".csv"))

nb_hotel = 0
nb_bad = 0
for f in res_list:
    nb_hotel += 1
    my_hotel = pd.read_csv(os.path.join(results_dir, f), header=0, delimiter=",", quoting=3)
    sentiments = my_hotel["sentiment"]
    n_reviews = sentiments.count()

    # A hotel without any scored review cannot be judged; it still counts in
    # the "all hotels" denominator, as before.
    if n_reviews == 0:
        continue

    note = sentiments.mean()
    # The previous test `0.0 < mean < 0.5` skipped hotels whose reviews are
    # all negative (mean exactly 0.0); they are now reported as well.
    if note < 0.5:
        nb_bad += 1
        print("count:  {}".format(n_reviews))
        print("The hotel {} does not worth it. The note is {} .".format(f, note))

print("")
if nb_hotel:
    print("{} % of all hotels are bad".format(100.0 * nb_bad / nb_hotel))
else:
    # Avoid a ZeroDivisionError when no result file has been produced yet.
    print("No result file found in {}".format(results_dir))


count:  48
The hotel 100506.csv does not worth it. The note is 0.416666666667 .
count:  52
The hotel 100508.csv does not worth it. The note is 0.288461538462 .
count:  5
The hotel 100531.csv does not worth it. The note is 0.4 .
count:  78
The hotel 100584.csv does not worth it. The note is 0.333333333333 .
count:  20
The hotel 100600.csv does not worth it. The note is 0.3 .
count:  71
The hotel 100605.csv does not worth it. The note is 0.338028169014 .
count:  8
The hotel 1011237.csv does not worth it. The note is 0.375 .
count:  7
The hotel 1015196.csv does not worth it. The note is 0.428571428571 .
count:  3
The hotel 1015604.csv does not worth it. The note is 0.333333333333 .
count:  11
The hotel 1015811.csv does not worth it. The note is 0.454545454545 .
count:  8
The hotel 1016578.csv does not worth it. The note is 0.375 .
count:  35
The hotel 1019217.csv does not worth it. The note is 0.428571428571 .
count:  7
The hotel 1020431.csv does not worth it. The note is 0.428571428571 .

### 2. Apprentissage avec une forêt aléatoire

#### 2.1 Définition du classifier

In [5]:
# Pipeline definition: TF-IDF features over words (English stop words
# removed, at most 5000 features) feeding a random forest of 150 trees.
# random_state is fixed so that re-running the notebook gives the same forest,
# consistent with the seeded SGD pipeline defined earlier in this notebook.
rf_clf = Pipeline([('vect', TfidfVectorizer(analyzer = "word",stop_words='english',max_features = 5000)),
                   ('clf', RandomForestClassifier(n_estimators = 150, random_state=42))])

# Load the labelled training data (tab-separated; quoting=3 is csv.QUOTE_NONE,
# so quote characters inside reviews are read verbatim).
train = pd.read_csv("data_movies/train_data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

# Training, timed with wall-clock time. time.clock() returned CPU time on
# Unix and no longer exists in Python >= 3.8, so time.time() is used instead.
tps1 = time.time()
rf_clf = rf_clf.fit(train["review"], train["sentiment"])
tps2 = time.time()

# Single formatted string: prints the same text under Python 2 and Python 3.
print("Apprentissage réalisé en {} secondes".format(tps2 - tps1))

Apprentissage réalisé en 151.862705168 secondes


#### 2.2 Analyse des fichiers TripAdvisor

##### 2.2.1 Prédictions

In [6]:
# Score every hotel file with the random forest pipeline and write one CSV of
# (review id, predicted sentiment) per hotel into data/results/.
# NOTE: files are named after the hotel id only, so any result file already
# present in data/results/ for the same hotel is overwritten.
results_dir = "data/results/"
if not os.path.isdir(results_dir):
    os.makedirs(results_dir)  # to_csv does not create missing directories

# Keep only *.json entries (anything else would make json.load fail) and sort
# them: os.listdir returns entries in arbitrary order.
file_list = sorted(f for f in os.listdir(path_to_data) if f.endswith(".json"))

tps1 = time.time()  # wall-clock; time.clock() is gone in Python >= 3.8
for f in file_list:

    with open(os.path.join(path_to_data, f)) as data:
        dt = json.load(data)

    hotel_id = dt["HotelInfo"]["HotelID"]
    review_id = [review["ReviewID"] for review in dt["Reviews"]]
    review_texts = [review["Content"] for review in dt["Reviews"]]

    if review_texts:
        predictions = rf_clf.predict(review_texts)  # random forest predictions
    else:
        # No review to score: still write a header-only CSV for this hotel.
        print("File {} is empty".format(f))
        predictions = []

    # format() also copes with a numeric HotelID, unlike string concatenation.
    f_out = os.path.join(results_dir, "{}.csv".format(hotel_id))
    hotel_results = pd.DataFrame(data={"id": review_id, "sentiment": predictions},
                                 columns=["id", "sentiment"])
    hotel_results.to_csv(f_out, index=False, quoting=3)

tps2 = time.time()
print("")
print("Done in  {}  seconds.".format(tps2 - tps1))

File 1166845.json is empty
File 119640.json is empty
File 152681.json is empty
File 152886.json is empty
File 152887.json is empty
File 152890.json is empty
File 152892.json is empty
File 152894.json is empty
File 152896.json is empty
File 153138.json is empty
File 154001.json is empty
File 154868.json is empty
File 185324.json is empty
File 186909.json is empty
File 186910.json is empty
File 209428.json is empty
File 219145.json is empty
File 219163.json is empty
File 286104.json is empty
File 303140.json is empty
File 506374.json is empty
File 642781.json is empty
File 668969.json is empty
File 735474.json is empty
File 81222.json is empty
File 81295.json is empty
File 951172.json is empty

Done in  691.749224237  seconds.


##### 2.2.2 Mauvais Hotels

In [7]:
# Report the hotels whose mean predicted sentiment is below 0.5 and the share
# of such hotels among all result files found in data/results/.
results_dir = "data/results/"
res_list = sorted(f for f in os.listdir(results_dir) if f.endswith(".csv"))

nb_hotel = 0
nb_bad = 0
for f in res_list:
    nb_hotel += 1
    my_hotel = pd.read_csv(os.path.join(results_dir, f), header=0, delimiter=",", quoting=3)
    sentiments = my_hotel["sentiment"]
    n_reviews = sentiments.count()

    # A hotel without any scored review cannot be judged; it still counts in
    # the "all hotels" denominator, as before.
    if n_reviews == 0:
        continue

    note = sentiments.mean()
    # The previous test `0.0 < mean < 0.5` skipped hotels whose reviews are
    # all negative (mean exactly 0.0); they are now reported as well.
    if note < 0.5:
        nb_bad += 1
        print("count:  {}".format(n_reviews))
        print("The hotel {} does not worth it. The note is {} .".format(f, note))

print("")
if nb_hotel:
    print("{} % of all hotels are bad".format(100.0 * nb_bad / nb_hotel))
else:
    # Avoid a ZeroDivisionError when no result file has been produced yet.
    print("No result file found in {}".format(results_dir))

count:  48
The hotel 100506.csv does not worth it. The note is 0.479166666667 .
count:  52
The hotel 100508.csv does not worth it. The note is 0.442307692308 .
count:  5
The hotel 100531.csv does not worth it. The note is 0.4 .
count:  78
The hotel 100584.csv does not worth it. The note is 0.307692307692 .
count:  71
The hotel 100605.csv does not worth it. The note is 0.338028169014 .
count:  10
The hotel 1006218.csv does not worth it. The note is 0.4 .
count:  9
The hotel 1008216.csv does not worth it. The note is 0.333333333333 .
count:  21
The hotel 1016537.csv does not worth it. The note is 0.380952380952 .
count:  8
The hotel 1016578.csv does not worth it. The note is 0.25 .
count:  35
The hotel 1019217.csv does not worth it. The note is 0.457142857143 .
count:  11
The hotel 1023603.csv does not worth it. The note is 0.363636363636 .
count:  21
The hotel 1024019.csv does not worth it. The note is 0.380952380952 .
count:  20
The hotel 1024331.csv does not worth it. The note is 0.25

In [None]:
def _mean_rating(rating):
    """Return the mean of one review's rating values, ignoring -1 entries.

    `rating` is a dict of rating values (numbers or numeric strings). Values
    equal to -1 are left out of the mean (presumably a "not rated" marker --
    TODO confirm against the data set). Returns NaN when no value is left,
    instead of raising ZeroDivisionError.
    """
    # float() instead of int(): also accepts decimal strings such as "5.0",
    # and yields the same mean for integer-valued ratings.
    values = [float(value) for value in rating.values()]
    valid = [value for value in values if value != -1]
    if not valid:
        return float("nan")
    return sum(valid) / len(valid)


# Load one hotel file and compute the mean rating of each of its reviews.
with open(os.path.join(path_to_data, "98712.json")) as data:
    dt = json.load(data)

ratings = [review["Ratings"] for review in dt["Reviews"]]

# One mean per review, in the order of dt["Reviews"].
k = [_mean_rating(rating) for rating in ratings]

print(k)