In [1]:
# the path for data is data/json/
import json
import os
import random
import time
import timeit

import numpy as np  # NumPy, for computing the rate of correct predictions
import pandas as pd  # pandas, for a convenient (DataFrame) format for the data
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer  # tool for processing text
from sklearn.pipeline import Pipeline

# Directory containing the hotel review JSON files. The trailing slash matters:
# later cells build file paths by plain string concatenation with this value.
path_to_data = "data/json/"

In [None]:
# Pipeline definition: TF-IDF features (English stop words removed, at most
# 3000 terms) followed by a 5-cluster k-means.
# * `n_jobs` is deliberately not passed: 1 was its default value and the
#   parameter no longer exists in recent scikit-learn releases.
# * `random_state` is fixed (same value as the NMF model further down) so that
#   re-running the notebook produces the same clusters.
km_clf = Pipeline([
    ('vect', TfidfVectorizer(analyzer="word", stop_words='english', max_features=3000)),
    ('clf', KMeans(n_clusters=5, init='k-means++', max_iter=20, n_init=5, random_state=1)),
])

In [None]:
# Training with k-means: the pipeline is re-fitted on each hotel's reviews and
# one JSON record per hotel is written to the results file, one record per line
# (JSON Lines) so the file can be parsed back record by record.
file_list = os.listdir(path_to_data)

# Hotels whose number of reviews does not exceed the number of clusters are not
# clustered; their reviews get the placeholder label 'null' instead.
min_reviews = km_clf.named_steps['clf'].n_clusters

f_out = "data/clustering_results/result.json"
out_dir = os.path.dirname(f_out)
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# timeit.default_timer replaces time.clock(), which no longer exists in recent
# Python versions and breaks once a later cell rebinds the name `time`.
tps1 = timeit.default_timer()

with open(f_out, 'w') as out:
    for f in file_list:
        with open(os.path.join(path_to_data, f)) as data:
            dt = json.load(data)

        review_id = [review["ReviewID"] for review in dt["Reviews"]]
        review_to_anayse = [review["Content"] for review in dt["Reviews"]]

        if len(review_to_anayse) > min_reviews:
            km_clf.fit(review_to_anayse)
            # labels_ contains NumPy integers, which the json module cannot
            # serialise; tolist() converts them to plain Python ints.
            labels = km_clf.named_steps['clf'].labels_.tolist()
        else:
            print("File %s has too few reviews to be clustered" % f)
            labels = ['null'] * len(review_to_anayse)

        output = {"HotelID": dt["HotelInfo"]["HotelID"],
                  "Review_Clustering": dict(zip(review_id, labels))}
        json.dump(output, out)
        out.write("\n")  # record separator

tps2 = timeit.default_timer()
print("")
print("Done in %s seconds." % (tps2 - tps1))

In [None]:
# Clustering with Non-Negative Matrix Factorization (NMF)
# NOTE: the import below rebinds the name `time` from the module imported in
# the first cell to the function time.time; after this cell has run,
# `time.clock()` / `time.time()` can no longer be called through that name.
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF

n_samples = 1000   # number of hotel files drawn at random
n_features = 50    # max_features given to the TF-IDF vectorizer
n_topics = 10      # number of NMF components
n_top_words = 5    # words printed for each topic

file_list = os.listdir(path_to_data)
# random.sample raises ValueError when asked for more items than are available,
# so the sample size is capped at the number of files actually present.
samples_id = random.sample(file_list, min(n_samples, len(file_list)))

def print_top_words(model, feature_names, n_top_words):
    """Print, for every topic of ``model``, its highest-weighted words.

    Args:
        model: object exposing ``components_``, iterated row by row
            (one row per topic).
        feature_names: sequence mapping a column index to its word.
        n_top_words: number of words printed for each topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Column indices ordered from the largest weight to the smallest.
        ranked = weights.argsort()[::-1]
        top_words = [feature_names[col] for col in ranked[:n_top_words]]
        print("Topic #%d:" % topic_idx)
        print(" ".join(top_words))
# Gather the review texts of every sampled hotel into one flat list.
reviews = []
for f in samples_id:
    with open(path_to_data + f) as data:
        dt = json.load(data)

    reviews.extend(review["Content"] for review in dt["Reviews"])

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
# scikit-learn's built-in English stop-word list is used, as in the k-means
# pipelines; the previously referenced `english_stop_words` is never defined in
# the visible notebook cells and raised a NameError on a fresh kernel.
tfidf_vectorizer = TfidfVectorizer(analyzer="word", stop_words='english', max_features=n_features)

t0 = time()
tfidf = tfidf_vectorizer.fit_transform(reviews)
print("done in %0.3fs." % (time() - t0))

# Fit the NMF model
print("Fitting the NMF model with tf-idf features,""n_samples=%d and n_features=%d..."% (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_topics, tol=1e-5, random_state=1).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model:")
# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; use whichever this installation provides.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from urllib2 import urlopen 
import numpy as np
%matplotlib inline
# Plot it out
fig, ax = plt.subplots()
heatmap = ax.pcolor(nmf.components_, cmap=plt.cm.Blues, alpha=0.8)

##################################################
## FORMAT ##
##################################################

fig = plt.gcf()
fig.set_size_inches(15,17)

# turn off the frame
ax.set_frame_on(False)

# put the major ticks at the middle of each cell
ax.set_yticks(np.arange(nmf.components_.shape[0])+0.5, minor=False)
ax.set_xticks(np.arange(nmf.components_.shape[1])+0.5, minor=False)

# want a more natural, table-like display
ax.invert_yaxis()
ax.xaxis.tick_top()

# Set the labels

# label source:https://en.wikipedia.org/wiki/Basketball_statistics
labels_x = [word for word in tfidf_feature_names]
labels_y = ['Topic '+ str(i) for i in range(nmf.components_.shape[0])]
# note I could have used nba_sort.columns but made "labels" instead
ax.set_xticklabels(labels_x, minor=False) 
ax.set_yticklabels(labels_y, minor=False)

# rotate the 
plt.xticks(rotation=90)

ax.grid(False)

# Turn off all the ticks
ax = plt.gca()

for t in ax.xaxis.get_major_ticks(): 
    t.tick1On = False 
    t.tick2On = False 
for t in ax.yaxis.get_major_ticks(): 
    t.tick1On = False 
    t.tick2On = False  

In [None]:
# Pipeline definition: k-means whose initial centroids are the NMF topics.
# * n_clusters must equal the number of rows of the init array, otherwise
#   KMeans keeps its default of 8 clusters and rejects the 10 NMF centroids.
# * The vectorizer reuses the vocabulary fitted for the NMF model so that each
#   feature column means the same word as the matching centroid column; a
#   vocabulary learned per hotel would not line up with nmf.components_.
# * n_init=1: with explicit initial centres only one initialisation is run.
n_clusters = nmf.components_.shape[0]
km_clf = Pipeline([
    ('vect', TfidfVectorizer(analyzer="word", stop_words='english',
                             vocabulary=tfidf_vectorizer.vocabulary_)),
    ('clf', KMeans(n_clusters=n_clusters, init=nmf.components_, max_iter=20, n_init=1)),
])

# Training with k-means
file_list = os.listdir(path_to_data)

# List of files to skip (the notebook does not record why)
to_avoid = ['1153745.json', '231512.json', '258610.json']

# Upper bound on the number of hotel files processed (the original counter
# loop stopped just before handling its 5000th file).
max_files = 4999

out_dir = "data/clustering_results/"
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# timeit.default_timer is used instead of time.clock(): the NMF cell rebinds
# the name `time` to a function ("from time import time"), and time.clock no
# longer exists in recent Python versions.
tps1 = timeit.default_timer()

for f in file_list[:max_files]:
    with open(path_to_data + f) as data:
        dt = json.load(data)

    review_id = [review["ReviewID"] for review in dt["Reviews"]]
    review_to_anayse = [review["Content"] for review in dt["Reviews"]]

    # The content is CSV despite the .json extension; the name is kept because
    # the next cell reads every file of this directory with pd.read_csv.
    f_out = out_dir + dt["HotelInfo"]["HotelID"] + ".json"

    if len(review_to_anayse) > n_clusters and f not in to_avoid:
        km_clf.fit(review_to_anayse)
        clusters = km_clf.named_steps['clf'].labels_
    else:
        # Too few reviews (or a skipped file): placeholder label for every review
        clusters = ['null'] * len(review_to_anayse)

    # quoting=3 is csv.QUOTE_NONE
    pd.DataFrame(data={"id": review_id, "cluster": clusters}).to_csv(f_out, index=False, quoting=3)

tps2 = timeit.default_timer()
print("")
print("Done in %s seconds." % (tps2 - tps1))

In [None]:
# Count, for each of the first hotels found in the results directory, how many
# reviews fall into each cluster; `heat` gets one row of n_topics counts per hotel.
file_list = os.listdir("data/clustering_results/")

max_hotels = 30  # the original counter loop stopped after 30 files

heat = []
for f in file_list:
    if len(heat) == max_hotels:
        break

    # result.json is the JSON output of the first k-means cell, not a
    # per-hotel CSV, so it cannot be read as a cluster table.
    if f == "result.json":
        continue

    data = pd.read_csv("data/clustering_results/" + f, header=0, delimiter=",")

    # read_csv parses the 'null' placeholder as NaN by default, so comparing
    # against the string 'null' never matches; drop those rows explicitly and
    # cast the remaining labels to int so they can index the counter list.
    labels = data["cluster"].dropna().astype(int)

    clusters = [0] * n_topics
    for label in labels:
        clusters[label] += 1

    heat.append(clusters)

In [None]:
# Heatmap of the per-hotel cluster counts: one row per hotel, one column per
# topic, darker cells meaning more reviews.
heat = np.array(heat)
fig, ax = plt.subplots()
heatmap = ax.pcolor(heat, cmap=plt.cm.Blues, alpha=0.8)

##################################################
## FORMAT ##
##################################################

fig.set_size_inches(15, 17)

# turn off the frame
ax.set_frame_on(False)

# put the major ticks at the middle of each cell
ax.set_yticks(np.arange(heat.shape[0]) + 0.5, minor=False)
ax.set_xticks(np.arange(heat.shape[1]) + 0.5, minor=False)

# want a more natural, table-like display
ax.invert_yaxis()
ax.xaxis.tick_top()

# Set the labels: topic numbers along x, hotel numbers (row order of `heat`) along y
labels_x = ['Topic ' + str(i) for i in range(n_topics)]
labels_y = ['Hotel ' + str(i) for i in range(heat.shape[0])]
ax.set_xticklabels(labels_x, minor=False)
ax.set_yticklabels(labels_y, minor=False)

# rotate the x tick labels
plt.xticks(rotation=90)

ax.grid(False)

# Hide all tick marks (the labels stay). The tick lines are hidden through
# their artists because the Tick.tick1On / tick2On attributes are deprecated
# and later removed in Matplotlib, where assigning them has no effect.
for tick in ax.xaxis.get_major_ticks() + ax.yaxis.get_major_ticks():
    tick.tick1line.set_visible(False)
    tick.tick2line.set_visible(False)