# Customer Feedback Data Visualization

In [1]:
# Required packages
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
%matplotlib inline
import io
import base64
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from collections import Counter
from wordcloud import WordCloud

In [2]:
# %pip install nbformat

In [3]:
# Téléchargement des données NLTK
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('punkt')
    nltk.download('stopwords')

In [4]:
# Load data
df = pd.read_csv("../data/full/full_reviews.csv")
df.info()
df.head()
df.tail()
df.describe()
df.columns
df.dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7578 entries, 0 to 7577
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     7578 non-null   object 
 1   title                  7578 non-null   object 
 2   review                 7578 non-null   object 
 3   rating                 7578 non-null   int64  
 4   reply                  4158 non-null   object 
 5   experienceDate         7578 non-null   object 
 6   createdDateTime        7578 non-null   object 
 7   publishedDate          7578 non-null   object 
 8   replyPublishedDate     4158 non-null   object 
 9   reviewExperienceDelay  7578 non-null   float64
 10  date                   7578 non-null   object 
 11  year                   7578 non-null   int64  
 12  yearQuarter            7578 non-null   object 
 13  yearMonth              7578 non-null   object 
 14  month                  7578 non-null   int64  
 15  mont

id                        object
title                     object
review                    object
rating                     int64
reply                     object
experienceDate            object
createdDateTime           object
publishedDate             object
replyPublishedDate        object
reviewExperienceDelay    float64
date                      object
year                       int64
yearQuarter               object
yearMonth                 object
month                      int64
monthName                 object
day                        int64
dayName                   object
hour                       int64
replyYear                float64
replyMonth               float64
replyDay                 float64
replyHour                float64
reviewLength               int64
titleLength                int64
sentiment                 object
dtype: object

## Prepare data

In [5]:
# sort value by date
df.sort_values(by="createdDateTime", ascending=True, inplace=True)

df.head(2)

Unnamed: 0,id,title,review,rating,reply,experienceDate,createdDateTime,publishedDate,replyPublishedDate,reviewExperienceDelay,...,day,dayName,hour,replyYear,replyMonth,replyDay,replyHour,reviewLength,titleLength,sentiment
1565,5a86cec5d27b0a0594cdf500,Super bien mais trop de mails,note précédente expérience achat livraison pro...,4,,2018-02-16 12:29:57,2018-02-16 12:29:57,2018-02-16 12:29:57,,0.0,...,16,Friday,12,,,,,281,29,positive
1564,5a86d126d27b0a052072f799,J'ai reçu le portable très rapidement…,reçu portable très rapidement très état choisi...,5,,2018-02-16 12:40:06,2018-02-16 12:40:06,2018-02-16 12:40:06,,0.0,...,16,Friday,12,,,,,159,38,positive


In [6]:
# Prepare data

# Convert date columns to datetime
df["createdDateTime"] = pd.to_datetime(df["createdDateTime"])
#df["createdDateTime"] = df["createdDateTime"].dt.tz_localize(None)
df["publishedDate"] = pd.to_datetime(df["publishedDate"])
try:
    df["replyPublishedDate"] = pd.to_datetime(df["replyPublishedDate"])
except:
    df["replyPublishedDate"] = None
df["experienceDate"] = pd.to_datetime(df["experienceDate"])



## Reviews Volume

In [7]:
# Yearly Review Volume and Average Rating
data = df.groupby(["year"]).agg(
    review_volume = ("id", "count"), # Count reviews
    average_rating = ("rating", "mean") # Average rating
).reset_index()

fig_table = go.Figure(
    data=[
        go.Table(
            header=dict(
                values=["Year", "Review Volume", "Average Rating"],  # Column headers
                fill_color="lightblue",  # Header background color
                align="center",  # Center-align text
                font=dict(size=14, color="black")  # Font style
            ),
            cells=dict(
                values=[
                    data["year"],  # Year column
                    data["review_volume"],  # Review volume column
                    data["average_rating"].round(2)  # Average rating (rounded to 2 decimals)
                ],
                fill_color="white",  # Cell background color
                align="center",  # Center-align text
                font=dict(size=12, color="black")  # Font style
            )
        )
    ]
)
fig_table.update_layout(
    title_text="Review Volume and Average Rating Per Year",  # Title
    title_x=0.5,  # Center-align the title
    margin=dict(l=20, r=20, t=50, b=20)  # Adjust margins
)
fig_table.show()


In [8]:
# Daily Review Volume Trend
data = df.groupby("date")["id"].count()
daily_rev_trend_fig = px.line(data,  title="Daily Review Volume Trend Over Time")
daily_rev_trend_fig.show()


In [9]:
# Monthly Review Volume Trend
data = df.groupby("yearMonth")["id"].count()
monthly_rev_trend_fig = px.line(data,  title="Monthly Review Volume Trend")
monthly_rev_trend_fig.show()

## Sentiment Analysis

In [10]:
# Sentiment répartition (Pie chart)
sent_rep_fig = px.pie(df, names="sentiment", title="Sentiment Repartition")
sent_rep_fig.show()

In [11]:
# Sentiment Analysis
sent_dist_fig = px.histogram(df, x="sentiment", color = "sentiment", title="Sentiment Distribution")
sent_dist_fig.show()

In [12]:
# Yearly Sentiment trend
data = df.groupby("year")["sentiment"].value_counts().unstack().fillna(0)
yearly_sent_trend_fig = px.bar(data, title="Yearly Sentiment Trend Over Time")
yearly_sent_trend_fig.show()

In [13]:
df["rating"].value_counts().reset_index()

Unnamed: 0,rating,count
0,5,5586
1,4,973
2,1,584
3,3,266
4,2,169


In [14]:
# Rating Distribution
rating_counts = df["rating"].value_counts().reset_index()
rating_bar = px.bar(rating_counts, x="rating", y="count", title="Rating Distribution")
rating_bar.show()

sentiment_pie = px.pie(df, names="sentiment", title="Sentiment Distribution")
sentiment_pie.show()

In [35]:
# Quaterly Sentiment trend
data = df.groupby("yearQuarter")["sentiment"].value_counts().unstack().fillna(0)
quaterly_sent_trend_fig = px.bar(data, title="Quaterly Sentiment Trend")
quaterly_sent_trend_fig.show()

In [16]:
# Sentiment count by month
data = df.groupby("yearMonth")["sentiment"].value_counts().unstack().fillna(0)
fig = px.bar(data, title="Sentiment count by month", barmode="stack")
fig.show()

In [17]:
# Daily Sentiment Trend
data = df.groupby("date")["sentiment"].value_counts().unstack().fillna(0)

sent_trend_fig = px.line(data,  title="daily Sentiment Trend Over Time")
sent_trend_fig.show()

## Review Rating Analysis

In [18]:
# Sentiment repartition (Pie chart)
rating_rep_fig = px.pie(df, names="rating", title="Rating Repartition")
rating_rep_fig.show()

In [19]:
# YEARLY Rating Trend
data = df.groupby("year")["rating"].value_counts().unstack().fillna(0)
yearly_rating_trend_fig = px.bar(data, title="Yearly Rating Trend")
yearly_rating_trend_fig.show()

In [20]:
# Quaterly Rating Trend

data = df.groupby("yearQuarter")["rating"].value_counts().unstack().fillna(0)
quaterly_rating_trend_fig = px.bar(data, title="Quaterly Rating Trend")
quaterly_rating_trend_fig.show()

In [21]:
# Monthly Rating Trend
data = df.groupby("yearMonth")["rating"].value_counts().unstack().fillna(0)
monthly_rating_trend_fig = px.line(data, title="Monthly Rating Trend", markers=True)
monthly_rating_trend_fig.show()

In [22]:
# Daily Rating Trend
data = df.groupby("date")["rating"].value_counts().unstack().fillna(0)
daily_rating_trend_fig = px.line(data, title="Daily Rating Trend")
daily_rating_trend_fig.show()

In [23]:
# Daily Average Rating
data = df.groupby("date")["rating"].mean()
daily_avg_rating_fig = px.line(data, title="Daily Average Rating")
daily_avg_rating_fig.show()

In [24]:
# Monthly Average Rating
data = df.groupby("yearMonth")["rating"].mean()
monthly_avg_rating_fig = px.line(data, title="Monthly Average Rating", markers=True)
monthly_avg_rating_fig.show()

In [25]:
# Yearly Average Rating
data = df.groupby("year")["rating"].mean()
yearly_avg_rating_fig = px.bar(data, title="Yearly Average Rating")
yearly_avg_rating_fig.show()

## Reply & Response Time Analysis

In [26]:
fig9 = px.bar(df["reply"].notna().value_counts(), title="Reviews with and without Reply")
fig9.show()
data = df[df["reply"].notna()].copy()
data["replyPublishedDate"] = pd.to_datetime(data["replyPublishedDate"])
data["publishedDate"] = pd.to_datetime(data["publishedDate"])
data["replyPublishedDate"] = pd.to_datetime(data["replyPublishedDate"])
data["replyDelay"] = (data["replyPublishedDate"] - data["publishedDate"]).dt.total_seconds().div(60*60).round(2)
data["publishedDate"] = pd.to_datetime(data["publishedDate"])
fig10 = px.histogram(data, x="replyDelay", title="Reply Time Distribution (Hours)", nbins=50)
fig10.show()

In [27]:
df["replyPublishedDate"]

1565    None
1564    None
1563    None
1562    None
1561    None
        ... 
5409    None
5408    None
5407    None
5406    None
5405    None
Name: replyPublishedDate, Length: 7578, dtype: object

In [28]:

df["replyPublishedDate"].notna().sum()

np.int64(0)

## Review Lenght Analysis

In [29]:
# 8. Review Length Analysis
fig16 = px.histogram(df, x="reviewLength", title="Distribution of Review Lengths", nbins=50)

fig17 = px.box(df, x="sentiment", y="reviewLength", title="Review Length per Sentiment")

fig18 = px.box(df, x="rating", y="reviewLength", title="Review Length per Sentiment")

fig19 = px.scatter(df, x="reviewLength", y="rating", title="Review Length vs Rating")

fig20 = px.imshow(df[["reviewLength", "rating"]].corr(), title="Correlation Matrix: Review Length vs Rating")
# Show new figures
for fig in [fig16, fig17, fig18, fig19, fig20]:
    fig.show()

## Textual Analysis

In [30]:
# Fonction pour générer un nuage de mots avec WordCloud
def plot_ngram_wordcloud_dash(text, n=3, title="Nuage de mots"):
    if not isinstance(text, str) or not text.strip():
        return html.Div("Aucun texte valide fourni pour le nuage de mots")
    
    try:
        tokens = nltk.word_tokenize(text.lower())
        ngrams_list = list(ngrams(tokens, n))
        if not ngrams_list:
            return html.Div("Aucun n-gramme valide trouvé dans le texte")
        
        ngram_freq = Counter(ngrams_list)
        ngram_text = {" ".join(k): v for k, v in ngram_freq.items()}
        
        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            min_font_size=10
        ).generate_from_frequencies(ngram_text)
        
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.title(title)
        plt.axis("off")
        
        buf = io.BytesIO()
        plt.savefig(buf, format="png", bbox_inches='tight')
        buf.seek(0)
        img_base64 = base64.b64encode(buf.getvalue()).decode('utf-8')
        buf.close()
        plt.close()
        
        return html.Img(src=f"data:image/png;base64,{img_base64}", style={'width': '100%', 'maxWidth': '800px'})
    except Exception as e:
        return html.Div(f"Erreur lors de la génération du nuage de mots : {str(e)}")


In [31]:
def filter_data(years, quarters, months):
    filtered_data = df.copy()
    if years:
        filtered_data = filtered_data[filtered_data["year"].isin(years)]
    if quarters:
        filtered_data = filtered_data[filtered_data["yearQuarter"].isin(quarters)]
    if months:
        filtered_data = filtered_data[filtered_data["yearMonth"].isin(months)]
    return filtered_data if not filtered_data.empty else None

In [32]:

filtered_data = filter_data(years, quarters, months)

NameError: name 'years' is not defined

In [None]:


sentiment_pie = px.pie(filtered_data, names="sentiment", title="Sentiment Distribution", 
                           color_discrete_sequence=px.colors.qualitative.Pastel, hole=0.3)
    sentiment_pie.update_traces(textinfo="percent+label", hoverinfo="label+percent+value")