In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from textblob import TextBlob
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier




import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the NLTK data packages this notebook relies on:
#   stopwords  - English stop-word list used by clean_text
#   punkt      - sentence/word tokenizer models used by word_tokenize
#   punkt_tab  - tokenizer tables that recent NLTK releases look up instead of
#                'punkt' (older releases just report it as unknown and continue)
#   wordnet    - lexical database used by WordNetLemmatizer
for _nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(_nltk_resource)

In [None]:
# Tripadvisor reviews: read the CSV export and preview the first five rows
tripadvisor_df = pd.read_csv('Tripadvisor_data_source.csv')
tripadvisor_df.head()

In [None]:
# Yelp reviews: read the CSV export and preview the first five rows
yelp_df = pd.read_csv('Yelp_data_source.csv')
yelp_df.head()

In [None]:
# Trustpilot reviews: read the CSV export with an explicit ISO-8859-1 decoding
# ('latin1' names the same codec) and preview the first five rows.
# NOTE(review): presumably this file is not valid UTF-8 - confirm.
trustpilot_df = pd.read_csv(
    'trustpilot_data_source.csv',
    encoding='ISO-8859-1',
)
trustpilot_df.head()


In [None]:
# Print column names, dtypes and non-null counts for every platform's frame
for _frame in (trustpilot_df, yelp_df, tripadvisor_df):
    _frame.info()



In [None]:
# Basic cleaning, applied identically to every platform:
#   1. drop exact duplicate rows,
#   2. replace every missing value with the placeholder 'Unknown',
#   3. parse 'Review Date' into datetimes; values that cannot be parsed
#      (including the 'Unknown' placeholder) become NaT.
# NOTE(review): fillna('Unknown') is applied to all columns, so any numeric
# column that had gaps now holds a mix of numbers and the string 'Unknown'.
tripadvisor_df = tripadvisor_df.drop_duplicates().fillna('Unknown')
yelp_df = yelp_df.drop_duplicates().fillna('Unknown')
trustpilot_df = trustpilot_df.drop_duplicates().fillna('Unknown')

for _frame in (tripadvisor_df, yelp_df, trustpilot_df):
    _frame['Review Date'] = pd.to_datetime(_frame['Review Date'], errors='coerce')



In [None]:

 # Convert the 'Star Rating' columns to numeric by extracting digits and converting to float
# Pattern for the first digit of the rating text. It is a raw string so that
# `\d` reaches the regex engine instead of being treated as an (invalid)
# Python string escape, which newer interpreters warn about.
# NOTE(review): only a single digit is captured, so a value such as '4.5'
# becomes 4.0 and '10' becomes 1.0 - confirm ratings are single-digit.
_FIRST_DIGIT = r'(\d)'

# expand=False makes str.extract return a Series (one value per row) rather
# than a one-column DataFrame, which is what a column assignment expects.
# Rows without any digit (e.g. the 'Unknown' placeholder) become NaN.
trustpilot_df['Star Rating'] = (
    trustpilot_df['Star Rating'].astype(str).str.extract(_FIRST_DIGIT, expand=False).astype(float)
)
yelp_df['Star Rating'] = (
    yelp_df['Star Rating'].astype(str).str.extract(_FIRST_DIGIT, expand=False).astype(float)
)
tripadvisor_df['Star Rating'] = (
    tripadvisor_df['Star Rating'].astype(str).str.extract(_FIRST_DIGIT, expand=False).astype(float)
)
# Display the first few rows to ensure the conversion worked
trustpilot_df[['Star Rating']].head()

In [None]:
# Missing values per column (Tripadvisor)
tripadvisor_df.isna().sum()


In [None]:
# Missing values per column (Yelp)
yelp_df.isna().sum()


In [None]:
# Missing values per column (Trustpilot)
trustpilot_df.isna().sum()

In [None]:
# Remove rows that are missing the fields the analysis depends on.
# Yelp is filtered from its OWN frame: the previous version assigned
# `tripadvisor_df.dropna(...)` to `yelp_df`, which silently replaced the Yelp
# reviews with Tripadvisor data for every later cell.
# NOTE(review): tripadvisor_df itself is not filtered here (same as before);
# confirm whether it should also drop rows without a date or rating.
yelp_df = yelp_df.dropna(subset=['Review Date', 'Star Rating'])
trustpilot_df = trustpilot_df.dropna(subset=['Review Date'])


In [None]:
# Descriptive statistics for every column (numeric and non-numeric) of each
# platform's frame; the trailing tuple displays all three summaries.
tripadvisor_summary, trustpilot_summary, yelp_summary = (
    frame.describe(include='all')
    for frame in (tripadvisor_df, trustpilot_df, yelp_df)
)

tripadvisor_summary, trustpilot_summary, yelp_summary

In [None]:
# Number of reviews per star rating on each platform (most frequent first)
tripadvisor_ratings, trustpilot_ratings, yelp_ratings = (
    frame['Star Rating'].value_counts()
    for frame in (tripadvisor_df, trustpilot_df, yelp_df)
)

tripadvisor_ratings, trustpilot_ratings, yelp_ratings


In [None]:
# One bar chart per platform showing how many reviews received each star rating
fig, ax = plt.subplots(1, 3, figsize=(25, 8))

_rating_panels = (
    ('Tripadvisor', tripadvisor_ratings, 'blue'),
    ('Trustpilot', trustpilot_ratings, 'green'),
    ('Yelp', yelp_ratings, 'red'),
)

for _axis, (_platform, _counts, _colour) in zip(ax, _rating_panels):
    # Ratings are converted to strings so they are drawn as categories
    _axis.bar(_counts.index.astype(str), _counts.values, color=_colour)
    _axis.set_title(f'{_platform} Rating Distribution')
    _axis.set_xlabel('Star Rating')
    _axis.set_ylabel('Number of Reviews')

plt.tight_layout()
plt.show()

In [None]:
# Count how often each 'Overall Rating' value occurs on every platform
tripadvisor_overall_rating, trustpilot_overall_rating, yelp_overall_rating = (
    frame['Overall Rating'].value_counts()
    for frame in (tripadvisor_df, trustpilot_df, yelp_df)
)

# One bar chart per platform for the overall rating counts
fig, ax = plt.subplots(1, 3, figsize=(20, 8))

_overall_panels = (
    ('Tripadvisor', tripadvisor_overall_rating, 'blue'),
    ('Trustpilot', trustpilot_overall_rating, 'green'),
    ('Yelp', yelp_overall_rating, 'red'),
)

for _axis, (_platform, _counts, _colour) in zip(ax, _overall_panels):
    # Values are converted to strings so they are drawn as categories
    _axis.bar(_counts.index.astype(str), _counts.values, color=_colour, width=0.5)
    _axis.set_title(f'{_platform} Overall Rating Distribution')
    _axis.set_xlabel('Overall Rating')
    _axis.set_ylabel('Number of Reviews')

plt.tight_layout()
plt.show()

In [None]:
# NLP

In [None]:
# TEXT PREPROCESSING

In [None]:
import nltk
# Troubleshooting: if the tokenizer data cannot be found, download 'punkt'
# manually from https://www.nltk.org/nltk_data/ and place it under
# C:/nltk_data/tokenizers/punkt/ to resolve the error.

# Download the NLTK stop-word lists (a no-op message is printed if present)
nltk.download('stopwords')

# English stop words as a set, giving fast membership checks in clean_text
stop_words = set(stopwords.words('english'))

# Text preprocessing helper
def clean_text(text):
    """Normalise one review into a space-separated string of content words.

    The text is lower-cased, every character other than a-z and whitespace is
    removed, the remainder is tokenised with NLTK's ``word_tokenize`` and
    tokens found in the module-level ``stop_words`` set are dropped.

    Args:
        text: The raw review as a ``str``.

    Returns:
        The remaining tokens joined by single spaces (possibly an empty string).
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    kept_tokens = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# For every platform add two columns:
#   Cleaned_Review_Text - the review after clean_text (values are passed
#                         through str() first so non-string cells are accepted)
#   Review_Length       - number of whitespace-separated words left after cleaning
for _reviews in (tripadvisor_df, trustpilot_df, yelp_df):
    _reviews['Cleaned_Review_Text'] = _reviews['Review Text'].map(lambda raw: clean_text(str(raw)))
    _reviews['Review_Length'] = _reviews['Cleaned_Review_Text'].map(lambda cleaned: len(cleaned.split()))

tripadvisor_df['Cleaned_Review_Text']

In [None]:
#Sentiment Analysis

In [None]:
# Sentiment analysis using TextBlob
from textblob import TextBlob

# Sentiment labelling helper based on TextBlob polarity
def get_sentiment(text):
    """Label a piece of text by the sign of its TextBlob polarity score.

    Args:
        text: The text to score.

    Returns:
        'Positive' when the polarity is above zero, 'Negative' when it is
        below zero, and 'Neutral' when it is exactly zero.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    return 'Neutral'

# Label every cleaned review with its sentiment class
for _reviews in (tripadvisor_df, trustpilot_df, yelp_df):
    _reviews['Sentiment'] = _reviews['Cleaned_Review_Text'].map(get_sentiment)

# Number of reviews per sentiment class on each platform (most frequent first)
tripadvisor_sentiment, trustpilot_sentiment, yelp_sentiment = (
    frame['Sentiment'].value_counts()
    for frame in (tripadvisor_df, trustpilot_df, yelp_df)
)

tripadvisor_sentiment, trustpilot_sentiment, yelp_sentiment

In [None]:
# One bar chart per platform with the number of reviews in each sentiment class
fig, ax = plt.subplots(1, 3, figsize=(20, 8))

_sentiment_panels = (
    ('Tripadvisor', tripadvisor_sentiment),
    ('Trustpilot', trustpilot_sentiment),
    ('Yelp', yelp_sentiment),
)

for _axis, (_platform, _counts) in zip(ax, _sentiment_panels):
    # Colours follow bar position (the value_counts order), not the label
    _axis.bar(_counts.index, _counts.values, color=['blue', 'orange', 'green'], width=0.5)
    _axis.set_title(f'{_platform} Sentiment Distribution')
    _axis.set_xlabel('Sentiment')
    _axis.set_ylabel('Number of Reviews')

plt.tight_layout()
plt.show()


In [None]:
# Topic Modeling

In [None]:
# Topic Modeling

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Topic extraction helper built on a bag-of-words model and LDA
def lda_topic_modeling(text_data, num_topics=5, num_words=10):
    """Fit an LDA topic model and return the strongest words of each topic.

    Args:
        text_data: Iterable of documents (strings) to model.
        num_topics: Number of LDA components to fit.
        num_words: How many top-weighted words to report per topic.

    Returns:
        A dict mapping 'Topic 1' ... 'Topic N' to a list of ``num_words``
        vocabulary terms, ordered from lowest to highest weight among the
        selected terms (the strongest term is last).
    """
    # Bag-of-words counts: ignore terms present in more than 95% of the
    # documents or in fewer than 2 documents, plus English stop words.
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    doc_term_matrix = vectorizer.fit_transform(text_data)

    # Fixed random_state keeps the fitted topics reproducible between runs
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(doc_term_matrix)

    vocabulary = vectorizer.get_feature_names_out()
    topics = {}
    for topic_number, term_weights in enumerate(lda.components_, start=1):
        # argsort is ascending, so the last `num_words` positions are the heaviest terms
        strongest_term_ids = term_weights.argsort()[-num_words:]
        topics[f'Topic {topic_number}'] = [vocabulary[term_id] for term_id in strongest_term_ids]

    return topics

# Fit a separate topic model on the cleaned reviews of each platform
tripadvisor_topics, trustpilot_topics, yelp_topics = (
    lda_topic_modeling(frame['Cleaned_Review_Text'])
    for frame in (tripadvisor_df, trustpilot_df, yelp_df)
)

tripadvisor_topics, trustpilot_topics, yelp_topics


# wordcloud

In [None]:
from wordcloud import WordCloud

# Build one word cloud per platform from all of its cleaned reviews joined together
tripadvisor_wordcloud, trustpilot_wordcloud, yelp_wordcloud = (
    WordCloud(width=800, height=400, background_color='white').generate(
        ' '.join(frame['Cleaned_Review_Text'])
    )
    for frame in (tripadvisor_df, trustpilot_df, yelp_df)
)

# Show the three clouds side by side without axes
fig, ax = plt.subplots(1, 3, figsize=(24, 12))

_cloud_panels = (
    ('Tripadvisor', tripadvisor_wordcloud),
    ('Trustpilot', trustpilot_wordcloud),
    ('Yelp', yelp_wordcloud),
)

for _axis, (_platform, _cloud) in zip(ax, _cloud_panels):
    _axis.imshow(_cloud, interpolation='bilinear')
    _axis.set_title(f'{_platform} Word Cloud')
    _axis.axis('off')

plt.tight_layout()
plt.show()


In [None]:
# Apply Named Entity Recognition (NER)

In [None]:
import spacy
# Load spaCy's small English pipeline; the entity-extraction helpers in this
# notebook read it through the module-level name `nlp`.
nlp = spacy.load("en_core_web_sm")

# Batched named-entity extraction
def ner_analysis_pipe(text_series):
    """Extract named entities from every text in a pandas Series.

    The texts are streamed through the module-level spaCy pipeline ``nlp`` in
    batches of 50 with the parser and tagger components disabled.

    Args:
        text_series: Series of review texts (converted with ``.tolist()``).

    Returns:
        A list with one entry per input text, each a list of
        ``(entity_text, entity_label)`` tuples in document order.
    """
    documents = nlp.pipe(text_series.tolist(), batch_size=50, disable=["parser", "tagger"])
    return [
        [(entity.text, entity.label_) for entity in document.ents]
        for document in documents
    ]


# Run NER on the raw (uncleaned) Tripadvisor review text and store the
# resulting list of (text, label) tuples per review in an 'Entities' column
tripadvisor_df['Entities'] = ner_analysis_pipe(tripadvisor_df['Review Text'])
# Display the review text next to its entities for the first two reviews
tripadvisor_df[['Review Text', 'Entities']].head(2)



In [None]:
# Run NER on the raw review text of the Trustpilot dataset
trustpilot_df['Entities'] = ner_analysis_pipe(trustpilot_df['Review Text'])
# A notebook cell only renders its LAST expression, so the Trustpilot preview
# is shown explicitly with display(); previously its result was discarded.
# (`display` is provided by the Jupyter/IPython kernel.)
display(trustpilot_df[['Review Text', 'Entities']].head())

# Run NER on the raw review text of the Yelp dataset
yelp_df['Entities'] = ner_analysis_pipe(yelp_df['Review Text'])
# Display the entities for the first few Yelp reviews (last expression of the cell)
yelp_df[['Review Text', 'Entities']].head()

In [None]:
from tqdm import tqdm

def ner_analysis_pipe_with_progress(text_series):
    """Extract named entities from every text while showing a tqdm progress bar.

    Works like a batched ``nlp.pipe`` run (batch size 50, parser and tagger
    disabled) over the module-level spaCy pipeline ``nlp``; the bar's total is
    the number of texts.

    Args:
        text_series: Series of review texts (converted with ``.tolist()``).

    Returns:
        A list with one entry per input text, each a list of
        ``(entity_text, entity_label)`` tuples in document order.
    """
    review_texts = text_series.tolist()
    documents = nlp.pipe(review_texts, batch_size=50, disable=["parser", "tagger"])
    return [
        [(entity.text, entity.label_) for entity in document.ents]
        for document in tqdm(documents, total=len(review_texts))
    ]

# Apply with progress tracking.
# NOTE(review): these three calls overwrite the 'Entities' columns that the
# earlier ner_analysis_pipe cells already computed from the same
# 'Review Text' input, so the whole NER pass runs a second time here.
tripadvisor_df['Entities'] = ner_analysis_pipe_with_progress(tripadvisor_df['Review Text'])
trustpilot_df['Entities'] = ner_analysis_pipe_with_progress(trustpilot_df['Review Text'])
yelp_df['Entities'] = ner_analysis_pipe_with_progress(yelp_df['Review Text'])


In [None]:

# Load the spaCy English model.
# NOTE(review): the same model is already loaded into `nlp` in the NER cell
# above; this rebinds the name to a freshly loaded pipeline.
nlp = spacy.load("en_core_web_sm")

# Key-phrase helper: a review's key phrases are its spaCy noun chunks
def extract_key_phrases_spacy(text):
    """Return the text of every noun chunk spaCy finds in ``text``.

    Args:
        text: A single review as a string; it is parsed with the module-level
            spaCy pipeline ``nlp``.

    Returns:
        A list of noun-chunk strings in document order.
    """
    parsed = nlp(text)
    key_phrases = []
    for noun_chunk in parsed.noun_chunks:
        key_phrases.append(noun_chunk.text)
    return key_phrases


# Apply spaCy key phrase extraction to the raw review text of each platform
tripadvisor_df['Key_Phrases'] = tripadvisor_df['Review Text'].apply(extract_key_phrases_spacy)
trustpilot_df['Key_Phrases'] = trustpilot_df['Review Text'].apply(extract_key_phrases_spacy)
yelp_df['Key_Phrases'] = yelp_df['Review Text'].apply(extract_key_phrases_spacy)


# Display the key phrases for the first few reviews of every platform.
# A notebook cell only renders its LAST expression, so the first two previews
# are shown explicitly with display(); previously their results were discarded.
# (`display` is provided by the Jupyter/IPython kernel.)
display(tripadvisor_df[['Review Text', 'Key_Phrases']].head())
display(trustpilot_df[['Review Text', 'Key_Phrases']].head())
yelp_df[['Review Text', 'Key_Phrases']].head()


In [None]:
from collections import Counter
# Visualisation helper for the extracted key phrases
def create_bar_chart(phrases, title):
    """Draw a horizontal bar chart of the ten most frequent key phrases.

    Args:
        phrases: Iterable of per-review phrase lists (e.g. a 'Key_Phrases'
            column); all inner lists are pooled before counting.
        title: Title placed above the chart.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # Tally every phrase across all reviews
    phrase_counts = Counter()
    for review_phrases in phrases:
        for phrase in review_phrases:
            phrase_counts[phrase] += 1

    # Keep the ten phrases with the highest frequency
    phrase_df = pd.DataFrame(phrase_counts.items(), columns=['Phrase', 'Frequency'])
    top_phrases = phrase_df.nlargest(10, 'Frequency')

    figure, axis = plt.subplots(figsize=(10, 5))
    axis.barh(top_phrases['Phrase'], top_phrases['Frequency'], color='skyblue')
    axis.set_xlabel('Frequency')
    axis.set_title(title)
    axis.invert_yaxis()  # put the most frequent phrase at the top
    plt.show()

# Draw the top-10 key phrase chart for each platform
for _platform, _reviews in (
    ('Tripadvisor', tripadvisor_df),
    ('Trustpilot', trustpilot_df),
    ('Yelp', yelp_df),
):
    create_bar_chart(_reviews['Key_Phrases'], f"Top 10 Key Phrases - {_platform}")


In [None]:
# Compare the share of each sentiment class across the three platforms.
# Requires tripadvisor_sentiment, trustpilot_sentiment and yelp_sentiment
# (the per-platform value_counts of the 'Sentiment' column).

# value_counts() orders its labels by frequency, and that order can differ per
# platform. Every series is therefore re-indexed to ONE shared label order
# before plotting; otherwise a bar could be drawn under the wrong tick label,
# and a platform lacking a class would produce a series of a different length.
sentiment_order = ['Negative', 'Neutral', 'Positive']

# Normalize sentiment counts to proportions (a missing class counts as 0)
tripadvisor_sentiment_norm = (
    tripadvisor_sentiment / tripadvisor_sentiment.sum()
).reindex(sentiment_order, fill_value=0)
trustpilot_sentiment_norm = (
    trustpilot_sentiment / trustpilot_sentiment.sum()
).reindex(sentiment_order, fill_value=0)
yelp_sentiment_norm = (
    yelp_sentiment / yelp_sentiment.sum()
).reindex(sentiment_order, fill_value=0)

fig, ax = plt.subplots(figsize=(10, 6))

# Three bars per sentiment class, shifted by one bar width each to avoid overlap
bar_width = 0.2
r1 = np.arange(len(sentiment_order))
r2 = [x + bar_width for x in r1]
r3 = [x + bar_width for x in r2]

# Plot each platform's sentiment data
ax.bar(r1, tripadvisor_sentiment_norm.values, color='blue', width=bar_width, label='Tripadvisor')
ax.bar(r2, trustpilot_sentiment_norm.values, color='orange', width=bar_width, label='Trustpilot')
ax.bar(r3, yelp_sentiment_norm.values, color='green', width=bar_width, label='Yelp')

# Adding labels and title; ticks sit under the middle bar of each group
ax.set_title('Sentiment Comparison Across Platforms')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Proportion of Reviews')
ax.set_xticks([r + bar_width for r in range(len(sentiment_order))])
ax.set_xticklabels(sentiment_order)
ax.legend()

plt.tight_layout()
plt.show()



In [None]:
# Relate sentiment to star ratings: mean 'Star Rating' within each sentiment
# class, computed separately for every platform
tripadvisor_avg_rating_by_sentiment, trustpilot_avg_rating_by_sentiment, yelp_avg_rating_by_sentiment = (
    frame.groupby('Sentiment')['Star Rating'].mean()
    for frame in (tripadvisor_df, trustpilot_df, yelp_df)
)

tripadvisor_avg_rating_by_sentiment, trustpilot_avg_rating_by_sentiment, yelp_avg_rating_by_sentiment





In [None]:
# visualize the Sentiment VS Star Rating

import matplotlib.pyplot as plt
import numpy as np

# Plot the average star rating per sentiment class for every platform.
#
# The values come from the *_avg_rating_by_sentiment results computed in the
# previous cell. Earlier this cell overwrote them with hand-copied numbers,
# which went stale whenever the data or cleaning changed (the pasted Yelp
# figures were identical to the Tripadvisor ones).
labels = ['Negative', 'Neutral', 'Positive']

# pd.Series(...) accepts both a Series and a plain dict, and reindex() puts the
# values in the same order as `labels` (a missing class becomes NaN, i.e. no bar).
# NOTE(review): these three names are also used earlier in the notebook for the
# star-rating value_counts; they are rebound here exactly as this cell did before.
tripadvisor_ratings = pd.Series(tripadvisor_avg_rating_by_sentiment).reindex(labels).tolist()
trustpilot_ratings = pd.Series(trustpilot_avg_rating_by_sentiment).reindex(labels).tolist()
yelp_ratings = pd.Series(yelp_avg_rating_by_sentiment).reindex(labels).tolist()

x = np.arange(len(labels))  # the label locations

# Set up the figure and axis
fig, ax = plt.subplots(figsize=(10, 6))
bar_width = 0.2

# One bar per platform, centred on each sentiment tick
ax.bar(x - bar_width, tripadvisor_ratings, width=bar_width, label='Tripadvisor', color='blue')
ax.bar(x, trustpilot_ratings, width=bar_width, label='Trustpilot', color='orange')
ax.bar(x + bar_width, yelp_ratings, width=bar_width, label='Yelp', color='green')

# Adding labels and titles
ax.set_xlabel('Sentiment')
ax.set_ylabel('Average Rating')
ax.set_title('Average Rating by Sentiment Across Platforms')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

# Adjust layout and display the plot
plt.tight_layout()
plt.show()


In [None]:
# Statistical Analysis
from scipy.stats import f_oneway, ttest_ind

# Group the ratings by sentiment class for the ANOVA and t-tests
def prepare_data_for_tests(data, sentiment_col='Sentiment', rating_col='Star Rating'):
    """Split the ratings of ``data`` into one Series per sentiment class.

    Args:
        data: DataFrame holding a sentiment label column and a rating column.
        sentiment_col: Name of the column with the labels 'Negative',
            'Neutral' and 'Positive'.
        rating_col: Name of the rating column.

    Returns:
        A tuple ``(negative, neutral, positive)`` of Series containing the
        non-missing ratings of the rows with the matching label; each keeps
        the original row index.
    """
    def ratings_for(label):
        # Rows with this label, rating column only, missing ratings removed
        return data.loc[data[sentiment_col] == label, rating_col].dropna()

    return tuple(ratings_for(label) for label in ('Negative', 'Neutral', 'Positive'))

# For each platform: a one-way ANOVA across the three sentiment groups, then
# independent two-sample t-tests for every pair of groups.

# --- Tripadvisor ---
_tripadvisor_groups = prepare_data_for_tests(tripadvisor_df)
tripadvisor_negative, tripadvisor_neutral, tripadvisor_positive = _tripadvisor_groups
anova_tripadvisor = f_oneway(*_tripadvisor_groups)
ttest_tripadvisor_neg_neu = ttest_ind(tripadvisor_negative, tripadvisor_neutral)
ttest_tripadvisor_neg_pos = ttest_ind(tripadvisor_negative, tripadvisor_positive)
ttest_tripadvisor_neu_pos = ttest_ind(tripadvisor_neutral, tripadvisor_positive)

# --- Trustpilot ---
_trustpilot_groups = prepare_data_for_tests(trustpilot_df)
trustpilot_negative, trustpilot_neutral, trustpilot_positive = _trustpilot_groups
anova_trustpilot = f_oneway(*_trustpilot_groups)
ttest_trustpilot_neg_neu = ttest_ind(trustpilot_negative, trustpilot_neutral)
ttest_trustpilot_neg_pos = ttest_ind(trustpilot_negative, trustpilot_positive)
ttest_trustpilot_neu_pos = ttest_ind(trustpilot_neutral, trustpilot_positive)

# --- Yelp ---
_yelp_groups = prepare_data_for_tests(yelp_df)
yelp_negative, yelp_neutral, yelp_positive = _yelp_groups
anova_yelp = f_oneway(*_yelp_groups)
ttest_yelp_neg_neu = ttest_ind(yelp_negative, yelp_neutral)
ttest_yelp_neg_pos = ttest_ind(yelp_negative, yelp_positive)
ttest_yelp_neu_pos = ttest_ind(yelp_neutral, yelp_positive)

# Report the ANOVA results, one line per platform
print("ANOVA Results")
for _platform, _anova_result in (
    ("Tripadvisor", anova_tripadvisor),
    ("Trustpilot", anova_trustpilot),
    ("Yelp", anova_yelp),
):
    print(f"{_platform}:", _anova_result)

# Report the pairwise t-tests in platform order
print("\nT-Test Results")
for _comparison, _ttest_result in (
    ("Tripadvisor Neg vs Neu", ttest_tripadvisor_neg_neu),
    ("Tripadvisor Neg vs Pos", ttest_tripadvisor_neg_pos),
    ("Tripadvisor Neu vs Pos", ttest_tripadvisor_neu_pos),
    ("Trustpilot Neg vs Neu", ttest_trustpilot_neg_neu),
    ("Trustpilot Neg vs Pos", ttest_trustpilot_neg_pos),
    ("Trustpilot Neu vs Pos", ttest_trustpilot_neu_pos),
    ("Yelp Neg vs Neu", ttest_yelp_neg_neu),
    ("Yelp Neg vs Pos", ttest_yelp_neg_pos),
    ("Yelp Neu vs Pos", ttest_yelp_neu_pos),
):
    print(f"{_comparison}:", _ttest_result)


In [None]:
pip install scikit-learn pandas

In [None]:
# Machine Learning Algorithms on Each Dataset

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Define a function to apply ML model on a dataset
def apply_ml_model(data, sentiment_col='Sentiment', text_col='Review Text', rating_col='Star Rating'):
    """Train and evaluate a Random Forest that predicts the sentiment label.

    Features are the TF-IDF weights of ``text_col`` (at most 1000 terms,
    English stop words removed) plus the numeric ``rating_col``. The rows are
    split 80/20 with ``random_state=42``; accuracy and a per-class
    classification report for the held-out 20% are printed.

    Args:
        data: DataFrame containing the three columns named below.
        sentiment_col: Column with the target labels.
        text_col: Column with the review text.
        rating_col: Column with the star rating; values that are not numeric
            are replaced by 0.

    Returns:
        None.
    """
    # Build standalone Series instead of assigning into a column slice of
    # `data` (the old `features[rating_col] = ...` raised pandas'
    # SettingWithCopyWarning). astype(str) guards against non-string cells.
    texts = data[text_col].astype(str)
    ratings = pd.to_numeric(data[rating_col], errors='coerce').fillna(0)

    # Encode target labels (sentiments)
    le = LabelEncoder()
    y = le.fit_transform(data[sentiment_col])

    # Split the raw rows BEFORE vectorizing so the TF-IDF vocabulary and IDF
    # weights are learned from the training rows only (no test-set leakage).
    texts_train, texts_test, ratings_train, ratings_test, y_train, y_test = train_test_split(
        texts, ratings, y, test_size=0.2, random_state=42
    )

    tfidf = TfidfVectorizer(max_features=1000, stop_words='english')

    def build_features(text_matrix, rating_values):
        """Join the dense TF-IDF columns with the rating column."""
        frame = pd.concat(
            [pd.DataFrame(text_matrix.toarray()), rating_values.reset_index(drop=True)],
            axis=1,
        )
        frame.columns = frame.columns.astype(str)  # Ensure all column names are strings
        return frame

    X_train = build_features(tfidf.fit_transform(texts_train), ratings_train)
    X_test = build_features(tfidf.transform(texts_test), ratings_test)

    # Initialize the model (Random Forest)
    model = RandomForestClassifier(random_state=42)
    # Train the model
    model.fit(X_train, y_train)
    # Make predictions
    y_pred = model.predict(X_test)
    # Evaluate the model. `labels` lists every encoded class so the report
    # still works when a class happens to be absent from the test split.
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(
        y_test, y_pred, labels=list(range(len(le.classes_))), target_names=le.classes_))

# Assuming tripadvisor_df, trustpilot_df, and yelp_df are defined and loaded
apply_ml_model(tripadvisor_df)
apply_ml_model(trustpilot_df)
apply_ml_model(yelp_df)


In [None]:
# Define a function to apply ML model on a dataset
def apply_ml_model(data, sentiment_col='Sentiment', text_col='Review Text', rating_col='Star Rating'):
    """Train and evaluate a Multinomial Naive Bayes sentiment classifier.

    Features are the TF-IDF weights of ``text_col`` (at most 1000 terms,
    English stop words removed) plus the numeric ``rating_col``. The rows are
    split 80/20 with ``random_state=42``; accuracy and a per-class
    classification report for the held-out 20% are printed.

    Args:
        data: DataFrame containing the three columns named below.
        sentiment_col: Column with the target labels.
        text_col: Column with the review text.
        rating_col: Column with the star rating; values that are not numeric
            are replaced by 0.

    Returns:
        None.
    """
    # Build standalone Series instead of assigning into a column slice of
    # `data` (the old `features[rating_col] = ...` raised pandas'
    # SettingWithCopyWarning). astype(str) guards against non-string cells.
    texts = data[text_col].astype(str)
    ratings = pd.to_numeric(data[rating_col], errors='coerce').fillna(0)

    # Encode target labels (sentiments)
    le = LabelEncoder()
    y = le.fit_transform(data[sentiment_col])

    # Split the raw rows BEFORE vectorizing so the TF-IDF vocabulary and IDF
    # weights are learned from the training rows only (no test-set leakage).
    texts_train, texts_test, ratings_train, ratings_test, y_train, y_test = train_test_split(
        texts, ratings, y, test_size=0.2, random_state=42
    )

    tfidf = TfidfVectorizer(max_features=1000, stop_words='english')

    def build_features(text_matrix, rating_values):
        """Join the dense TF-IDF columns with the rating column."""
        frame = pd.concat(
            [pd.DataFrame(text_matrix.toarray()), rating_values.reset_index(drop=True)],
            axis=1,
        )
        frame.columns = frame.columns.astype(str)  # Ensure all column names are strings
        return frame

    X_train = build_features(tfidf.fit_transform(texts_train), ratings_train)
    X_test = build_features(tfidf.transform(texts_test), ratings_test)

    # Naive Bayes Classifier
    model = MultinomialNB()
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model. `labels` lists every encoded class so the report
    # still works when a class happens to be absent from the test split.
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(
        y_test, y_pred, labels=list(range(len(le.classes_))), target_names=le.classes_))

# Assuming tripadvisor_df, trustpilot_df, and yelp_df are defined and loaded
apply_ml_model(tripadvisor_df)
apply_ml_model(trustpilot_df)
apply_ml_model(yelp_df)






In [None]:
import xgboost as xgb
# Define a function to apply ML model on a dataset
def apply_ml_model(data, sentiment_col='Sentiment', text_col='Review Text', rating_col='Star Rating'):
    """Train and evaluate an XGBoost classifier that predicts the sentiment label.

    Features are the TF-IDF weights of ``text_col`` (at most 1000 terms,
    English stop words removed) plus the numeric ``rating_col``. The rows are
    split 80/20 with ``random_state=42``; accuracy and a per-class
    classification report for the held-out 20% are printed.

    Args:
        data: DataFrame containing the three columns named below.
        sentiment_col: Column with the target labels.
        text_col: Column with the review text.
        rating_col: Column with the star rating; values that are not numeric
            are replaced by 0.

    Returns:
        None.
    """
    # Build standalone Series instead of assigning into a column slice of
    # `data` (the old `features[rating_col] = ...` raised pandas'
    # SettingWithCopyWarning). astype(str) guards against non-string cells.
    texts = data[text_col].astype(str)
    ratings = pd.to_numeric(data[rating_col], errors='coerce').fillna(0)

    # Encode target labels (sentiments)
    le = LabelEncoder()
    y = le.fit_transform(data[sentiment_col])

    # Split the raw rows BEFORE vectorizing so the TF-IDF vocabulary and IDF
    # weights are learned from the training rows only (no test-set leakage).
    texts_train, texts_test, ratings_train, ratings_test, y_train, y_test = train_test_split(
        texts, ratings, y, test_size=0.2, random_state=42
    )

    tfidf = TfidfVectorizer(max_features=1000, stop_words='english')

    def build_features(text_matrix, rating_values):
        """Join the dense TF-IDF columns with the rating column."""
        frame = pd.concat(
            [pd.DataFrame(text_matrix.toarray()), rating_values.reset_index(drop=True)],
            axis=1,
        )
        frame.columns = frame.columns.astype(str)  # Ensure all column names are strings
        return frame

    X_train = build_features(tfidf.fit_transform(texts_train), ratings_train)
    X_test = build_features(tfidf.transform(texts_test), ratings_test)

    # XGBoost Classifier
    # NOTE(review): `use_label_encoder` is only meaningful for older xgboost
    # releases; verify against the installed version.
    model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model. `labels` lists every encoded class so the report
    # still works when a class happens to be absent from the test split.
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(
        y_test, y_pred, labels=list(range(len(le.classes_))), target_names=le.classes_))

# Assuming tripadvisor_df, trustpilot_df, and yelp_df are defined and loaded
apply_ml_model(tripadvisor_df)
apply_ml_model(trustpilot_df)
apply_ml_model(yelp_df)


In [None]:
# Define a function to apply ML model on a dataset
# NOTE(review): this cell repeats the Multinomial Naive Bayes experiment that
# already appears earlier in the notebook; one of the two can be removed.
def apply_ml_model(data, sentiment_col='Sentiment', text_col='Review Text', rating_col='Star Rating'):
    """Train and evaluate a Multinomial Naive Bayes sentiment classifier.

    Features are the TF-IDF weights of ``text_col`` (at most 1000 terms,
    English stop words removed) plus the numeric ``rating_col``. The rows are
    split 80/20 with ``random_state=42``; accuracy and a per-class
    classification report for the held-out 20% are printed.

    Args:
        data: DataFrame containing the three columns named below.
        sentiment_col: Column with the target labels.
        text_col: Column with the review text.
        rating_col: Column with the star rating; values that are not numeric
            are replaced by 0.

    Returns:
        None.
    """
    # Build standalone Series instead of assigning into a column slice of
    # `data` (the old `features[rating_col] = ...` raised pandas'
    # SettingWithCopyWarning). astype(str) guards against non-string cells.
    texts = data[text_col].astype(str)
    ratings = pd.to_numeric(data[rating_col], errors='coerce').fillna(0)

    # Encode target labels (sentiments)
    le = LabelEncoder()
    y = le.fit_transform(data[sentiment_col])

    # Split the raw rows BEFORE vectorizing so the TF-IDF vocabulary and IDF
    # weights are learned from the training rows only (no test-set leakage).
    texts_train, texts_test, ratings_train, ratings_test, y_train, y_test = train_test_split(
        texts, ratings, y, test_size=0.2, random_state=42
    )

    tfidf = TfidfVectorizer(max_features=1000, stop_words='english')

    def build_features(text_matrix, rating_values):
        """Join the dense TF-IDF columns with the rating column."""
        frame = pd.concat(
            [pd.DataFrame(text_matrix.toarray()), rating_values.reset_index(drop=True)],
            axis=1,
        )
        frame.columns = frame.columns.astype(str)  # Ensure all column names are strings
        return frame

    X_train = build_features(tfidf.fit_transform(texts_train), ratings_train)
    X_test = build_features(tfidf.transform(texts_test), ratings_test)

    # Naive Bayes Classifier
    model = MultinomialNB()
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model. `labels` lists every encoded class so the report
    # still works when a class happens to be absent from the test split.
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(
        y_test, y_pred, labels=list(range(len(le.classes_))), target_names=le.classes_))

# Assuming tripadvisor_df, trustpilot_df, and yelp_df are defined and loaded
apply_ml_model(tripadvisor_df)
apply_ml_model(trustpilot_df)
apply_ml_model(yelp_df)






In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Define a function to apply ML model on a dataset
def apply_ml_model(data, sentiment_col='Sentiment', text_col='Review Text', rating_col='Star Rating'):
    """Train and evaluate a 5-nearest-neighbours sentiment classifier.

    Features are the TF-IDF weights of ``text_col`` (at most 1000 terms,
    English stop words removed) plus the numeric ``rating_col``. The rows are
    split 80/20 with ``random_state=42``; accuracy and a per-class
    classification report for the held-out 20% are printed.

    Args:
        data: DataFrame containing the three columns named below.
        sentiment_col: Column with the target labels.
        text_col: Column with the review text.
        rating_col: Column with the star rating; values that are not numeric
            are replaced by 0.

    Returns:
        None.
    """
    # Build standalone Series instead of assigning into a column slice of
    # `data` (the old `features[rating_col] = ...` raised pandas'
    # SettingWithCopyWarning). astype(str) guards against non-string cells.
    texts = data[text_col].astype(str)
    ratings = pd.to_numeric(data[rating_col], errors='coerce').fillna(0)

    # Encode target labels (sentiments)
    le = LabelEncoder()
    y = le.fit_transform(data[sentiment_col])

    # Split the raw rows BEFORE vectorizing so the TF-IDF vocabulary and IDF
    # weights are learned from the training rows only (no test-set leakage).
    texts_train, texts_test, ratings_train, ratings_test, y_train, y_test = train_test_split(
        texts, ratings, y, test_size=0.2, random_state=42
    )

    tfidf = TfidfVectorizer(max_features=1000, stop_words='english')

    def build_features(text_matrix, rating_values):
        """Join the dense TF-IDF columns with the rating column."""
        frame = pd.concat(
            [pd.DataFrame(text_matrix.toarray()), rating_values.reset_index(drop=True)],
            axis=1,
        )
        frame.columns = frame.columns.astype(str)  # Ensure all column names are strings
        return frame

    X_train = build_features(tfidf.fit_transform(texts_train), ratings_train)
    X_test = build_features(tfidf.transform(texts_test), ratings_test)

    # K-nearest-neighbours classifier
    # NOTE(review): the rating column is not rescaled, so it presumably weighs
    # more in the distance than any single TF-IDF column - confirm intended.
    model = KNeighborsClassifier(n_neighbors=5)
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model. `labels` lists every encoded class so the report
    # still works when a class happens to be absent from the test split.
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(
        y_test, y_pred, labels=list(range(len(le.classes_))), target_names=le.classes_))

# Assuming tripadvisor_df, trustpilot_df, and yelp_df are defined and loaded
apply_ml_model(tripadvisor_df)
apply_ml_model(trustpilot_df)
apply_ml_model(yelp_df)






In [None]:
from sklearn.svm import SVC
# Define a function to apply ML model on a dataset
def apply_ml_model(data, sentiment_col='Sentiment', text_col='Review Text', rating_col='Star Rating'):
    """Train and evaluate a support vector classifier for the sentiment label.

    Features are the TF-IDF weights of ``text_col`` (at most 1000 terms,
    English stop words removed) plus the numeric ``rating_col``. The rows are
    split 80/20 with ``random_state=42``; accuracy and a per-class
    classification report for the held-out 20% are printed.

    Args:
        data: DataFrame containing the three columns named below.
        sentiment_col: Column with the target labels.
        text_col: Column with the review text.
        rating_col: Column with the star rating; values that are not numeric
            are replaced by 0.

    Returns:
        None.
    """
    # Build standalone Series instead of assigning into a column slice of
    # `data` (the old `features[rating_col] = ...` raised pandas'
    # SettingWithCopyWarning). astype(str) guards against non-string cells.
    texts = data[text_col].astype(str)
    ratings = pd.to_numeric(data[rating_col], errors='coerce').fillna(0)

    # Encode target labels (sentiments)
    le = LabelEncoder()
    y = le.fit_transform(data[sentiment_col])

    # Split the raw rows BEFORE vectorizing so the TF-IDF vocabulary and IDF
    # weights are learned from the training rows only (no test-set leakage).
    texts_train, texts_test, ratings_train, ratings_test, y_train, y_test = train_test_split(
        texts, ratings, y, test_size=0.2, random_state=42
    )

    tfidf = TfidfVectorizer(max_features=1000, stop_words='english')

    def build_features(text_matrix, rating_values):
        """Join the dense TF-IDF columns with the rating column."""
        frame = pd.concat(
            [pd.DataFrame(text_matrix.toarray()), rating_values.reset_index(drop=True)],
            axis=1,
        )
        frame.columns = frame.columns.astype(str)  # Ensure all column names are strings
        return frame

    X_train = build_features(tfidf.fit_transform(texts_train), ratings_train)
    X_test = build_features(tfidf.transform(texts_test), ratings_test)

    # Support Vector Classifier
    model = SVC(random_state=42)
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model. `labels` lists every encoded class so the report
    # still works when a class happens to be absent from the test split.
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(
        y_test, y_pred, labels=list(range(len(le.classes_))), target_names=le.classes_))

# Assuming tripadvisor_df, trustpilot_df, and yelp_df are defined and loaded
apply_ml_model(tripadvisor_df)
apply_ml_model(trustpilot_df)
apply_ml_model(yelp_df)



In [None]:
# DEEP LEARNING MODEL LSTM

In [None]:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import classification_report

# Prepare the data
def prepare_lstm_data(data, sentiment_col='Sentiment', text_col='Review Text', max_len=100, num_words=10000):
    """Turn review text and sentiment labels into padded sequences for an LSTM.

    Args:
        data: DataFrame with the text and sentiment columns.
        sentiment_col: Column with the target labels.
        text_col: Column with the review text.
        max_len: Length every sequence is padded or truncated to (both at the end).
        num_words: Vocabulary size limit passed to the Keras ``Tokenizer``.

    Returns:
        ``(X_train, X_test, y_train, y_test, tokenizer, classes)`` where the
        ``X_*`` arrays are padded integer sequences, the ``y_*`` arrays are
        integer-encoded labels, ``tokenizer`` is the fitted Tokenizer and
        ``classes`` are the label names in encoded order. The split is 80/20
        with ``random_state=42``.
    """
    # Encode the target labels
    le = LabelEncoder()
    y = le.fit_transform(data[sentiment_col])

    # Split the raw texts first so the tokenizer vocabulary is learned from the
    # training rows only; previously it was fitted on every row, test set
    # included. astype(str) guards against non-string cells.
    texts = data[text_col].astype(str)
    texts_train, texts_test, y_train, y_test = train_test_split(
        texts, y, test_size=0.2, random_state=42
    )

    # Tokenize the text data; words unseen during fitting map to the OOV token
    tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(texts_train)

    def to_padded_sequences(text_values):
        """Convert texts to integer sequences padded/truncated to `max_len`."""
        sequences = tokenizer.texts_to_sequences(text_values)
        return pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

    X_train = to_padded_sequences(texts_train)
    X_test = to_padded_sequences(texts_test)

    return X_train, X_test, y_train, y_test, tokenizer, le.classes_

# Build the stacked-LSTM classifier
def create_lstm_model(input_length, vocab_size, num_classes):
    """Create and compile a two-layer LSTM text classifier.

    Architecture: 128-dimensional embedding, an LSTM(128) returning the full
    sequence, dropout 0.2, a second LSTM(128), dropout 0.2, a 128-unit ReLU
    dense layer and a softmax output with ``num_classes`` units.

    Args:
        input_length: Length of the padded input sequences.
        vocab_size: Number of rows in the embedding table.
        num_classes: Number of output classes.

    Returns:
        The compiled Keras ``Sequential`` model (Adam optimizer, sparse
        categorical cross-entropy loss, accuracy metric).
    """
    network = Sequential()
    network.add(Embedding(vocab_size, 128, input_length=input_length))
    network.add(LSTM(128, return_sequences=True))
    network.add(Dropout(0.2))
    network.add(LSTM(128))
    network.add(Dropout(0.2))
    network.add(Dense(128, activation='relu'))
    network.add(Dense(num_classes, activation='softmax'))

    # Sparse loss: the targets are integer class ids, not one-hot vectors
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Function to train and evaluate LSTM on a dataset
def train_evaluate_lstm(data, dataset_name):
    """Train the stacked LSTM on one review dataset and print its test metrics.

    Parameters
    ----------
    data : DataFrame
        Passed straight to ``prepare_lstm_data`` (default column names).
    dataset_name : str
        Label printed in the section header.

    Prints a header, the Keras training log, the test accuracy and a
    scikit-learn classification report. Returns ``None``.
    """
    print(f"### {dataset_name} ###")
    X_train, X_test, y_train, y_test, tokenizer, classes = prepare_lstm_data(data)

    # Get the input length and vocab size from the tokenizer
    input_length = X_train.shape[1]
    vocab_size = len(tokenizer.word_index) + 1
    # word_index holds every word seen, but a tokenizer with num_words set only
    # emits ids below num_words, so a larger embedding table is dead weight.
    num_words = getattr(tokenizer, "num_words", None)
    if num_words:
        vocab_size = min(vocab_size, num_words)

    # Create the LSTM model
    lstm_model = create_lstm_model(input_length, vocab_size, len(classes))

    # Train the model (the test split doubles as the per-epoch validation set)
    lstm_model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

    # Evaluate the model
    loss, accuracy = lstm_model.evaluate(X_test, y_test)
    print(f'Accuracy: {accuracy*100:.2f}%')

    # Predict and get a classification report
    y_pred = lstm_model.predict(X_test)
    y_pred_classes = y_pred.argmax(axis=1)
    # Pin labels to every encoded class: without it the report raises a
    # ValueError when a class is missing from both y_test and the predictions,
    # because target_names would then be longer than the labels it finds.
    print(classification_report(
        y_test,
        y_pred_classes,
        labels=list(range(len(classes))),
        target_names=classes,
    ))
    print("\n")

# Train and score one LSTM per review source, in the same order as before.
lstm_runs = (
    (tripadvisor_df, "Tripadvisor"),
    (trustpilot_df, "Trustpilot"),
    (yelp_df, "Yelp"),
)
for source_frame, source_label in lstm_runs:
    train_evaluate_lstm(source_frame, source_label)


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder

# Prepare the data for one of the models
def prepare_data_for_models(data, sentiment_col='Sentiment', text_col='Review Text', max_len=100, num_words=10000):
    """Return the padded-sequence train/test split used by the confusion-matrix helpers.

    This is the same preparation as ``prepare_lstm_data`` (label encoding,
    tokenizing, end-padding to ``max_len``, 80/20 split), so it delegates to
    that function instead of keeping a second copy of the pipeline, and simply
    drops the tokenizer from the result.

    Parameters
    ----------
    data : DataFrame
        Must provide ``sentiment_col`` and ``text_col``.
    sentiment_col, text_col : str
        Label and text column names.
    max_len : int
        Padded sequence length.
    num_words : int
        Vocabulary cap for the tokenizer.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, classes)``.
    """
    X_train, X_test, y_train, y_test, _tokenizer, classes = prepare_lstm_data(
        data,
        sentiment_col=sentiment_col,
        text_col=text_col,
        max_len=max_len,
        num_words=num_words,
    )
    return X_train, X_test, y_train, y_test, classes

# Define a function to train a model and generate a confusion matrix
def train_model_and_confusion_matrix(model, X_train, X_test, y_train, y_test, class_names, model_name, dataset_name):
    """Fit ``model``, predict on the test split and plot the confusion matrix.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, X_test : training / test features.
    y_train, y_test : training / test targets. Assumed to be the integer codes
        ``0 .. len(class_names) - 1`` (as produced by ``LabelEncoder``), which
        is what the caller in this notebook passes.
    class_names : sequence of display names, one per class, used as tick labels.
    model_name, dataset_name : str
        Used in the plot title.

    Shows the figure; returns ``None``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Generate confusion matrix. Labels are pinned so the matrix always has one
    # row/column per entry of class_names; otherwise a class missing from both
    # y_test and y_pred shrinks the matrix and the tick labels no longer line up.
    label_ids = list(range(len(class_names)))
    cm = confusion_matrix(y_test, y_pred, labels=label_ids)

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title(f'Confusion Matrix: {model_name} on {dataset_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

# Apply this function to each model on each dataset

# For LSTM, you would need to use the trained model's predict method and then round or argmax the output for the confusion matrix
def train_lstm_and_confusion_matrix(X_train, X_test, y_train, y_test, class_names, dataset_name):
    """Train the stacked LSTM on padded id sequences and plot its confusion matrix.

    Parameters
    ----------
    X_train, X_test : 2-D integer arrays of padded token ids (need ``.shape``
        and ``.max()``), e.g. the output of ``prepare_data_for_models``.
    y_train, y_test : integer-encoded targets, assumed to be
        ``0 .. len(class_names) - 1``.
    class_names : sequence of display names, one per class.
    dataset_name : str
        Used in the plot title.

    Shows the figure; returns ``None``.
    """
    # Size the embedding from the data itself (largest id in either split + 1).
    # The previous version read a module-level `tokenizer`, which is not among
    # this function's inputs and is only ever a local variable in the data
    # preparation helpers, so it failed or silently used a stale tokenizer.
    vocab_size = int(max(X_train.max(), X_test.max())) + 1
    input_length = X_train.shape[1]

    # Same architecture as the LSTM section; built by the shared factory so the
    # two cannot drift apart.
    lstm_model = create_lstm_model(input_length, vocab_size, len(class_names))
    lstm_model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

    # Softmax rows -> predicted class id
    y_pred = lstm_model.predict(X_test).argmax(axis=1)

    # Generate confusion matrix with one row/column per class name, even when a
    # class is absent from both y_test and the predictions.
    label_ids = list(range(len(class_names)))
    cm = confusion_matrix(y_test, y_pred, labels=label_ids)

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title(f'Confusion Matrix: LSTM on {dataset_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

# Apply the models on a dataset, e.g., Tripadvisor
# Imported in this cell so it does not rely on an import made by some other cell.
from sklearn.neighbors import KNeighborsClassifier

X_train, X_test, y_train, y_test, class_names = prepare_data_for_models(tripadvisor_df)

# Train and evaluate different models
# NOTE(review): X_train / X_test are padded token-id sequences, not word counts,
# so these classical models treat arbitrary word ids as numeric features.
# Confirm this is intended; a bag-of-words / TF-IDF matrix is the usual input.
classical_models = [
    (RandomForestClassifier(random_state=42), "Random Forest"),
    (SVC(random_state=42), "SVM"),
    (KNeighborsClassifier(n_neighbors=5), "KNeighboursClassifier"),
    (MultinomialNB(), "Naive Bayes Classifier"),
]
for clf, clf_name in classical_models:
    train_model_and_confusion_matrix(clf, X_train, X_test, y_train, y_test, class_names, clf_name, "Tripadvisor")

train_lstm_and_confusion_matrix(X_train, X_test, y_train, y_test, class_names, "Tripadvisor")
