Install libraries

In [2]:
!pip install deep_translator
!pip install textblob
!pip install langdetect
!pip install vaderSentiment

Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep_translator
Successfully installed deep_translator-1.11.4
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993222 sha256=c37c9703a04fe1baf144b2f55481582207035deefd23a852c29f9

Import libraries

In [3]:
import pandas as pd
from deep_translator import GoogleTranslator
from textblob import TextBlob

Import data

In [4]:
csv_file = '/content/filtered_reviews.csv'

# Set up an empty list to hold the filtered DataFrame chunks
filtered_chunks = []

# Read the CSV file in chunks and filter rows where 'date' >= '2023-12-01'
# Only keep relevant fields: date, listing_id, reviewer_id, comments
for chunk in pd.read_csv(csv_file, chunksize=1000, parse_dates=['date'], usecols=['date', 'listing_id', 'reviewer_id', 'comments']):
    filtered_chunk = chunk[chunk['date'] >= '2023-12-01']
    filtered_chunks.append(filtered_chunk)

# Combine the filtered chunks into a single DataFrame
reviews_df = pd.concat(filtered_chunks, ignore_index=True)


In [5]:
reviews_df

Unnamed: 0,listing_id,date,reviewer_id,comments
0,1992333,2023-12-01,37012494,What a place to stay! The view is breathtaking...
1,1992333,2023-12-09,76872138,"Search no further, this is the place to stay. ..."
2,3191,2023-12-23,9927643,This was my second visit and it was as good as...
3,15007,2023-12-03,243886263,Thank you Dirk and your wife for wonderful wee...
4,897063,2023-12-01,398675800,"Great place, great location and a great host."
...,...,...,...,...
11969,1045539636083237263,2023-12-27,8112629,We were extremely comfortable during this visi...
11970,1045595447074880765,2023-12-17,313105614,The apartment is brand new and definitely felt...
11971,1048683134427485052,2023-12-19,41455737,I had an amazing stay in this very stylish apa...
11972,1049033266515220845,2023-12-25,26831109,Anna's Camps Bay Retreat: ⭐⭐⭐⭐⭐<br/>Annas's ap...


Clean data

In [6]:
# Check column types
reviews_df.dtypes

Unnamed: 0,0
listing_id,int64
date,datetime64[ns]
reviewer_id,int64
comments,object


In [7]:
# Standardize data formats
reviews_df.convert_dtypes

In [8]:
# Check for duplicates
reviews_df.duplicated().sum()

0

In [9]:
# Remove duplicates
reviews_df.drop_duplicates()

Unnamed: 0,listing_id,date,reviewer_id,comments
0,1992333,2023-12-01,37012494,What a place to stay! The view is breathtaking...
1,1992333,2023-12-09,76872138,"Search no further, this is the place to stay. ..."
2,3191,2023-12-23,9927643,This was my second visit and it was as good as...
3,15007,2023-12-03,243886263,Thank you Dirk and your wife for wonderful wee...
4,897063,2023-12-01,398675800,"Great place, great location and a great host."
...,...,...,...,...
11969,1045539636083237263,2023-12-27,8112629,We were extremely comfortable during this visi...
11970,1045595447074880765,2023-12-17,313105614,The apartment is brand new and definitely felt...
11971,1048683134427485052,2023-12-19,41455737,I had an amazing stay in this very stylish apa...
11972,1049033266515220845,2023-12-25,26831109,Anna's Camps Bay Retreat: ⭐⭐⭐⭐⭐<br/>Annas's ap...


In [10]:
# Detect and remove non-numerical rows in the 'reviewer_id' column
reviews_df['reviewer_id'] = pd.to_numeric(reviews_df['reviewer_id'], errors='coerce')  # Converts non-numerical values to NaN
reviews_df = reviews_df.dropna(subset=['reviewer_id'])  # Drops rows where 'id_numeric' is NaN


Transform data

In [None]:
# Translate non-English comments to English

column_to_translate = 'comments'

from langdetect import detect 

def translate_text(text):
    """
    Translate the given text to English if it's not already in English.

    Parameters:
    - text: str or None, the text to translate.

    Returns:
    - Translated text if input is not None and not in English, else the original text.
    """
    if pd.notnull(text):
        try:
            # Detect the language
            language = detect(text)
            if language != 'en':  # Only translate if the detected language is not English
                return GoogleTranslator(source='auto', target='en').translate(text)
            return text  # Return the original text if it's already in English
        except Exception as e:
            print(f"Error translating text '{text}': {e}")
            return text  # Return the original text if translation fails
    return text  # Return None if input is None

reviews_df['translated_comments'] = reviews_df[column_to_translate].apply(translate_text)


Error translating text '.': No features in text.
Error translating text '.': No features in text.
Error translating text '⭐️⭐️⭐️⭐️⭐️': No features in text.
Error translating text '⭐️⭐️⭐️⭐️⭐️': No features in text.
Error translating text '10/10': No features in text.
Error translating text '👌': No features in text.
Error translating text '-': No features in text.
Error translating text '👌🏽': No features in text.
Error translating text '.': No features in text.
Error translating text '.': No features in text.
Error translating text '🫶': No features in text.
Error translating text '.': No features in text.
Error translating text '👍👍': No features in text.
Error translating text '👍👍': No features in text.
Error translating text '10/10': No features in text.
Error translating text '.': No features in text.
Error translating text '👍🏼👍🏼👍🏼': No features in text.
Error translating text '-': No features in text.
Error translating text '.': No features in text.
Error translating text '👍': No feat

In [12]:
reviews_df

Unnamed: 0,listing_id,date,reviewer_id,comments,translated_comments
0,1992333,2023-12-01,37012494,What a place to stay! The view is breathtaking...,What a place to stay! The view is breathtaking...
1,1992333,2023-12-09,76872138,"Search no further, this is the place to stay. ...","Search no further, this is the place to stay. ..."
2,3191,2023-12-23,9927643,This was my second visit and it was as good as...,This was my second visit and it was as good as...
3,15007,2023-12-03,243886263,Thank you Dirk and your wife for wonderful wee...,Thank you Dirk and your wife for wonderful wee...
4,897063,2023-12-01,398675800,"Great place, great location and a great host.","Great place, great location and a great host."
...,...,...,...,...,...
11969,1045539636083237263,2023-12-27,8112629,We were extremely comfortable during this visi...,We were extremely comfortable during this visi...
11970,1045595447074880765,2023-12-17,313105614,The apartment is brand new and definitely felt...,The apartment is brand new and definitely felt...
11971,1048683134427485052,2023-12-19,41455737,I had an amazing stay in this very stylish apa...,I had an amazing stay in this very stylish apa...
11972,1049033266515220845,2023-12-25,26831109,Anna's Camps Bay Retreat: ⭐⭐⭐⭐⭐<br/>Annas's ap...,Anna's Camps Bay Retreat: ⭐⭐⭐⭐⭐<br/>Annas's ap...


In [13]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

"""
Uses VADER sentiment analysis to analyze comments and returns whether the review is 'good', 'neutral', or 'bad'.

Args:
    row (int): The row index number
    comment (string): The comment to analyze

Returns:
    string: 'good', 'neutral', or 'bad'
"""
def sentiment(row, comment):
    # Check if the comment is null or empty
    if pd.isnull(comment) or comment.strip() == "":
        return None

    print(f"Processing index {row}")
    try:
        # Analyze sentiment using VADER
        sentiment_score = analyzer.polarity_scores(comment)['compound']

        # Classify sentiment as 'good', 'neutral', or 'bad'
        if sentiment_score > 0.05:
            return 'good'
        elif sentiment_score < -0.05:
            return 'bad'
        else:
            return 'neutral'
    except Exception as e:
        print(f"Error at index {row}: {e}")
        return None  # or another appropriate fallback value

# Apply the function to the 'comments' column
reviews_df['sentiment'] = reviews_df.apply(lambda row: sentiment(row.name, row['translated_comments']), axis=1)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processing index 6973
Processing index 6974
Processing index 6975
Processing index 6976
Processing index 6977
Processing index 6978
Processing index 6979
Processing index 6980
Processing index 6981
Processing index 6982
Processing index 6983
Processing index 6984
Processing index 6985
Processing index 6986
Processing index 6987
Processing index 6988
Processing index 6989
Processing index 6990
Processing index 6991
Processing index 6992
Processing index 6993
Processing index 6994
Processing index 6995
Processing index 6996
Processing index 6997
Processing index 6998
Processing index 6999
Processing index 7000
Processing index 7001
Processing index 7002
Processing index 7003
Processing index 7004
Processing index 7005
Processing index 7006
Processing index 7007
Processing index 7008
Processing index 7009
Processing index 7010
Processing index 7011
Processing index 7012
Processing index 7013
Processing index 7014
Processing 

In [14]:
reviews_df

Unnamed: 0,listing_id,date,reviewer_id,comments,translated_comments,sentiment
0,1992333,2023-12-01,37012494,What a place to stay! The view is breathtaking...,What a place to stay! The view is breathtaking...,good
1,1992333,2023-12-09,76872138,"Search no further, this is the place to stay. ...","Search no further, this is the place to stay. ...",good
2,3191,2023-12-23,9927643,This was my second visit and it was as good as...,This was my second visit and it was as good as...,good
3,15007,2023-12-03,243886263,Thank you Dirk and your wife for wonderful wee...,Thank you Dirk and your wife for wonderful wee...,good
4,897063,2023-12-01,398675800,"Great place, great location and a great host.","Great place, great location and a great host.",good
...,...,...,...,...,...,...
11969,1045539636083237263,2023-12-27,8112629,We were extremely comfortable during this visi...,We were extremely comfortable during this visi...,good
11970,1045595447074880765,2023-12-17,313105614,The apartment is brand new and definitely felt...,The apartment is brand new and definitely felt...,good
11971,1048683134427485052,2023-12-19,41455737,I had an amazing stay in this very stylish apa...,I had an amazing stay in this very stylish apa...,good
11972,1049033266515220845,2023-12-25,26831109,Anna's Camps Bay Retreat: ⭐⭐⭐⭐⭐<br/>Annas's ap...,Anna's Camps Bay Retreat: ⭐⭐⭐⭐⭐<br/>Annas's ap...,good


Export to CSV

In [16]:
reviews_df.to_csv('Reviews.csv')