In [1]:
pip install --upgrade nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
#for detecting languages uses in review
!pip install langdetect



In [3]:
#for handling tabular dataset
import pandas as pd

In [4]:
#for linear algebra and numerical array operations
import numpy as np

In [5]:
#for detecting languages
from langdetect import detect

In [6]:
#to remove punctuations, import regular expressions (re)
import re

In [7]:
#for plotting and visualisation
import matplotlib.pyplot as plt

In [8]:
#for cleaning web-scraped data, i.e. removing HTML tags such as <br>
from bs4 import BeautifulSoup

In [9]:
#for handling natural languages, import the natural language tool kit
import nltk

In [9]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,f1_score,ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

In [11]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

In [12]:
#set working directory
import os

In [13]:
# Get the present working directory of the kernel (displayed in the next cell)
pwd = os.getcwd()

In [14]:
pwd

"C:\\Users\\HP\\Documents\\Master's Project Thaier Hameed\\OUTCOME"

In [15]:
# Set the project directory that reviews.csv is read from.
# This is an absolute local path; adjust it when running on another machine.
new_directory = "C:/Users/HP/Documents/Master's Project Thaier Hameed"

In [16]:
# Change the working directory to the project directory so that files written
# with a relative name are saved there
os.chdir(new_directory)

In [17]:
# Refresh pwd after the chdir above; later cells build their file paths from it
pwd = os.getcwd()

In [19]:
# Load the raw reviews from the project directory
data = pd.read_csv(f"{pwd}/reviews.csv")

In [20]:
data.head()

Unnamed: 0,listing_id,id,id2,date,reviewer_id,reviewer_name,comments
0,157612.0,919313.0,0,13/02/2012,1378688,Kristin,Margaret and her husband were the perfect host...
1,157612.0,922493.0,1,14/02/2012,1724861,Katy,"Margaret and Tom are warm, welcoming and incre..."
2,157612.0,1244776.0,2,07/05/2012,2284316,Ian,"The place was great, and the photographs give ..."
3,157612.0,1486412.0,3,15/06/2012,1440146,Tim,"Super Place, Margaret and Tom are Lovely peopl..."
4,157612.0,1538944.0,4,22/06/2012,2640396,Sherry,Margaret was such a great host and was extreme...


In [21]:
# Keep only listing_id and comments; the id, date and reviewer columns are not used
data.drop(columns=['id', 'id2', 'date', 'reviewer_id', 'reviewer_name'], inplace=True)

In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157064 entries, 0 to 157063
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   listing_id  157064 non-null  float64
 1   comments    157044 non-null  object 
dtypes: float64(1), object(1)
memory usage: 2.4+ MB


In [23]:
data.head()

Unnamed: 0,listing_id,comments
0,157612.0,Margaret and her husband were the perfect host...
1,157612.0,"Margaret and Tom are warm, welcoming and incre..."
2,157612.0,"The place was great, and the photographs give ..."
3,157612.0,"Super Place, Margaret and Tom are Lovely peopl..."
4,157612.0,Margaret was such a great host and was extreme...


In [24]:
# Count the missing values (NAs) in each column
data.isna().sum()

listing_id     0
comments      20
dtype: int64

REVIEW ANALYSIS

In [25]:
# Remove every row that has a missing value (the reviews without comment text)
data.dropna(axis=0, how='any', inplace=True)

In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 157044 entries, 0 to 157063
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   listing_id  157044 non-null  float64
 1   comments    157044 non-null  object 
dtypes: float64(1), object(1)
memory usage: 3.6+ MB


In [27]:
#Inspection of the data showed that some comments/reviews are not written in English.
#Find and keep only the English reviews

In [28]:
def is_english(text):
    """Return True when langdetect classifies ``text`` as English.

    Parameters
    ----------
    text : object
        Review text to classify. Values that are not a non-blank string
        (for example NaN or an empty comment) are reported as not English.

    Returns
    -------
    bool
        True if ``detect(text)`` returns ``'en'``; False when another language
        is detected or when detection fails.
    """
    # Nothing to detect in missing or blank values.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except Exception:
        # A detection failure counts as "not English". Catching Exception
        # instead of using a bare ``except:`` lets KeyboardInterrupt and
        # SystemExit through, so a long run over all reviews can be stopped.
        # NOTE(review): langdetect results may vary between runs unless its
        # detector seed is fixed - confirm against the langdetect README.
        return False

def filter_non_english_rows(dataframe, comments):
    """Return the rows of ``dataframe`` whose text column is detected as English.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the reviews. It is not modified.
    comments : str
        Name of the column that contains the review text. The column is now
        looked up by this name instead of the hard-coded ``'comments'``.

    Returns
    -------
    pandas.DataFrame
        The subset of rows for which ``is_english`` returned True, with the
        original index labels and all columns kept.
    """
    # astype(bool) keeps the mask boolean even for an empty frame, where apply
    # yields an object-dtype Series that would not act as a row filter.
    english_mask = dataframe[comments].apply(is_english).astype(bool)
    return dataframe[english_mask]

# Example usage
if __name__ == "__main__":

    # Keep only the rows whose 'comments' text is classified as English
    eng_comments = filter_non_english_rows(dataframe=data, comments='comments')

    # Print the filtered frame
    print(eng_comments)


          listing_id                                           comments
0       1.576120e+05  Margaret and her husband were the perfect host...
1       1.576120e+05  Margaret and Tom are warm, welcoming and incre...
2       1.576120e+05  The place was great, and the photographs give ...
3       1.576120e+05  Super Place, Margaret and Tom are Lovely peopl...
4       1.576120e+05  Margaret was such a great host and was extreme...
...              ...                                                ...
157059  8.440000e+17   Lovely place to stay. Would recommend to anyone.
157060  8.490000e+17  I want to thank Uche for having such a wonderf...
157061  8.440000e+17       Great stay ! <br/>Great city <br/>Great host
157062  8.510000e+17  I had an amazing stay in Ayman's place. The si...
157063  8.510000e+17  My stay was super comfortable. The Hosts were ...

[146824 rows x 2 columns]


In [45]:
# Saves the English-only reviews as Engreviews.csv, the file the preprocessing
# section below reads back in. Currently commented out (presumably so the file
# is not rewritten on every run); Engreviews.csv must already exist for the
# later read to work.
#eng_comments.to_csv('Engreviews.csv')

DATA CLEANING/PREPROCESSING PROCEDURE


. Remove HTML Tags

. Remove punctuation and numeric values

. Convert texts to lower case

. Split words to list

. Remove stopwords

. Perform Lemmatization

In [46]:
#Install stopwords
#Wordnet holds a lemmatizer class

In [47]:
# Reload the saved English-only reviews; this rebinds `data` to the new frame
data = pd.read_csv(f"{pwd}/Engreviews.csv")

In [48]:
data.head()

Unnamed: 0.1,Unnamed: 0,listing_id,comments
0,0,157612.0,Margaret and her husband were the perfect host...
1,1,157612.0,"Margaret and Tom are warm, welcoming and incre..."
2,2,157612.0,"The place was great, and the photographs give ..."
3,3,157612.0,"Super Place, Margaret and Tom are Lovely peopl..."
4,4,157612.0,Margaret was such a great host and was extreme...


In [49]:
# Download the NLTK resources used by review_transform below: the stop-word
# lists and the WordNet data for WordNetLemmatizer, plus the omw-1.4 package.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [50]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

In [51]:
#APPLY the process to the entire comment

USING VADER (Valence Aware Dictionary and sEntiment Reasoner) - Bag of words approach

In [53]:
# Download the VADER lexicon for the sentiment scoring done below
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [54]:
# Cache for the stop-word set and the lemmatizer so they are built once
# instead of once per review.
_REVIEW_RESOURCES = {}


def _review_resources():
    """Return the cached (stop_words, lemmatizer) pair, creating it on first use."""
    if not _REVIEW_RESOURCES:
        _REVIEW_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
        _REVIEW_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _REVIEW_RESOURCES["stop_words"], _REVIEW_RESOURCES["lemmatizer"]


def review_transform(review):
    """Clean one review and return it as a single space-separated string.

    Steps: strip HTML markup, replace every non-letter character with a space,
    lower-case, split into words, drop English stop words and lemmatize the
    remaining words.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML such as ``<br/>``.
        ``None`` and NaN are treated as an empty review.

    Returns
    -------
    str
        The cleaned words joined by single spaces ('' when nothing remains).
    """
    # Missing values (None / NaN) have no text to clean.
    if review is None or (isinstance(review, float) and review != review):
        return ""

    stop_words, lemmatizer = _review_resources()

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed, so results can differ between machines, and it
    # emits a warning about the guessed parser.
    text = BeautifulSoup(review, "html.parser").get_text()
    text = re.sub("[^a-zA-Z]", " ", text)
    words = text.lower().split()

    # NOTE(review): the stop-word filter presumably also removes negations
    # such as "not", which matters for sentiment scoring - confirm this is intended.
    return " ".join(lemmatizer.lemmatize(w) for w in words if w not in stop_words)

In [55]:
# Clean every review with review_transform and keep the result in a new
# 'transformed_comments' column next to the raw text
data['transformed_comments'] = data['comments'].map(review_transform)



In [56]:
# Sentiment scorer from nltk.sentiment; its polarity_scores method is called
# for every review in the loop below
sia = SentimentIntensityAnalyzer()

In [57]:
# Empty accumulators, one list per sentiment score type
compound_scores, neg_scores, neu_scores, pos_scores = [], [], [], []

In [58]:
# Iterate over the transformed comments and calculate the sentiment scores.
# The accumulators are (re)created here so that re-running this cell starts
# from empty lists; otherwise a second run would append another copy of the
# scores, and the lists would no longer match the number of rows in `data`.
compound_scores, neg_scores, neu_scores, pos_scores = [], [], [], []
for text in tqdm(data['transformed_comments']):
    sentiment_scores = sia.polarity_scores(text)
    compound_scores.append(sentiment_scores['compound'])
    neg_scores.append(sentiment_scores['neg'])
    neu_scores.append(sentiment_scores['neu'])
    pos_scores.append(sentiment_scores['pos'])

  0%|          | 0/146824 [00:00<?, ?it/s]

In [59]:
# Attach each list of sentiment scores to the dataset as its own column
for score_column, score_values in (
    ('compound_score', compound_scores),
    ('neg_score', neg_scores),
    ('neu_score', neu_scores),
    ('pos_score', pos_scores),
):
    data[score_column] = score_values

In [60]:
# The cleaned text was only needed for scoring; remove that helper column
data.drop(columns=['transformed_comments'], inplace=True)

In [61]:
print(data)

        Unnamed: 0    listing_id  \
0                0  1.576120e+05   
1                1  1.576120e+05   
2                2  1.576120e+05   
3                3  1.576120e+05   
4                4  1.576120e+05   
...            ...           ...   
146819      157059  8.440000e+17   
146820      157060  8.490000e+17   
146821      157061  8.440000e+17   
146822      157062  8.510000e+17   
146823      157063  8.510000e+17   

                                                 comments  compound_score  \
0       Margaret and her husband were the perfect host...          0.9873   
1       Margaret and Tom are warm, welcoming and incre...          0.9348   
2       The place was great, and the photographs give ...          0.8777   
3       Super Place, Margaret and Tom are Lovely peopl...          0.8885   
4       Margaret was such a great host and was extreme...          0.9789   
...                                                   ...             ...   
146819   Lovely place to sta

In [63]:
# Saves the reviews together with their sentiment scores (currently commented out).
# NOTE(review): this would write to the current working directory, while the next
# cell reads the file from pwd + "/OUTCOME/" - confirm where the saved copy belongs.
#data.to_csv('Engreviewdatawithsentiment.csv')

In [20]:
# Load the saved reviews with their sentiment scores from the OUTCOME folder
avgcenti = pd.read_csv(f"{pwd}/OUTCOME/Engreviewdatawithsentiment.csv")

In [21]:
avgcenti.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,listing_id,comments,compound_score,neg_score,neu_score,pos_score
0,0,0,157612.0,Margaret and her husband were the perfect host...,0.9873,0.0,0.386,0.614
1,1,1,157612.0,"Margaret and Tom are warm, welcoming and incre...",0.9348,0.0,0.497,0.503
2,2,2,157612.0,"The place was great, and the photographs give ...",0.8777,0.0,0.373,0.627
3,3,3,157612.0,"Super Place, Margaret and Tom are Lovely peopl...",0.8885,0.0,0.462,0.538
4,4,4,157612.0,Margaret was such a great host and was extreme...,0.9789,0.0,0.465,0.535


In [23]:
# Group by listing_id and take the mean of every numeric column.
# numeric_only=True skips the text column `comments`: pandas >= 2.0 raises a
# TypeError when asked to average strings, while older versions silently
# dropped such columns, so the result is the same on both.
df = avgcenti.groupby('listing_id').mean(numeric_only=True).reset_index()

df.head()

Unnamed: 0.2,listing_id,Unnamed: 0.1,Unnamed: 0,compound_score,neg_score,neu_score,pos_score
0,157612.0,76.456897,78.887931,0.897447,0.016603,0.475802,0.507569
1,283495.0,177.5,185.5,0.96678,0.0029,0.4286,0.5684
2,299194.0,518.065147,538.136808,0.878289,0.017792,0.478404,0.503801
3,310742.0,1002.262295,1052.606557,0.832784,0.015984,0.476672,0.507311
4,411843.0,1362.338028,1443.737089,0.868647,0.014103,0.516042,0.469854


In [26]:
# Average compound_score per listing; as_index=False keeps listing_id as a
# regular column of the result
avg_compound_score = avgcenti.groupby('listing_id', as_index=False)['compound_score'].mean()

# Show the first rows of the result
avg_compound_score.head()

Unnamed: 0,listing_id,compound_score
0,157612.0,0.897447
1,283495.0,0.96678
2,299194.0,0.878289
3,310742.0,0.832784
4,411843.0,0.868647


In [27]:
# Save the per-listing average compound scores to avg_compound_score.csv
# (relative path, so the file lands in the current working directory)
avg_compound_score.to_csv('avg_compound_score.csv')

MERGE LISTING AND AVERAGE COMPOUND SCORE

In [32]:
# Load the listing details that the average compound scores are merged into.
# NOTE(review): the name `list` shadows Python's built-in list type for the rest
# of the session; the merge cell below refers to this name, so a rename has to
# be done in both places at once.
list = pd.read_csv("C:/Users/HP/Documents/Master's Project Thaier Hameed/OUTCOME/listings_complete_details.csv")

In [33]:
# Inner join on 'listing_id': only listings that have an average compound
# score are kept
merged_list_avg_compound = list.merge(avg_compound_score, how='inner', on='listing_id')

# Show the first rows of the merged DataFrame
merged_list_avg_compound.head()

Unnamed: 0,listing_id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,compound_score
0,157612.0,https://www.airbnb.com/rooms/157612,20200000000000.0,27/03/2023,city scrape,New attic space/single & Dble room,"The loft space is a small but cosy, private an...",There is a public park within easy walking dis...,https://a0.muscache.com/pictures/18150718/745a...,757016,...,4.66,4.92,,f,1,1,0,0,0.91,0.897447
1,1241309.0,https://www.airbnb.com/rooms/1241309,20200000000000.0,27/03/2023,city scrape,★Glass Roof★Walkable★Full Kitch★Office★Deck★Ga...,✔︎Walk Score 75 (most errands can be accomplis...,You will find my personal recommendations in m...,https://a0.muscache.com/pictures/miso/Hosting-...,6766640,...,4.0,4.0,,f,3,3,0,0,0.06,0.8779
2,4468181.0,https://www.airbnb.com/rooms/4468181,20200000000000.0,27/03/2023,city scrape,Double Room in shared apartment,"Close to Motorway links, free on street public...",,https://a0.muscache.com/pictures/5284eef5-392d...,23182584,...,3.67,5.0,,f,1,0,1,0,0.05,0.764475
3,283495.0,https://www.airbnb.com/rooms/283495,20200000000000.0,27/03/2023,city scrape,En-suite room in detached house,<b>The space</b><br />Double bedroom with King...,The suburbaness of it all but 2 minutes from t...,https://a0.muscache.com/pictures/78775473/2d8f...,1476718,...,4.8,5.0,,f,1,0,1,0,0.1,0.96678
4,1329093.0,https://www.airbnb.com/rooms/1329093,20200000000000.0,27/03/2023,city scrape,Private B&B Double Room 2,<b>The space</b><br />Private Guest Rooms suit...,,https://a0.muscache.com/pictures/a80681fd-9e4d...,219489200,...,4.81,4.61,,f,1,0,1,0,3.54,0.74523


In [34]:
# Save the listings together with their average compound score to
# listwithAvgRating.csv (relative path, written to the current working directory)
merged_list_avg_compound.to_csv('listwithAvgRating.csv')