In [13]:
import pandas as pd
import nltk
from textblob import TextBlob

In [2]:
file = "reviews.csv"

reviews = pd.read_csv(file).dropna()
#reviews = reviews.iloc[:1000, :]
print(len(reviews))
reviews.head()

1031571


Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2595,17857,21/11/2009,50679,Jean,Notre séjour de trois nuits.\r\nNous avons app...
1,2595,19176,05/12/2009,53267,Cate,Great experience.
2,2595,19760,10/12/2009,38960,Anita,I've stayed with my friend at the Midtown Cast...
3,2595,34320,09/04/2010,71130,Kai-Uwe,"We've been staying here for about 9 nights, en..."
4,2595,46312,25/05/2010,117113,Alicia,We had a wonderful stay at Jennifer's charming...


In [15]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AVSMo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords
import re
from nltk.stem.porter import PorterStemmer

In [4]:
reviews = reviews.reset_index()
reviews = reviews.drop(labels = "index", axis = 1)

## Cleaning text corpus (punctuation removal, lower text and stemming)

In [17]:
%%time

corpus = []
ps = PorterStemmer()
for i in range (0,reviews.shape[0]):
    sent = re.sub("[^A-Za-z]", " ", reviews.comments[i])
    sent = sent.lower().split()
    sent = [ps.stem(word) for word in sent if word not in set(stopwords.words("english"))]
    sent = " ".join(sent)
    corpus.append(sent)
    print(i, end = '\r')

Wall time: 3h 10min 10s


In [21]:
pip install -U textblob

Collecting textblob
  Downloading textblob-0.15.3-py2.py3-none-any.whl (636 kB)
Installing collected packages: textblob
Successfully installed textblob-0.15.3
Note: you may need to restart the kernel to use updated packages.


## Applying textblob to text corpus

In [18]:
%%time
reviews["sentiments"] = [TextBlob(row).sentiment[0] for row in corpus]

Wall time: 3min 52s


In [19]:
reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,sentiments
0,2595,17857,21/11/2009,50679,Jean,Notre séjour de trois nuits.\r\nNous avons app...,0.0
1,2595,19176,05/12/2009,53267,Cate,Great experience.,0.8
2,2595,19760,10/12/2009,38960,Anita,I've stayed with my friend at the Midtown Cast...,0.437374
3,2595,34320,09/04/2010,71130,Kai-Uwe,"We've been staying here for about 9 nights, en...",0.25
4,2595,46312,25/05/2010,117113,Alicia,We had a wonderful stay at Jennifer's charming...,0.0


## Creating bins for positive, negative and neutral sentiments

In [20]:
reviews["sentiments_1"] = pd.cut(reviews["sentiments"], bins = [-1.0, -0.1, 0.1, 1.0], include_lowest = True, labels = ["Negative", "Neutral", "Positive"])

In [21]:
reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,sentiments,sentiments_1
0,2595,17857,21/11/2009,50679,Jean,Notre séjour de trois nuits.\r\nNous avons app...,0.0,Neutral
1,2595,19176,05/12/2009,53267,Cate,Great experience.,0.8,Positive
2,2595,19760,10/12/2009,38960,Anita,I've stayed with my friend at the Midtown Cast...,0.437374,Positive
3,2595,34320,09/04/2010,71130,Kai-Uwe,"We've been staying here for about 9 nights, en...",0.25,Positive
4,2595,46312,25/05/2010,117113,Alicia,We had a wonderful stay at Jennifer's charming...,0.0,Neutral


## Counting type of sentiments

In [22]:
from collections import Counter

Counter(reviews["sentiments_1"])

Counter({'Neutral': 164649, 'Positive': 857699, 'Negative': 9223})

In [23]:
reviews = reviews.drop(labels = "sentiments", axis = 1)
reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,sentiments_1
0,2595,17857,21/11/2009,50679,Jean,Notre séjour de trois nuits.\r\nNous avons app...,Neutral
1,2595,19176,05/12/2009,53267,Cate,Great experience.,Positive
2,2595,19760,10/12/2009,38960,Anita,I've stayed with my friend at the Midtown Cast...,Positive
3,2595,34320,09/04/2010,71130,Kai-Uwe,"We've been staying here for about 9 nights, en...",Positive
4,2595,46312,25/05/2010,117113,Alicia,We had a wonderful stay at Jennifer's charming...,Neutral


## converting dataframe to csv file

In [24]:
reviews.to_csv("sentiments.csv")

In [25]:
reviews[reviews["sentiments_1"] == "Negative"].index

Int64Index([     31,     144,     147,     364,     566,     683,     710,
                712,     725,     726,
            ...
            1030896, 1030932, 1031070, 1031074, 1031090, 1031201, 1031205,
            1031357, 1031417, 1031435],
           dtype='int64', length=9223)

Check randon Index Location: 364

In [27]:
reviews["comments"][364]

'Worst Airbnb and I felt uncomfortable and left. What’s in the picture is not what you going to get in person real talk.'

In [28]:
reviews[reviews["sentiments_1"] == "Positive"].index

Int64Index([      1,       2,       3,       5,       7,       8,       9,
                 10,      11,      12,
            ...
            1031559, 1031560, 1031561, 1031562, 1031563, 1031564, 1031565,
            1031566, 1031567, 1031569],
           dtype='int64', length=857699)

Check randon Index Location: 1031568

In [29]:
reviews["comments"][1031568]

'Location, host, & place was awesome with minor cleanliness issues.'

In [30]:
from collections import Counter

Counter(reviews["sentiments_1"])

Counter({'Neutral': 164649, 'Positive': 857699, 'Negative': 9223})

## After Manualy reading few labels, ratings as Positive/Negative/Neutral

In [33]:
df = pd.read_csv("sentiments_1.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1031571 entries, 0 to 1031570
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   Unnamed: 0     1031571 non-null  int64 
 1   listing_id     1031571 non-null  int64 
 2   id             1031571 non-null  int64 
 3   date           1031571 non-null  object
 4   reviewer_id    1031571 non-null  int64 
 5   reviewer_name  1031571 non-null  object
 6   comments       1031571 non-null  object
 7   sentiments_1   1031571 non-null  object
 8   Label          124 non-null      object
dtypes: int64(4), object(5)
memory usage: 70.8+ MB


## Only keeping the rows where we have manualy labeled the ratings.

In [35]:
df_verification = df.dropna()

In [36]:
df_verification.head()

Unnamed: 0.1,Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,sentiments_1,Label
0,0,2595,17857,11/21/2009,50679,Jean,Notre séjour de trois nuits.\nNous avons appré...,Neutral,2
1,1,2595,19176,12/5/2009,53267,Cate,Great experience.,Positive,0
2,2,2595,19760,12/10/2009,38960,Anita,I've stayed with my friend at the Midtown Cast...,Positive,0
3,3,2595,34320,4/9/2010,71130,Kai-Uwe,"We've been staying here for about 9 nights, en...",Positive,0
4,4,2595,46312,5/25/2010,117113,Alicia,We had a wonderful stay at Jennifer's charming...,Neutral,0


## Counting the number of readings where the predicted and manual labeling matches.

In [37]:
postive_correct = df_verification[(df_verification.sentiments_1 == "Positive") & (df_verification.Label == "0")].shape[0]
negetive_correct = df_verification[(df_verification.sentiments_1 == "Negetive") & (df_verification.Label == "1")].shape[0]
neutral_correct = df_verification[(df_verification.sentiments_1 == "Neutral") & (df_verification.Label == "2")].shape[0]
total_correct = postive_correct + negetive_correct + neutral_correct

In [38]:
print("Accuracy is: ", round((total_correct/df_verification.shape[0])*100, 2), "%")

Accuracy is:  85.48 %
