# Improving Preprocessing

Applying preprocessing to the original dataset `Reviews_withURL.csv`
* Some steps take some time to process the text; override the dataset with new columns to speed up the process.
    * Preprocessing
    * Language detection

In [1]:
# Database
import pymongo

# Preprocessing
import os
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup # Handle HTML syntax
from html import unescape
from datetime import datetime
from natsort import natsort_keygen
import re
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Data Visualization
import matplotlib.pyplot as plt
import plotly.express as px

[nltk_data] Downloading package wordnet to /home/ml/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ml/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


* Read Database

In [2]:
amazon_df = pd.read_csv('Reviews_withURL.csv',index_col=0)

In [3]:
# Preprocessing

* Set to the same NaN format

In [4]:
amazon_df.replace({None: np.nan}, inplace=True)

* Drop na and duplicates

In [5]:
amazon_df = amazon_df.dropna()
amazon_df = amazon_df.drop_duplicates()

In [6]:
amazon_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,ProductURL
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,https://www.amazon.com/dp/B001E4KFG0
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,https://www.amazon.com/dp/B00813GRG4
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,https://www.amazon.com/dp/B000LQOCH0
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,https://www.amazon.com/dp/B000UA0QIQ
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,https://www.amazon.com/dp/B006K2ZZ7K


* Preprocessing

In [24]:
def clean_text(text):
    # Decode HTML text
    text = unescape(text)
    #Lowercase
    text = text.lower()
    #Remove html tags
    soup = BeautifulSoup(text,'html.parser')
    text = soup.get_text()
    # Remove Links
    text = re.sub(r'http\S+','',text)
    # Remove Punctuation
    text = re.sub(r'[^a-zA-Z0-9\s]',' ',text)
    # Remove Stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

In [25]:
mr = 'not for me'
mr = clean_text(mr)
mr

''

In [8]:
amazon_df['Text'] = amazon_df['Text'].apply(clean_text)
amazon_df['Summary'] = amazon_df['Summary'].apply(clean_text)

  soup = BeautifulSoup(text,'html.parser')
  soup = BeautifulSoup(text,'html.parser')


In [9]:
amazon_df.to_csv('amazon_df.csv')

### Open Preprocessed Dataset

In [10]:
amazon_df = pd.read_csv('amazon_df.csv',index_col=0)

# Check languages 

* To enhance data preprocessing, it is essential to verify that all text is in English.

In [11]:
# language detection
from langdetect import detect

In [12]:
#blob.detect_language() 
# it seems detect_language() shows error, however, I found some other resource
from langdetect import detect
detect('هيا بنا نلعب')
detect('Cuando termina la clase')

'es'

### Language Detection
* `pip install langdetect`

Sometimes `langdetect` has errors when trying to recognize the language, so you should double-check if the text is not in English.

In [13]:
# detect language; return undefined if language is not detected

def detect_language(text):
    try:
        result = detect(text)
    except:
        result = 'undefined'
    return result

In [14]:
# check each row; if the column is already in the dataframe, skip process

if 'language' not in amazon_df.columns:
    amazon_df['language'] = amazon_df['Text'].apply(lambda x: detect_language(x))

In [15]:
# Takes some time to dectect the language; override the dataset with new column 'language'

amazon_df.to_csv('amazon_df.csv')

In [2]:
amazon_df = pd.read_csv('amazon_df.csv',index_col=0)

In [16]:
amazon_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,ProductURL,language
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,good quality dog food,bought several vitality canned dog food produc...,https://www.amazon.com/dp/B001E4KFG0,en
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,advertised,product arrived labeled jumbo salted peanuts p...,https://www.amazon.com/dp/B00813GRG4,en
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,delight says,confection around centuries light pillowy citr...,https://www.amazon.com/dp/B000LQOCH0,en
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,cough medicine,looking secret ingredient robitussin believe f...,https://www.amazon.com/dp/B000UA0QIQ,en
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,great taffy,great taffy great price wide assortment yummy ...,https://www.amazon.com/dp/B006K2ZZ7K,en


* __Obs__: It seems that not all the rows are in english. Checking rows with __langdetect__

In [3]:
amazon_df['language'].value_counts()

language
en           560594
da             1981
af             1681
no             1488
nl              453
it              425
fr              418
ro              304
ca              284
et              205
es              189
sv              147
tl               44
cy               41
sk               23
id               21
sl               19
so               18
hr               16
pt               15
tr                7
de                6
sq                5
pl                5
sw                3
fi                3
cs                2
lt                2
undefined         1
lv                1
Name: count, dtype: int64

In [5]:
# Store a list of languages

language_list = amazon_df['language'].value_counts().reset_index()['language'].to_list()
language_list[1:]

['da',
 'af',
 'no',
 'nl',
 'it',
 'fr',
 'ro',
 'ca',
 'et',
 'es',
 'sv',
 'tl',
 'cy',
 'sk',
 'id',
 'sl',
 'so',
 'hr',
 'pt',
 'tr',
 'de',
 'sq',
 'pl',
 'sw',
 'fi',
 'cs',
 'lt',
 'undefined',
 'lv']

### Must delete
delete vi

In [6]:
# checking text that are not recognize as english

# amazon_df[amazon_df['language'].isin(language_list[1:])]

In [7]:
# personalized searching for each language

text = amazon_df[(amazon_df['language'].isin(language_list[1:])) & (amazon_df['language']== 'undefined')]['Text']
text

233938    NaN
Name: Text, dtype: object

In [21]:
# Check the text for language recognition and use a for loop to print the entire message.

for i in range(len(text)):
    print(text[text.index][text.index[i]])

nan


In [22]:
amazon_df['language'].value_counts()

language
en           560594
da             1981
af             1681
no             1488
nl              453
it              425
fr              418
ro              304
ca              284
et              205
es              189
sv              147
tl               44
cy               41
sk               23
id               21
sl               19
so               18
hr               16
pt               15
tr                7
de                6
sq                5
pl                5
sw                3
fi                3
cs                2
lt                2
undefined         1
lv                1
Name: count, dtype: int64

In [14]:
amazon_df[amazon_df.isnull().any(axis=1)]

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,ProductURL,language
131,132,B003OB0IB8,A2DLRG3YX6A2ZU,luckybabe,2,3,2,1323820800,,must bit wuss soup tastes imagine fire might t...,https://www.amazon.com/dp/B003OB0IB8,en
335,336,B00469VSJI,A1447CDAPZGLYV,SANA AWAR,0,0,1,1326499200,,serveice delivery seller excellent product ord...,https://www.amazon.com/dp/B00469VSJI,en
1219,1220,B005O072PC,A3UAV8WK31RJY5,"Captain Cap ""Captain Cap""",1,1,4,1341878400,,received 4 first ever omaha steaks order recei...,https://www.amazon.com/dp/B005O072PC,en
2639,2640,B0016FY6H6,A3J171NBGIBYAW,"D. Mansfield ""Book Addict""",0,0,2,1346803200,,wanted desperately love even like drink mix su...,https://www.amazon.com/dp/B0016FY6H6,en
3280,3281,B005K4Q1VI,APTGM66PJRNGC,J. Slade,0,0,2,1330992000,,easy use best flavor much sugar quality though...,https://www.amazon.com/dp/B005K4Q1VI,en
...,...,...,...,...,...,...,...,...,...,...,...,...
567192,567193,B000LL0R92,A213624L3ZBL2B,margeaw,0,0,1,1333929600,,thought sounded interesting healthy taste awfu...,https://www.amazon.com/dp/B000LL0R92,en
567239,567240,B001FA1LF2,A3CVZZF48URP26,GFMom,1,1,3,1274745600,,high expectations bread since love schar class...,https://www.amazon.com/dp/B001FA1LF2,en
567660,567661,B005K4Q68Q,APTGM66PJRNGC,J. Slade,0,0,2,1330992000,,easy use best flavor much sugar quality though...,https://www.amazon.com/dp/B005K4Q68Q,en
567937,567938,B0030VJ8YU,A90I4J49NU3XN,Amy W,0,0,3,1317772800,,18 month old likes eat pretty much antying ped...,https://www.amazon.com/dp/B0030VJ8YU,en


In [23]:
amazon_df.isna().sum()

Id                           0
ProductId                    0
UserId                       0
ProfileName                  0
HelpfulnessNumerator         0
HelpfulnessDenominator       0
Score                        0
Time                         0
Summary                   1260
Text                         1
ProductURL                   0
language                     0
dtype: int64

In [23]:
a = detect_language("say wonderful highlight weekend ohhhhhh use soda water liquid even better")
a

'af'

### TextBlob Analysis

In [24]:
from textblob import TextBlob

In [25]:
blob = TextBlob(text)

TypeError: The `text` argument passed to `__init__(text)` must be a string, not <class 'pandas.core.series.Series'>

In [None]:
# Analyzing words, senteces and part-of-speach tagging

blob.words
blob.tags
blob.noun_phrases
blob.sentences

In [None]:
blob.sentiment_assessments

### Tranlation
1. `pip install translate`
2. Check translate version `pip list | grep translate`

In [None]:
text

In [None]:
from translate import Translator

translator = Translator(to_lang="fr")  # Destination language (e.g., "fr" for French)
translation = translator.translate("I like the product a lot but it sucks")
print(translation)


In [None]:
from translate import Translator

translator = Translator(from_lang='es',to_lang="en")  # Destination language (e.g., "fr" for French)
translation = translator.translate(text)
print(translation)



In [15]:
import pandas as pd
import numpy as np

In [16]:
g = pd.read_csv('Reviews_withURL.csv',index_col=0)

In [17]:
g.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,ProductURL
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,https://www.amazon.com/dp/B001E4KFG0
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,https://www.amazon.com/dp/B00813GRG4
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,https://www.amazon.com/dp/B000LQOCH0
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,https://www.amazon.com/dp/B000UA0QIQ
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,https://www.amazon.com/dp/B006K2ZZ7K


In [22]:
g[g.index == 131]

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,ProductURL
131,132,B003OB0IB8,A2DLRG3YX6A2ZU,luckybabe,2,3,2,1323820800,Not for me,"I must be a bit of a wuss, because this soup t...",https://www.amazon.com/dp/B003OB0IB8


In [21]:
g[g.index == 131]['Text'].values

array(["I must be a bit of a wuss, because this soup tastes to me how I imagine fire might taste. Typically I like spicy food if it has a good flavor.  I don't find this to be the case with this soup. Any flavor is killed off by the burn."],
      dtype=object)

In [None]:
gg = g.replace({None: np.nan})

In [None]:
g.isna().sum()