In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [15]:
data = pd.read_csv("Amazon_Unlocked_Mobile.csv")

In [16]:
data.isna().sum()

Product Name        0
Brand Name      65171
Price            5933
Rating              0
Reviews            62
Review Votes    12296
dtype: int64

In [17]:
data = data[['Rating', 'Reviews']]
data.dropna(inplace=True)
data.isna().sum()

Rating     0
Reviews    0
dtype: int64

In [18]:
def label(rating):
    if rating >=4:
        return 'Positive'
    elif rating >=4:
        return 'Negative'
    else:
        return 'Neutral'
    
data['Label'] = data['Rating'].apply(label)

In [19]:
data.head(10)

Unnamed: 0,Rating,Reviews,Label
0,5,I feel so LUCKY to have found this used (phone...,Positive
1,4,"nice phone, nice up grade from my pantach revu...",Positive
2,5,Very pleased,Positive
3,4,It works good but it goes slow sometimes but i...,Positive
4,4,Great phone to replace my lost phone. The only...,Positive
5,1,I already had a phone with problems... I know ...,Neutral
6,2,The charging port was loose. I got that solder...,Neutral
7,2,"Phone looks good but wouldn't stay charged, ha...",Neutral
8,5,I originally was using the Samsung S2 Galaxy f...,Positive
9,3,It's battery life is great. It's very responsi...,Neutral


## Text- Pre Processing

In [20]:
data['Reviews'].iloc[2]

'Very pleased'

In [23]:
def clean_url(review_text):
    return re.sub(r'http\S+', ' ', review_text)

data['CleanReview'] = data['Reviews'].apply(clean_url)

In [24]:
# Removing Punctuations and Unwanted numbers or characters
def clean_pun(review_text):
    return re.sub('[^a-zA-Z]',' ',review_text)

data['CleanReview'] = data['CleanReview'].apply(clean_pun)

In [25]:
# Converting Lower Case
def clean_case(review_text):
    return str(review_text).lower()

data['CleanReview'] = data['CleanReview'].apply(clean_case)

In [26]:
# Tokenization Process
import nltk
from nltk.tokenize import word_tokenize

def clean_token(review_text):
    return word_tokenize(review_text)

data['CleanReview'] = data['CleanReview'].apply(clean_token)

In [27]:
data.head(5)

Unnamed: 0,Rating,Reviews,Label,CleanReview
0,5,I feel so LUCKY to have found this used (phone...,Positive,"[i, feel, so, lucky, to, have, found, this, us..."
1,4,"nice phone, nice up grade from my pantach revu...",Positive,"[nice, phone, nice, up, grade, from, my, panta..."
2,5,Very pleased,Positive,"[very, pleased]"
3,4,It works good but it goes slow sometimes but i...,Positive,"[it, works, good, but, it, goes, slow, sometim..."
4,4,Great phone to replace my lost phone. The only...,Positive,"[great, phone, to, replace, my, lost, phone, t..."


In [30]:
# Removing Stopword
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def clean_words(tokens):
    return [i for i in tokens if i not in stop_words]

data['CleanReview'] = data['CleanReview'].apply(clean_words)

In [31]:
# Stemming & Lemmatization
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# STEMMING
def clean_stem(tokens):
    return [stemmer.stem(i) for i in tokens]

data['CleanReview'] = data['CleanReview'].apply(clean_stem)

In [32]:
# LEMMATIZATION
from nltk.stem import WordNetLemmatizer

lemmer = WordNetLemmatizer()

# STEMMING
def clean_lemm(tokens):
    return [lemmer.lemmatize(word=i, pos='v') for i in tokens]

data['CleanReview'] = data['CleanReview'].apply(clean_lemm)

In [33]:
data.head(5)

Unnamed: 0,Rating,Reviews,Label,CleanReview
0,5,I feel so LUCKY to have found this used (phone...,Positive,"[feel, lucki, find, use, phone, us, use, hard,..."
1,4,"nice phone, nice up grade from my pantach revu...",Positive,"[nice, phone, nice, grade, pantach, revu, clea..."
2,5,Very pleased,Positive,[pleas]
3,4,It works good but it goes slow sometimes but i...,Positive,"[work, good, goe, slow, sometim, good, phone, ..."
4,4,Great phone to replace my lost phone. The only...,Positive,"[great, phone, replac, lose, phone, thing, vol..."
