<a href="https://colab.research.google.com/github/manalibhoir22/manali/blob/master/Cleaning_and_some_visualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load the data

In [4]:
import numpy as np
import pandas as pd
# Read the review data from the Excel workbook in the working directory.
df = pd.read_excel('data.xlsx')
# Show the first rows as a quick check of the load.
df.head()

Unnamed: 0.1,Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,1,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,2,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,3,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,4,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,5,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


# Dropping unnecessary columns

In [6]:
# Keep only the columns the rest of the notebook uses and drop everything else.
# Dropping by name (rather than by position, as df.columns[[3]] did) makes the
# cell idempotent: re-running it is a no-op instead of removing yet another
# column, and a fresh Restart & Run All reproduces the three-column frame that
# the whitespace check below unpacks.
keep_columns = ['asin', 'reviewText', 'overall']
df.drop(columns=[col for col in df.columns if col not in keep_columns], inplace=True)
df.head()

Unnamed: 0,asin,reviewText,overall
0,120401325X,They look good and stick good! I just don't li...,4
1,120401325X,These stickers work like the review says they ...,5
2,120401325X,These are awesome and make my phone look so st...,5
3,120401325X,Item arrived in great time and was in perfect ...,4
4,120401325X,"awesome! stays on, and looks great. can be use...",5


# Remove Blank Records and null values

In [7]:
# Count the missing values in each column.
df.isnull().sum()

asin           0
reviewText    99
overall        0
dtype: int64

In [8]:
# Gather the index labels of reviews that consist solely of whitespace
# (an empty result is perfectly fine).
blanks = [
    idx
    for idx, _asin, review, _rating in df.itertuples()   # walk the DataFrame row by row
    if type(review) == str and review.isspace()          # the type test skips NaN values
]

len(blanks)

0

In [9]:
# Remove every row that contains a missing value, modifying df in place.
df.dropna(inplace=True)

#### Convert reviewText to string

In [10]:
# Convert each reviewText value to a Python str.
df['reviewText']=df['reviewText'].apply(str)
df.head()

Unnamed: 0,asin,reviewText,overall
0,120401325X,They look good and stick good! I just don't li...,4
1,120401325X,These stickers work like the review says they ...,5
2,120401325X,These are awesome and make my phone look so st...,5
3,120401325X,Item arrived in great time and was in perfect ...,4
4,120401325X,"awesome! stays on, and looks great. can be use...",5


In [11]:
# Look up one review by its index label (194434 is a label, not a position:
# it is larger than the number of rows left in df).
(df['reviewText'])[194434]

'Works great just like my original one. I really need extras of these for the car and house so love this and the price!'

In [12]:
# Number of rows left in df at this point.
len(df)

194340

### Cleaning the data 
Remove ‘\n’

Remove emojis if any

Remove punctuation marks

Remove extra spaces

Remove stopwords — Stopwords are those words which occur very frequently but are not required for analysis as they provide no insights. Removing them will reduce computational load. They include words like I, me, myself, that, him, etc.

In [13]:
import string
import nltk 
from nltk.corpus import stopwords
from nltk import PorterStemmer
import re
from nltk.tokenize import word_tokenize

In [14]:
def deEmojify(text):
    """Return `text` with every character from the listed emoji ranges removed.

    Only the four Unicode ranges spelled out below are stripped; any other
    character is left as it is.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    # One character class; '+' lets a run of emojis be removed in one match.
    emoji_matcher = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)
    return emoji_matcher.sub(r'', text)

In [15]:
# Hand-picked stop-word list (all lower case); clean_text drops every token found in it.
stop_words = ['in','of','at','a','the','and','is','on','an','they','was','it','i','them','to','these','this','for']

In [16]:
def decontracted(phrase):
    """Expand common English contractions in `phrase`.

    Matching is case-insensitive, so a contraction at the start of a sentence
    ("Won't", "Can't") or written in capitals is expanded as well. Previously
    "Won't" fell through to the generic n't rule and came out as "Wo not".
    For the two irregular forms the original first letter is kept, so
    "Won't" becomes "Will not".

    Args:
        phrase: the text to expand (str).

    Returns:
        A new string with the contractions replaced by their long forms.
    """
    # Order matters: the irregular forms must run before the generic n't rule,
    # and n't must run before the bare 't rule.
    rules = (
        (r"(w)on't", r"\1ill not"),
        (r"(c)an't", r"\1an not"),
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
    for pattern, replacement in rules:
        phrase = re.sub(pattern, replacement, phrase, flags=re.IGNORECASE)
    return phrase

#### USING SNOWBALL

In [17]:
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet

# Stemmer: Snowball stemmer configured for English.
snow_stemmer = SnowballStemmer(language='english')

# Lemmatizer: WordNet-based lemmatizer.
lemmatizer = WordNetLemmatizer()

In [None]:
# NOTE(review): this re-creates the lemmatizer that the preceding cell already built.
lemmatizer = WordNetLemmatizer()
# Maps the first letter of a POS tag to the matching WordNet constant
# (noun, verb, adjective, adverb); used by lemmatize_words.
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV} 

In [None]:
def lemmatize_words(text):
    """Lemmatize each whitespace-separated word of `text` using its POS tag.

    The first letter of every tag is looked up in `wordnet_map`; tags with no
    entry there are treated as nouns. The lemmas are returned joined by single
    spaces.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wn_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)

In [20]:
def clean_text(text):
    """Normalise one review into a lower-cased, lemmatised, space-joined string.

    Steps, in order: expand contractions, strip emojis, strip ASCII
    punctuation, lower-case, split on whitespace, drop stop words, lemmatise.

    Args:
        text: the raw review text (str).

    Returns:
        The cleaned review as one string of single-space-separated tokens.
    """
    text = decontracted(text)   # must run before the apostrophes are stripped
    text = deEmojify(text)      # remove emojis

    # string.punctuation holds ASCII punctuation only.
    text_cleaned = "".join(ch for ch in text if ch not in string.punctuation)
    text_cleaned = text_cleaned.lower()

    # split() with no argument splits on any run of whitespace, so newlines,
    # tabs and repeated spaces are all removed and no empty tokens are produced
    # (collapsing ' +' and splitting on " " left '\n' inside tokens and produced
    # empty tokens for leading/trailing spaces).
    tokens = [token for token in text_cleaned.split() if token not in stop_words]

    # Only the lemmatised form is returned. The earlier version also ran a
    # Snowball stemmer over the tokens, but that result was overwritten
    # immediately, so the stemming pass was wasted work and has been removed.
    return " ".join(lemmatizer.lemmatize(token) for token in tokens)

In [22]:
import nltk
# Download the WordNet corpus (the data behind nltk's WordNetLemmatizer).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True



Apply the cleaning function to the reviews column

In [23]:
# Run clean_text over every review and store the result in a new column.
df['cleaned_reviews'] = df['reviewText'].apply(clean_text)
df.head()

Unnamed: 0,asin,reviewText,overall,cleaned_reviews
0,120401325X,They look good and stick good! I just don't li...,4,look good stick good just do not like rounded ...
1,120401325X,These stickers work like the review says they ...,5,sticker work like review say do stick great st...
2,120401325X,These are awesome and make my phone look so st...,5,are awesome make my phone look so stylish have...
3,120401325X,Item arrived in great time and was in perfect ...,4,item arrived great time perfect condition howe...
4,120401325X,"awesome! stays on, and looks great. can be use...",5,awesome stay look great can be used multiple a...


In [27]:
def sentiment(n):
    """Map a rating to a binary label: 1 when the rating is 4 or higher, else 0."""
    if n >= 4:
        return 1
    return 0
# Derive the binary sentiment label from the 'overall' rating column.
df['sentiment'] = df['overall'].apply(sentiment)
df.head()

Unnamed: 0,asin,reviewText,overall,cleaned_reviews,sentiment
0,120401325X,They look good and stick good! I just don't li...,4,look good stick good just do not like rounded ...,1
1,120401325X,These stickers work like the review says they ...,5,sticker work like review say do stick great st...,1
2,120401325X,These are awesome and make my phone look so st...,5,are awesome make my phone look so stylish have...,1
3,120401325X,Item arrived in great time and was in perfect ...,4,item arrived great time perfect condition howe...,1
4,120401325X,"awesome! stays on, and looks great. can be use...",5,awesome stay look great can be used multiple a...,1


In [28]:
# Number of rows per sentiment label (class balance).
df['sentiment'].value_counts()

1    148576
0     45764
Name: sentiment, dtype: int64

# Splitting into train and test

In [29]:
from sklearn.model_selection import train_test_split

X = df['cleaned_reviews']  # features: the cleaned review text
y = df['sentiment']        # target: the binary sentiment label

# Hold out 33% of the rows for testing; random_state makes the split repeatable.
# NOTE(review): no stratify= argument is passed although the two classes are
# imbalanced — consider stratify=y so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print('Training Data Shape:', X_train.shape)
print('Testing Data Shape: ', X_test.shape)

Training Data Shape: (130207,)
Testing Data Shape:  (64133,)


# TRAIN DIFFERENT CLASSIFIERS

In [31]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer



## Modeling
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from  sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from gensim.models import Word2Vec
from tqdm import tqdm
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier

# USING TFIDF VECTORIZER

In [34]:
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

In [52]:
# XGBoost classifier on TF-IDF features built from unigrams and bigrams.
# NOTE(review): the first pipeline step is named 'xgb' although it holds the
# TfidfVectorizer; the classifier itself is the 'clf' step.
text_clf_xgb_tfidf = Pipeline([('xgb',TfidfVectorizer(binary=True,ngram_range=(1,2))),
                    ('clf',XGBClassifier(learning_rate=0.3,n_estimators=2000)),
])

# Fit on the training split, then predict the held-out test split.
text_clf_xgb_tfidf.fit(X_train, y_train) 
predictions_xgb_tfidf = text_clf_xgb_tfidf.predict(X_test)
# Report plain accuracy on the test split.
print(f"The Final accuracy using XGB and TFIDF vectorizer is : {metrics.accuracy_score(y_test,predictions_xgb_tfidf)}")

The Final accuracy using XGB and TFIDF vectorizer is : 0.8793444872374596
