In [None]:
# importing several libraries
import matplotlib.pyplot as plt 
import pandas as pd 
import numpy as np
import string
import nltk
import nltk.corpus
import sklearn
import string

In [None]:
pip install wordcloud

In [None]:
from matplotlib import rcParams
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords 
from nltk import NaiveBayesClassifier
from nltk.corpus import wordnet 
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag
from wordcloud import WordCloud
from sklearn.ensemble import RandomForestClassifier 
from nltk.classify.scikitlearn import SklearnClassifier

In [None]:
# Path to the reviews CSV, relative to the notebook's working directory
data_path = 'Womens Clothing E-Commerce Reviews.csv'

# Read the CSV file into a DataFrame; `df` is the working frame for the cleaning cells below
df=pd.read_csv(data_path)

# Display the first few rows of the DataFrame
df.head()

In [None]:
# Display the column names (later cells drop some of them and replace spaces with underscores)
df.columns

In [None]:
# Check the shape of the dataset as (rows, columns)
df.shape

In [None]:
# Count the number of unique values in each column
df.nunique()

In [None]:
# Count the number of missing values in each column
df.isna().sum()

In [None]:
# remove any row that contains at least one NaN value
df.dropna(inplace=True)

In [None]:
# reset the index after dropping some rows 
df.reset_index(drop=True, inplace=True)
df.head()

In [None]:
# Check for the missing values after droping the null values 
df.isnull().sum()

In [None]:
# drop unnecessary culomns 
df.drop(["Unnamed: 0", "Title", 'Clothing ID'], axis=1, inplace=True)

In [None]:
df.head()

In [None]:
# Replace spaces in column names with underscores so the columns can be
# accessed as attributes (e.g. df.Review_Text) in later cells
df.columns= df.columns.str.replace(" ", "_")

In [None]:
# Build a list of (review_text, category) tuples, where category is the
# Recommended_IND value of the same row.
# zip pairs the two columns by position, so this does not depend on the index
# labels being exactly 0..n-1 (a label lookup such as df.Review_Text[i] does).
reviews = list(zip(df.Review_Text, df.Recommended_IND))
# Print first 4
reviews[0:4]

In [None]:
# create lemmatizer 
lemmatizer = WordNetLemmatizer()

In [None]:
# download necessary resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

In [None]:
# Create a list of stopwords 
# Get English stopwords
stops = set(stopwords.words('english'))

# Get a set of specific punctuation marks
punctuations = set(string.punctuation)
#print(punctuations)
# Combine stopwords and punctuation sets
stops.update(punctuations)

In [None]:
# total stopwords
len(stops)

In [None]:
# Map a POS tag (as produced by pos_tag) to the coarse WordNet POS used for lemmatization
def get_simple_pos(tag):
    """Return the WordNet part-of-speech constant for a POS tag.

    Only the first letter of ``tag`` is inspected:
    'J' -> adjective, 'N' -> noun, 'V' -> verb, 'R' -> adverb.
    Any other tag (including an empty string) falls back to noun, which is
    also the lemmatizer's own default.

    Parameters
    ----------
    tag : str
        POS tag of a token, e.g. 'NNS', 'VBD', 'JJ'.

    Returns
    -------
    The matching ``wordnet`` POS constant.
    """
    pos_by_prefix = {
        'J': wordnet.ADJ,   # adjectives were previously lemmatized as nouns
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
    }
    # tag[:1] (not tag[0]) so that an empty tag reaches the default instead of raising
    return pos_by_prefix.get(tag[:1].upper(), wordnet.NOUN)

In [None]:
# Tokenize a review, drop stop words and symbol-only tokens, and lemmatize what is left
def clean_review(words):
    """Return the list of lower-cased, lemmatized tokens of a review text.

    The text is tokenized and POS-tagged; a token is kept only if its
    lower-cased form is not in ``stops`` and it contains at least one
    alphanumeric character. Each kept token is lemmatized using the WordNet
    POS derived from its tag by ``get_simple_pos``.

    Parameters
    ----------
    words : str
        Raw review text.

    Returns
    -------
    list of str
        Cleaned tokens in their original order.
    """
    lemmas = []
    for token, tag in pos_tag(word_tokenize(words)):
        lowered = token.lower()
        # skip stop words and punctuation listed in the exclusion set
        if lowered in stops:
            continue
        # skip tokens made only of symbols (e.g. "..." or "--")
        if not any(ch.isalnum() for ch in token):
            continue
        lemmas.append(lemmatizer.lemmatize(lowered, pos=get_simple_pos(tag)))
    return lemmas

In [None]:
# Quick sanity check of clean_review on a short example sentence
cleaned_words = clean_review("My cats are running away from my arms")
print(cleaned_words)

In [None]:
# Clean every review: each entry becomes (list_of_tokens, category).
# NOTE: this reads `reviews` as (text, category) tuples; a later cell rebinds
# `reviews` to a list of plain strings, so re-running this cell after that one fails.
cleaned_reviews= [(clean_review(text),category )for text,category  in reviews]

In [None]:
# copy clean version into excel spreadsheet to be used for tableau purposes
# Create a DataFrame from the list of tuples
df = pd.DataFrame(cleaned_reviews, columns=['Cleaned_Text', 'Category'])

# Save the DataFrame to an Excel file
df.to_excel('cleaned_data.xlsx', index=False)

In [None]:
#check first 5 reviews 
cleaned_reviews[0:5]

In [None]:
# 75% training = 14746 and 25% testing = 19662 - 14746 = 4916
#traning_words=cleaned_reviews[0:14746]
#testing_words=cleaned_reviews[14746:]

In [None]:
pip install scikit-learn

In [None]:
from sklearn.model_selection import train_test_split

# Split the cleaned reviews into training and testing sets (80% training, 20% testing).
# random_state is fixed so the split is the same on every run.
training_words, testing_words = train_test_split(cleaned_reviews, test_size=0.2, random_state=42)


In [None]:
# Report the size of each split
print(f"Training Data = {len(training_words)}")
print(f"Testing Data = {len(testing_words)}")

In [None]:
# array contaning all words 
words_list=[]
for word in training_words:
        words_list+=word[0] # 0 index to get only the words 

In [None]:
# Total words in traning data 
len(words_list)

In [None]:
#frequency distribution for all words 
freq= nltk.FreqDist(words_list)
# The .most_common() method lists the words which occur most frequently in the data along with the frequency
common=freq.most_common()
# features are an array of only the top words in word list without The number of words 
features= [i[0]for i in common]

In [None]:
print(len(common))
print(len(features))

In [None]:
# Most common 5 words 
common[0:5]

In [None]:
# List of 5 features 
features[0:5]

In [None]:
# Visualizing the highest repeating words (features)

# A word cloud draws each word at a size that reflects how often it occurs.
# It is built from the actual counts in `freq`: str(features) would contain
# every distinct word exactly once, so the sizes would not reflect frequency.
wordCloud = WordCloud(background_color="white", max_words=3000).generate_from_frequencies(freq)

rcParams["figure.figsize"] = 10, 20
plt.imshow(wordCloud)
plt.axis("off")
# plt.show must be called; the bare attribute `plt.show` does nothing
plt.show()

In [None]:
# Turn a list of words into a {feature: bool} dictionary over the global feature list
def get_dict_for_feature(words):
    """Return a presence dictionary for ``words`` over every entry of ``features``.

    Parameters
    ----------
    words : iterable of str
        Tokens of one review.

    Returns
    -------
    dict
        One key per entry of the module-level ``features`` list (in that
        order); the value is True if the feature occurs in ``words`` and
        False otherwise. Words that are not features are ignored.
    """
    # a set makes each membership test constant time
    present = set(words)
    return {feature: feature in present for feature in features}

In [None]:
# Example: feature dictionary of the first training review
# (training_words[0][0] is still that review's token list at this point)
featuers_dic= get_dict_for_feature(training_words[0][0])

In [None]:
# Dictionary with one key per feature: True if the word occurs in the review, otherwise False.
# It has an entry for every word in `features`, so the displayed output is long.
featuers_dic  

In [None]:
# Replace the token list of each review with its feature dictionary, keeping the category.
# NOTE: both names are rebound from (tokens, category) to (feature_dict, category);
# run this cell only once per split, because a second run would treat the
# dictionaries' keys as the review's words.
training_words= [( get_dict_for_feature(words),category ) for words , category in training_words]
testing_words = [( get_dict_for_feature(words),category ) for words , category in testing_words]

In [None]:
# Inspect one (feature_dict, category) training example
training_words[1]

In [None]:
# Train a Naive Bayes classifier on the (feature_dict, category) training examples
NB_classifier= NaiveBayesClassifier.train(training_words)
# Accuracy on the data the model was fitted on; shown only for comparison
print("training accuracy percent:",(nltk.classify.accuracy(NB_classifier, training_words))*100)
# Accuracy on the held-out test split; this is the figure that estimates
# performance on unseen reviews
print("classifier accuracy percent:",(nltk.classify.accuracy(NB_classifier, testing_words))*100)

In [None]:
# Hand-written example reviews used to try the trained classifier on new text
review_1 = "Super fast and responsive with any issues. Different style print option was great! Easy to order and a pleasure to have done business with. Looking forward to ordering more items! Thank you"
review_2= "I am thrilled with the quality & fit of the t-shirts& they were very nicely packaged too. I will definitely be re-ordering from you in the future. - Kristina - Spain"
review_3="Missing refunds. Returned parcel and got a date that I would get the refund by, five days after this date no refund. Contacted customer support and they advised I have to wait another 14 days. The service was very unhelpful and rude at times."
# NOTE: this rebinds `reviews`, which earlier held the (text, category) tuples built from the DataFrame
reviews = [review_1,review_2,review_3]


def test_custom_review(reviews_list, classifier):
    
    for idx,review in enumerate(reviews_list) : 
        custom_tokens = clean_review(review)
        print(f"The clean review is : "  , str(custom_tokens).replace('[','').replace(']',''))
        classifiers=classifier.classify(dict([token, True] for token in custom_tokens))
        if (classifiers == 1):
            pred = "Positive"
        else:
            pred = "Negative"
        print(f"Review number {idx +1 }  seems to be {pred} \n")

In [None]:
# Classify the hand-written example reviews with the trained Naive Bayes model
test_custom_review(reviews,NB_classifier)