In [None]:
#Importing Libraries
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns


In [None]:
#Reading Data
# Raw string: the Windows path contains backslash sequences ("\P", "\D", "\E",
# "\W") that are invalid escapes in a normal string literal and produce a
# DeprecationWarning/SyntaxWarning on recent Python versions. The resulting
# path value is unchanged.
data = pd.read_csv(r'D:\Pratik\Data Science\Ecommer Reviews\Womens Clothing E-Commerce Reviews.csv')
data.head()

In [None]:
data.dtypes

In [None]:
#Dropout Unwanted Features
data.drop(['Unnamed: 0','Clothing ID'], axis=1, inplace=True)

In [None]:
#Rename the columns names
data.rename(columns={'Review Text': 'text',
                    'Positive Feedback Count': 'feedback_count',
                    'Division Name': 'Division', 'Department Name': 'Department',
                    'Class Name':'Class', 'Recommended IND': 'Recommended'},inplace=True)

In [None]:
#Our New Cloumns Names
print(data.columns)

In [None]:
#Look into the missing vlaues
data.isnull().sum()

In [None]:
#Removing out every row in the DF which contains missing data
data.dropna(axis=0, inplace=True)

In [None]:
data.isnull().sum()

In [None]:
#This histgram below represent that the 40 Yrs Old ladies are the main customers of the Site
sns.histplot(data=data, x='Age', kde=True)
plt.show()

In [None]:
#The majority of site visitors are more likely to give 5 rates for each product
sns.histplot(data=data, x='Rating', kde=False)

In [None]:
#Layering class on the top of whole ratings comparing to other classes
sns.set_style('white')
sns.barplot(x=data['Rating'], y=data['Class'],palette="Blues_d")

In [None]:
# Bar per clothing class showing the aggregated reviewer age
# (seaborn's barplot uses the mean as its default estimator).
# Author's reading: sweaters skew towards customers over 40, casual bottoms
# towards customers in their 20s.
sns.set_theme(style="white")
# The bar colours come from the explicit `palette` argument below; a bare
# sns.color_palette(...) call only returns a palette and applies nothing,
# so it is not needed here.
sns.barplot(data=data, x='Age', y='Class', palette="Reds_d")

In [None]:
# KDE of the ratings split by clothing class; the densities are stacked and
# normalised to fill the axis, and the estimate is clipped to ratings >= 0
sns.displot(
    data=data,
    x='Rating',
    hue="Class",
    kind="kde",
    multiple="fill",
    clip=(0, None),
    height=6,
)

In [None]:
# Box plot of reviewer age for each department; the boxes show the median
# and quartiles of age rather than the mean
sns.boxplot(data=data, x='Department', y='Age')
plt.show()

In [None]:
# Pie chart of the summed ratings per department
# (author's reading: Tops has the largest share, Trend the smallest).
# Only the Rating column is aggregated: summing the whole frame would also
# aggregate the free-text columns, which is wasteful and, depending on the
# pandas version, either concatenates strings or emits a warning.
rating_by_department = data.groupby('Department')[['Rating']].sum()
rating_by_department.plot(kind='pie', y='Rating', subplots=False, shadow=False,
                          startangle=90, figsize=(15,10))

In [None]:
#Which one is highly Recommended
sns.countplot(data=data, y='Class', hue='Recommended',  palette = "Set1")

In [None]:
sns.countplot(data=data, y='Department', hue='Recommended' ,palette = "Set1")

In [None]:
sns.countplot(data=data, y='Division', hue='Recommended', palette = "Set1")

In [None]:
# Word cloud of the raw review text
from wordcloud import WordCloud
# Join the complete text of every review. Picking a single word per review
# (e.g. split()[1]) would discard almost all of the text and raise IndexError
# for any review that has fewer than two words.
text = " ".join(data['text'].astype(str))
# Build the word cloud from the combined text; collocations=False counts
# single words only instead of also adding two-word phrases
word_cloud = WordCloud(collocations = False, background_color = 'white').generate(text)
# Display the generated word cloud without axes
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
#Importing Libraries for NLP 
import nltk
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer

In [None]:
# List the current column names
data.columns

In [None]:
# Duplicate the review text into `all_text`, the column the tokenization step reads
data['all_text'] = data['text']

In [None]:
# nltk.word_tokenize needs the Punkt tokenizer data. Recent NLTK releases look
# it up under the resource name 'punkt_tab', older ones under 'punkt', so both
# are fetched to keep the notebook runnable on either.
nltk.download('punkt');
nltk.download('punkt_tab');

In [None]:
#Tokenization Function
def tokenize(column):
    """Split one review string into its alphabetic word tokens.

    Args:
        column: Text of a single review (one value of the text column).

    Returns:
        list[str]: The tokens produced by ``nltk.word_tokenize`` that consist
        only of letters; punctuation and numeric tokens are dropped.
    """
    tokens = nltk.word_tokenize(column)
    # isalpha has to be called: the bare attribute `w.isalpha` is a bound
    # method object, which is always truthy and therefore filters nothing.
    return [w for w in tokens if w.isalpha()]

In [None]:
# Tokenize each review and store the token lists in a new column
data['tokenized'] = data['all_text'].apply(tokenize)
data[['text', 'tokenized']].head()

In [None]:
# Download the NLTK stopword corpus (this cell only fetches the data;
# the removal itself happens in the cells below)
nltk.download('stopwords');

In [None]:
#Function to remove stopwords
_STOPWORD_SETS = {}


def _stopword_set(language):
    """Return the NLTK stopword list for `language` as a frozenset, loading it only once."""
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOPWORD_SETS[language]


def remove_stopwords(tokenized_column):
    """Drop English stopwords from one tokenized review.

    Args:
        tokenized_column: Iterable of word tokens for a single review.

    Returns:
        list[str]: The tokens, in their original order and casing, that are
        not English stopwords.
    """
    # The stopword list is cached so it is not re-read for every row.
    stops = _stopword_set('english')
    # NLTK's English stopwords are lowercase, so compare case-insensitively;
    # otherwise capitalised stopwords such as "The" or "I" slip through.
    return [word for word in tokenized_column if word.lower() not in stops]

In [None]:
# Strip stopwords from every tokenized review
data['stopwords_removed'] = data['tokenized'].apply(remove_stopwords)
data[['text', 'stopwords_removed']].head()

In [None]:
#Stemming
def apply_stemming(tokenized_column):
    """Porter-stem every token of one tokenized review.

    Args:
        tokenized_column: Iterable of word tokens for a single review.

    Returns:
        list[str]: One stem per input token, in the original order.
    """
    stem = PorterStemmer().stem
    stemmed_tokens = []
    for token in tokenized_column:
        stemmed_tokens.append(stem(token))
    return stemmed_tokens

In [None]:
# Stem the stopword-free tokens of every review
data['porter_stemmed'] = data['stopwords_removed'].apply(apply_stemming)
data[['text', 'porter_stemmed']].head()

In [None]:
#Rejoin words
def rejoin_words(tokenized_column):
    """Concatenate the tokens of one review into a single space-separated string.

    Args:
        tokenized_column: Iterable of string tokens.

    Returns:
        str: The tokens joined by single spaces ('' for an empty input).
    """
    separator = " "
    sentence = separator.join(tokenized_column)
    return sentence

In [None]:
# Turn each list of stems back into one string per review
data['rejoined'] = data['porter_stemmed'].apply(rejoin_words)
data[['text', 'rejoined']].head()

In [None]:
# Word cloud of the cleaned (stopword-free, stemmed) reviews.
# Join the complete cleaned text of every review. Picking a single word per
# review (e.g. split()[1]) would discard almost all of the text and raise
# IndexError for reviews left with fewer than two words after cleaning.
text = " ".join(data['rejoined'].astype(str))
# Build the word cloud from the combined text; collocations=False counts
# single words only instead of also adding two-word phrases
word_cloud = WordCloud(collocations = False, background_color = 'white').generate(text)
# Display the generated word cloud without axes
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Split into training and test sets: 30% of the reviews are held out for
# testing, with a fixed random_state so the split is repeatable
from sklearn.model_selection import train_test_split
features = data['rejoined']
labels = data['Recommended']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=0
)


In [None]:
# Show the first training document and the size of the training set.
# Position 0 is the first entry; iloc[1] would print the second one.
print('X_train first entry :\n\n', X_train.iloc[0])
print('\n\nX_train shape: ', X_train.shape)

In [None]:
# Learn the bag-of-words vocabulary from the training documents
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()
vect = count_vectorizer.fit(X_train)

In [None]:
vect.get_feature_names()[::1000]

In [None]:
len(vect.get_feature_names())

In [None]:
# Transform the document in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)
X_train_vectorized

In [None]:
#Creating Support vector machine classifier
from sklearn import svm
clf = svm.SVC(kernel='linear')
# train the classifier: 
clf.fit(X_train_vectorized, y_train)

In [None]:
#Evaluating the model
from sklearn.metrics import accuracy_score, classification_report, f1_score
# Predict on the test documents, encoded with the vectorizer fitted on the training set
predictions = clf.predict(vect.transform(X_test))
print(classification_report(y_test, predictions))
# The digit count belongs to round(), not to the metric function: passing 2 as
# a third argument to the metric is rejected by newer scikit-learn releases
# and otherwise left round() without a precision, collapsing the score to an integer.
print('Accuracy score: ', round(accuracy_score(y_test, predictions), 2))
print('F1_score: ', round(f1_score(y_test, predictions), 2))

In [None]:
# Fit a TfidfVectorizer on the training data, keeping only terms that occur
# in at least 5 documents, and report the resulting vocabulary size.
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(min_df = 5).fit(X_train)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
len(vect.get_feature_names_out())

In [None]:
# Train a linear-kernel SVM on the TF-IDF features and print its test accuracy
X_train_vectorized = vect.transform(X_train)
model = svm.SVC(kernel='linear')
model.fit(X_train_vectorized, y_train)
X_test_vectorized = vect.transform(X_test)
predictions = model.predict(X_test_vectorized)
print('accuracy :', accuracy_score(y_test, predictions))