In [114]:
import pandas as pd
import string
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import sys
from collections import Counter
import random
import ast
import re

sys.path.append('pymodules')
# This class contains some utility functions: Word2Vec, stop words, etc.
import pymodules.preprocessing_class as pc

# gender guesser
import gender_guesser.detector as gd

In [115]:
# Function to get the first name so that we can guess the gender
def first_name(x):
    """
    Extract a capitalized first name from a raw author value.

    The value is converted to a string and split on whitespace. The first
    token is taken as the first name, every digit is removed from it, and the
    result is capitalized (first letter upper case, the rest lower case).

    Parameters
    ----------
    x : any
        Raw author value. Usually a string; None and float NaN are treated
        as "no author".

    Returns
    -------
    str
        The cleaned first name, or "" when the value is missing, empty or
        contains only whitespace.
    """
    # Missing values must not be stringified: str(NaN) is "nan", which would be
    # capitalized to "Nan" and handed on as if it were a real first name.
    if x is None or (isinstance(x, float) and x != x):
        return ""
    x_split = str(x).split()
    # An empty / whitespace-only string yields no tokens; indexing [0] would raise IndexError.
    if not x_split:
        return ""
    fname = x_split[0]
    # remove reference to digits. Now after removal, there could be some misclassification, but that is ok ..
    fname_p = re.sub(r'[0-9]+', "", fname)
    ret_str = fname_p.capitalize()
    return ret_str


In [116]:
# Load the review workbook; the 'Scrubbed_data' sheet is read with REVIEW_DATE as the index.
filename = "data/Master-data_Q42021.xlsx"
text_data_raw = pd.read_excel(
    filename,
    sheet_name='Scrubbed_data',
    index_col='REVIEW_DATE',
)

# Columns that are discarded right after loading
not_needed = [
    'OVERALL_RATING',
    'COMFORT_RATING',
    'VISION_RATING',
    'VALUE_FOR_MONEY',
    'PROS',
    'CONS',
    'ORIGINAL_SOURCE',
    'REPLY_FROM_ACCUVUE',
    'PRODUCT_LINK',
    'WEBSITE',
]

# The raw frame is left untouched; text_data is a new frame without those columns.
text_data = text_data_raw.drop(columns=not_needed)


In [117]:
# Derive a gender label for every review from the author's first name
# (gender_guesser package), then discard the author name itself.
# first_name() stringifies its input, so AUTHOR needs no explicit astype(str).
gdx = gd.Detector()
text_data['GENDER'] = (
    text_data['AUTHOR']
    .apply(first_name)
    .map(gdx.get_gender)
)

# The author column is no longer needed once the label exists
text_data.drop(columns=['AUTHOR'], inplace=True)

# Quick sanity check: how many reviews fall under each label
text_data['GENDER'].value_counts()

female           4003
male             2400
unknown          1556
mostly_female     464
mostly_male       253
andy              118
Name: GENDER, dtype: int64

In [118]:

# Consolidate the review text into a single COMMENT column.
# Review text can occur both in the TITLE and in the COMMENTS columns.
# Missing values are blanked BEFORE the string cast: astype(str) turns NaN into the
# literal text "nan", after which fillna("") has nothing left to fill and the word
# "nan" would end up inside the comment text.
text_data['COMMENT'] = (
    text_data['TITLE'].fillna("").astype(str)
    + " "
    + text_data['COMMENTS'].fillna("").astype(str)
).str.strip()  # a missing title or body must not leave a dangling separator space
text_data.drop(columns=['TITLE', 'COMMENTS'], inplace=True)

In [119]:

# Render the current state of text_data as a rich table for visual inspection
display(text_data)

Unnamed: 0_level_0,FINAL_PRODUCT_NAME,RATING,PRODUCT,BRAND,GENDER,COMMENT
REVIEW_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-11-01,Acuvue 2 Contact Lenses ...,5,Acuvue2,Acuvue,male,Acucue 2 Contact Lenses I have used these lens...
2021-12-02,Acuvue 2 Contact Lenses ...,4,Acuvue2,Acuvue,female,Clear vision Tends to cloud up
2021-12-01,Acuvue 2 Contact Lenses ...,4,Acuvue2,Acuvue,female,comfort These are very hard to handle. Flimsy ...
2021-11-16,Acuvue 2 Contact Lenses ...,5,Acuvue2,Acuvue,male,Easy to use I have been using this product for...
2021-12-08,Acuvue 2 Contact Lenses ...,4,Acuvue2,Acuvue,unknown,Excellent Excellent got promised a discount of...
...,...,...,...,...,...,...
2021-11-29,Acuvue VITA Contact Lenses ...,5,Vita,Acuvue,male,"Truly the Best The price, fast shipping, quic..."
2021-11-23,Shop Acuvue Vita 12 pack (1.0 Box ),5,Vita,Acuvue,male,Very Comfortable Acuvue Vita are very comfort...
2021-12-27,Acuvue Vita for Astigmatism Contact Lenses,5,Vita,Acuvue,male,Very Comfortable and Convenient Very Comfortab...
2021-10-16,Acuvue Vita for Astigmatism Contact Lenses,1,Vita,Acuvue,unknown,"Worst lenses I've worn, microscopic tears in a..."
