# Yelp Dataset Processing

In [8]:
import os
import re 
import csv
import nltk
import sklearn
import numpy as np 
import pandas as pd 
from nltk.corpus import names
import matplotlib.pyplot as plt

### Helper methods

In [3]:
def tokenize_reviews(reviews): 
    reviews = [re.sub(r"[\n\(\)\/\\]", " ", r) for r in reviews]
    reviews = [re.sub(r"([.,!?()])", r" \1 ", r) for r in reviews]
    #word tokenize
    tokenized_reviews = [nltk.tokenize.word_tokenize(r) for r in reviews]
    return tokenized_reviews

def binarize_reviews(ratings): 
    # split into binary pos vs neg reviews
    rating = [0 if r < 4 else 1 for r in ratings]
    return rating

In [4]:
# Read Data
review_df = pd.read_csv('datasets/yelp_review.csv')
user_df = pd.read_csv('datasets/yelp_user.csv')

In [5]:
# Load names
male_names = names.words('male.txt')
male_names = [n.lower() for n in male_names]
female_names = names.words('female.txt')
female_names = [n.lower() for n in female_names]
intersection_names = set(male_names).intersection(set(female_names))
print("number of male names: ", len(male_names))
print("number of female names: ", len(female_names))
print("number of male and female names: ", len(intersection_names))

number of male names:  2943
number of female names:  5001
number of male and female names:  365


In [6]:
user_cdf = pd.DataFrame({'user_id': user_df['user_id'], 'name': user_df['name'], 'review_count': user_df['review_count'], 'average_stars': user_df['average_stars']})
user_cdf['gender'] = pd.Series([])
names = user_cdf['name']
print(user_cdf.keys())
gender = [] 
count_arr = [0, 0, 0, 0]
for name in names: 
    if isinstance(name, str): 
        name = name.lower()
    if name in intersection_names: 
        count_arr[3] += 1 
        gender.append('mf')
    elif name in male_names: 
        count_arr[0] += 1 
        gender.append('m')
    elif name in female_names:
        count_arr[1] += 1
        gender.append('f')
    else: 
        count_arr[2] += 1
        gender.append('na')
user_cdf['gender'] = pd.Series(gender)

Index(['average_stars', 'name', 'review_count', 'user_id', 'gender'], dtype='object')


In [11]:
# build userid to gender dictionary
id2gender = {} 
for ind, row in user_cdf.iterrows(): 
    id2gender[row['user_id']] = row['gender']

#match review with gender and save
male_texts = [] 
male_labels = []
female_texts = [] 
female_labels = [] 
with open('datasets/yelp.csv', 'w') as f: 
    writer = csv.writer(f)
    for ind, row in review_df.iterrows(): 
        try: 
            if id2gender[row['user_id']] == 'f': 
                female_texts.append(row['text'])
                female_labels.append(row['stars'])
                writer.writerow([row['text'], 'f', row['stars']])
            elif id2gender[row['user_id']] == 'm':
                male_texts.append(row['text'])
                male_labels.append(row['stars'])
                writer.writerow([row['text'], 'm', row['stars']])
        except KeyError: 
            pass 

In [13]:
# Save a smaller version
gender_review_df = pd.read_csv('datasets/yelp.csv')
sm_gender_review_df = gender_review_df.head(100)
sm_gender_review_df.to_csv('datasets/yelp100.csv')

# Analysis

In [25]:
user_cdf.groupby('gender').mean()

Unnamed: 0_level_0,average_stars,review_count
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
f,3.780351,24.171562
m,3.682556,22.437122
mf,3.736344,22.667431
na,3.619788,22.616181


In [40]:
processed_male = tokenize_reviews(male_texts[:10000])
processed_female = tokenize_reviews(female_texts[:10000])

In [42]:
male_blabels = binarize_reviews(male_labels[:10000])
female_blabels = binarize_reviews(female_labels[:10000])

In [94]:
mixed_reviews = tokenize_reviews(reviews[-20000:])
mixed_blabels = binarize_reviews(ratings[-20000:])

In [95]:
vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    preprocessor=lambda x: x, tokenizer=lambda x: x, max_features=5000)
x_vectors = vectorizer.fit_transform(mixed_reviews)
clf = sklearn.svm.SVC()
clf.fit(x_vectors[:16000], mixed_blabels[:16000])
predictions = clf.predict(x_vectors[16000:])
acc = np.sum(predictions == mixed_blabels[16000:])/len(predictions)
print(acc)

0.80825


In [96]:
x_vectors = vectorizer.fit_transform(processed_male)
clf = sklearn.svm.SVC()
clf.fit(x_vectors, male_blabels)
predictions = clf.predict(x_vectors)
acc = np.sum(predictions == male_blabels)/len(predictions)
print(acc)

0.7427


In [97]:
x_vectors = vectorizer.fit_transform(processed_female)
clf = sklearn.svm.SVC()
clf.fit(x_vectors, female_blabels)
predictions = clf.predict(x_vectors)
acc = np.sum(predictions == female_blabels)/len(predictions)
print(acc)

0.787
