# Dataset Processing

In [1]:
import os
import re 
import csv
import nltk
import sklearn
import preprocess
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

In [22]:
%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Yelp

In [51]:
# Read Data
review_df = pd.read_csv('datasets/yelp_review.csv')
user_df = pd.read_csv('datasets/yelp_user.csv')

In [52]:
male_names, female_names, intersection_names = preprocess.load_names()
user_cdf = pd.DataFrame({'user_id': user_df['user_id'], 'name': user_df['name'], 'review_count': user_df['review_count'], 'average_stars': user_df['average_stars']})
user_cdf['gender'] = pd.Series([])
names = user_cdf['name']
print(user_cdf.keys())
gender = [] 
count_arr = [0, 0, 0, 0]
for name in names: 
    if isinstance(name, str): 
        name = name.lower()
    if name in intersection_names: 
        count_arr[3] += 1 
        gender.append('mf')
    elif name in male_names: 
        count_arr[0] += 1 
        gender.append('m')
    elif name in female_names:
        count_arr[1] += 1
        gender.append('f')
    else: 
        count_arr[2] += 1
        gender.append('na')
user_cdf['gender'] = pd.Series(gender)

number of male names:  2943
number of female names:  5001
number of male and female names:  365
Index(['average_stars', 'name', 'review_count', 'user_id', 'gender'], dtype='object')


In [58]:
# build userid to gender dictionary
id2gender = {} 
for ind, row in user_cdf.iterrows(): 
    id2gender[row['user_id']] = row['gender']

In [63]:
#match review with gender and save
male_texts = [] 
male_labels = []
female_texts = [] 
female_labels = [] 
with open('datasets/yelp.csv', 'w') as f: 
    writer = csv.writer(f)
    for ind, row in review_df.iterrows(): 
        try: 
            if id2gender[row['user_id']] == 'f': 
                process_line = preprocess.tokenize_reviews([row['text']])[0]
                star = 1 if row['stars'] > 3 else 0 
                female_texts.append(process_line)
                female_labels.append(row['stars'])
                writer.writerow([" ".join(process_line), 'f', star])
            elif id2gender[row['user_id']] == 'm':
                process_line = preprocess.tokenize_reviews([row['text']])[0]
                male_texts.append(process_line)
                star = 1 if row['stars'] > 3 else 0 
                male_labels.append(row['stars'])
                writer.writerow([" ".join(process_line), 'm', star])
        except KeyError: 
            pass 

In [8]:
# Save a smaller version
gender_review_df = pd.read_csv('datasets/yelp.csv')
sm_gender_review_df = gender_review_df.head(1000)
sm_gender_review_df.to_csv('datasets/yelp1000.csv', index=False)

### E-Commerce

In [2]:
review_df = pd.read_csv('datasets/clothing_raw.csv')
review_df.keys()

Index(['Unnamed: 0', 'Clothing ID', 'Age', 'Title', 'Review Text', 'Rating',
       'Recommended IND', 'Positive Feedback Count', 'Division Name',
       'Department Name', 'Class Name'],
      dtype='object')

In [7]:
user_cdf = pd.DataFrame()

# binarize rating
ratings = review_df['Rating'].tolist()
bratings = preprocess.binarize_reviews(ratings)
user_cdf['rating'] = pd.Series(bratings)

# preprocess text 
review_text = review_df['Review Text'].tolist() 
print(review_text[0])
review_text = ["" if not isinstance(t, str) else t for t in review_text] 
        
processed_text = preprocess.tokenize_reviews(review_text)
processed_text = [" ".join(p) for p in processed_text]
user_cdf['text'] = pd.Series(processed_text)

age_list = review_df['Age'].tolist() 
converted_age = [] 
for a in age_list:
    if a < 35: 
        converted_age.append('y')
    elif a < 55 and a >= 35: 
        converted_age.append('m')
    else: 
        converted_age.append('o')
user_cdf['age'] = pd.Series(converted_age)

user_cdf.to_csv("datasets/clothing.csv", header=True, columns=['text', 'age', 'rating'], index=False)
user_cdf.head(1000).to_csv('datasets/clothing1000.csv', header=True, 
                           columns=['text', 'age', 'rating'], index=False)

Absolutely wonderful - silky and sexy and comfortable


## Movies

In [3]:
texts = [] 
scores = [] 
names = [] 
with open('datasets/movies.txt', 'r', encoding='latin1') as f: 
    for line in f:
        if len(texts) > 2000000:
            break 
        elif line.startswith("review/score:"): 
            scores.append(float(line.lstrip("review/score: ")))
        elif line.startswith("review/profileName:"): 
            names.append(line.lstrip("review/profileName: "))
        elif line.startswith("review/text:"): 
            texts.append(line.lstrip("review/text:"))

In [19]:
male_names, female_names, intersection_names = preprocess.load_names()
counts = [0, 0, 0]
save_list = [] 
for i, n in enumerate(names): 
    try: 
        target_name = n.split()[0].lower()
    except: 
        pass 
    if target_name in intersection_names: 
        counts[2] += 1 
    elif target_name in male_names: 
        counts[0] += 1
        save_list.append((texts[i], 'm', scores[i]))
    elif target_name in female_names: 
        counts[1] += 1 
        save_list.append((texts[i], 'f', scores[i]))
print(counts)
print(male_names[:10])

number of male names:  2943
number of female names:  5001
number of male and female names:  365
[562310, 236385, 140148]
['aamir', 'aaron', 'abbey', 'abbie', 'abbot', 'abbott', 'abby', 'abdel', 'abdul', 'abdulkarim']


In [28]:
# tokenize and binarize
sm_texts = [t for (t, g, l) in save_list]
proccessed_texts = preprocess.tokenize_reviews(sm_texts)

In [49]:
proccessed_texts = [" ".join(t) for t in proccessed_texts]

In [32]:
labels = [l for (t, g, l) in save_list]
blabels = preprocess.binarize_reviews(labels)

In [50]:
gender = [g for (t, g, l) in save_list]
user_cdf = pd.DataFrame({'text': pd.Series(proccessed_texts), 
                         'gender': pd.Series(gender), 'rating': pd.Series(blabels)})
user_cdf.to_csv('datasets/movies.csv', header=True, columns=['text', 'gender', 'rating'], 
                index=False)

user_cdf.head(1000).to_csv('datasets/movies1000.csv', header=True, 
                           columns=['text', 'gender', 'rating'], index=False)

In [47]:
# top stats
mini_df = pd.read_csv('datasets/movies1000.csv')
print(mini_df['gender'].value_counts()) 
mini_df = pd.read_csv('datasets/yelp1000.csv', names=['text', 'gender', 'age'])
print(mini_df['gender'].value_counts()) 
mini_df = pd.read_csv('datasets/clothing1000.csv')
print(mini_df['age'].value_counts()) 

m    745
f    255
Name: gender, dtype: int64
f    676
m    325
Name: gender, dtype: int64
m    566
y    240
o    194
Name: age, dtype: int64


# Analysis

In [None]:
user_cdf.groupby('gender').mean()

In [None]:
processed_male = tokenize_reviews(male_texts[:10000])
processed_female = tokenize_reviews(female_texts[:10000])

In [None]:
male_blabels = binarize_reviews(male_labels[:10000])
female_blabels = binarize_reviews(female_labels[:10000])

In [None]:
mixed_reviews = tokenize_reviews(reviews[-20000:])
mixed_blabels = binarize_reviews(ratings[-20000:])

In [None]:
vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    preprocessor=lambda x: x, tokenizer=lambda x: x, max_features=5000)
x_vectors = vectorizer.fit_transform(mixed_reviews)
clf = sklearn.svm.SVC()
clf.fit(x_vectors[:16000], mixed_blabels[:16000])
predictions = clf.predict(x_vectors[16000:])
acc = np.sum(predictions == mixed_blabels[16000:])/len(predictions)
print(acc)

In [None]:
x_vectors = vectorizer.fit_transform(processed_male)
clf = sklearn.svm.SVC()
clf.fit(x_vectors, male_blabels)
predictions = clf.predict(x_vectors)
acc = np.sum(predictions == male_blabels)/len(predictions)
print(acc)

In [None]:
x_vectors = vectorizer.fit_transform(processed_female)
clf = sklearn.svm.SVC()
clf.fit(x_vectors, female_blabels)
predictions = clf.predict(x_vectors)
acc = np.sum(predictions == female_blabels)/len(predictions)
print(acc)