## Import Dependencies

In [89]:
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import time as time

## Read Comments
## Drop Unnessessary Column
## Use Ceiling function to Return the Rating as an Integer

In [2]:
comments = pd.read_csv('../data/large_sample.csv')
comments.drop(columns=['Unnamed: 0'], inplace=True)
comments.rating = np.ceil(comments.rating)

In [4]:
comments.head()

Unnamed: 0,commenter,gameID,rating,comment
0,172640,24068,7.0,Good: Unique take on the hidden role games. T...
1,86674,24068,7.0,A neat social deduction game with multiple tea...
2,10643,24068,7.0,Good hidden roles werewolf style game that can...
3,31171,24068,7.0,"Overall I hate Mafia/Werewolf, but this versio..."
4,165608,24068,7.0,Fun social deduction exercise that gets merrie...


## Remove Punctuation
## Remove Uppercase

In [7]:
## return a string in lower case
def make_it_lower(comment):
    new_comment = []
    for x in comment.split(' '):
        new_comment.append(x.lower())
    return ' '.join(new_comment)

## remove punctuation
def remove_punctuation(comment):
    table = str.maketrans(dict.fromkeys('@#$%^&*()_+=~`[{}]|\:;"<,>."/'))
    return comment.translate(table)

## replace punctuation
def replace_exclimation(comment):
    return comment.replace('!', ' <!> ').replace('?', ' <?> ')

In [8]:
comments['lower_comment'] = comments.comment.apply(lambda x: make_it_lower(x))
comments.drop(columns=['comment'], inplace=True)
comments.rename(columns={'lower_comment':'comment'}, inplace=True)
comments['remove_punctuation'] = comments.comment.apply(lambda x: remove_punctuation(x))
comments.drop(columns=['comment'], inplace=True)
comments.rename(columns={'remove_punctuation':'comment'}, inplace=True)
comments['replace_punctuation'] = comments.comment.apply(lambda x: replace_exclimation(x))
comments.drop(columns=['comment'], inplace=True)
comments.rename(columns={'replace_punctuation':'comment'}, inplace=True)

In [10]:
comments.head()

Unnamed: 0,commenter,gameID,rating,comment
0,172640,24068,7.0,good unique take on the hidden role games the...
1,86674,24068,7.0,a neat social deduction game with multiple tea...
2,10643,24068,7.0,good hidden roles werewolf style game that can...
3,31171,24068,7.0,overall i hate mafiawerewolf but this version ...
4,165608,24068,7.0,fun social deduction exercise that gets merrie...


## Drop Commenter and gameID

In [11]:
comments.drop(columns=['commenter', 'gameID'], inplace=True)

## Create a Dictionary of every word with its frequency

In [21]:
def create_count(comment):
    for x in comment.split(' '):
        if x in word_dictionary:
            word_dictionary[x] += 1
        else:
            word_dictionary[x] = 1
    return word_dictionary

In [33]:
%%time
word_dictionary = dict()
comments.comment.apply(create_count)

CPU times: user 13.4 s, sys: 525 ms, total: 14 s
Wall time: 14 s


In [36]:
word_dictionary

{'good': 142106,
 '': 1268570,
 'unique': 10304,
 'take': 31312,
 'on': 173501,
 'the': 1560992,
 'hidden': 6067,
 'role': 7607,
 'games': 143376,
 'and': 833047,
 'evil': 1586,
 'team': 4861,
 'win': 27673,
 'if': 112738,
 'they': 48130,
 'eliminate': 494,
 'each': 41510,
 'other': 65740,
 'where': 31856,
 'neutral': 631,
 'has': 81277,
 'objectives': 2148,
 'depending': 4230,
 'what': 68003,
 'was': 98404,
 'dealt': 1161,
 'bad': 24962,
 'component': 2621,
 'quality': 9785,
 'is': 549280,
 'lower': 2896,
 'end': 28146,
 'artwork': 13068,
 'lackluster': 329,
 'card': 61041,
 'stock': 2231,
 'flimsy': 419,
 'board': 45709,
 'design': 15571,
 'uninspired': 274,
 'playing': 79252,
 'this': 463479,
 'entertaining': 4234,
 'but': 374843,
 'its': 45123,
 'tough': 7061,
 'getting': 18428,
 'folks': 1971,
 'interested': 4969,
 'when': 70744,
 'newer': 1950,
 'have': 176728,
 'a': 999023,
 'much': 108750,
 'better': 64119,
 'graphical': 145,
 'presentation': 1090,
 'neat': 5886,
 'social': 460

In [37]:
len(word_dictionary)

246987

In [43]:
more_than_ten = {k: v for k, v in word_dictionary.items() if v > 10}
ten_or_less = {k: v for k, v in word_dictionary.items() if v <=10}

## Drop words that appear 10 or less times

In [40]:
print(len(more_than_ten))
print(len(ten_or_less))

27372
219615


In [41]:
%%time
sortedList=sorted(more_than_ten.values())
sorted_dict = dict()
for sortedKey in sortedList:
    for key, value in more_than_ten.items():
        if value==sortedKey:
            sorted_dict[key]=value

CPU times: user 2min, sys: 227 ms, total: 2min 1s
Wall time: 2min 1s


In [42]:
array_of_words = []
for key, value in sorted_dict.items():
    temp = [key,value]
    array_of_words.append(temp)

In [50]:
len(array_of_words)

27372

## Make reports of the frequency each word has in comments of different ratings

In [48]:
def word_count(comment, word, number):
    if word in comment.split(' '):
        if word in number:
            number[word] += comment.split(' ').count(word)
        else:
            number[word] = comment.split(' ').count(word)

In [92]:
def make_report(w, z, dictionary, dataframe):
    dict_list = dictionary
    testing_df = dataframe.copy()
    labels = ['word', 'average/std', 'percent_average', 'percent_std', 'rating', 'word_count', 'divisor', 'percent', 'time', 'type']
    to_return = pd.DataFrame(columns=labels)
    for word in dict_list[int(w):int(z)]:
        start_time = time.time()
        to_append = []
        std = 0
        average = 0
        for y in range(1,11):
            number = dict()
            testing_df.loc[testing_df.rating == y].comment.apply(word_count, word=word[0], number=number)
            divisor = testing_df.loc[testing_df.rating == y].comment.count()
            if word[0] in number:
                percent = number[word[0]]/divisor
            else:
                percent = 0
                number[word[0]] = 0
            to_append.append([word[0], None, None, None, y, number[word[0]], divisor, percent, None, 0])
        for_array = []
        for value in to_append:
            for_array.append(value[7])
        x = np.array(for_array)
        array1 = [[word[0], (np.sum(x)/10)/(np.std(x)), np.sum(x)/10, np.std(x), None, None, None, None, time.time()-start_time, 1]]
        for x in to_append:
            array1.append(x)
        inbetween_df = pd.DataFrame.from_records(array1, columns=labels)
        to_return = pd.concat([to_return, inbetween_df])
    return to_return

In [51]:
def write_csv(low, high, df, count):
    df.to_csv('new_report_for_{}_{}_to_{}.csv'.format(str(count).zfill(2), low, high))

In [52]:
def loop_write_report(dictionary, dataframe, number, count):
    num = int(number)
    size = int(count)
    first = 0
    counter = 0
    while first < num - 1:
        counter += 1
        if first + size < num - 2:
            report = make_report(first, first + size, dictionary, dataframe)
            write_csv(first, first + size, report, counter)
            print('dataframe written from {} to {}'.format(first, first + size))
            first = first + size + 1
        else:
            report = make_report(first, num - 1, dictionary, dataframe)
            write_csv(first, num - 1, report, counter)
            print('dataframe written from {} to {}'.format(first, num - 1))
            print('last dataframe')
            break

In [87]:
def loop_combine_reports_delete_old_reports(dictionary, dataframe, number, count):
    loop_write_report(dictionary, dataframe, number, count)
    glob_reports()    

In [74]:
def glob_reports():
    counter = 0
    df = pd.DataFrame()
    for report in glob.glob('new_report_for_*'):
        to_add = pd.read_csv(report)
        df = pd.concat([df, to_add])
        os.remove(report)
    df.to_csv('to_delete.csv')
        

In [94]:
len(array_of_words)

27372

In [None]:
loop_combine_reports_delete_old_reports(dictionary=array_of_words, dataframe=comments, number=len(array_of_words), count=100)

dataframe written from 0 to 100
dataframe written from 101 to 201
dataframe written from 202 to 302
dataframe written from 303 to 403


In [75]:
array_of_words[-10:]

[['is', 549280],
 ['it', 632237],
 ['i', 680129],
 ['of', 725069],
 ['and', 833047],
 ['game', 927618],
 ['to', 945946],
 ['a', 999023],
 ['', 1268570],
 ['the', 1560992]]

In [84]:
testing_df = comments.loc[comments.comment.apply(lambda x: len(x.split(' '))<10)]

In [85]:
len(testing_df)

142626