### Word frequency count to do list
1. Debug sorting and dropping duplicates at the same time. 
2. Figure out a way to visualise the word frequency
    Scatterplot? Word cloud certainly, but duplicates need to be sorted out before this would be meaningful. 
3. Potential to differentiate between positive reviews and negative reviews, using user_suggestion (1 = positive, 0 = negative)

In [29]:
import requests
import urllib.request
import os
import re
import os.path
import sys
import nltk
from nltk import word_tokenize , sent_tokenize , pos_tag
from tdmh import *
import numpy
import matplotlib.pyplot as plt
import sys

In [2]:
import requests
import re

steam_reviews = {
    'https://store.steampowered.com/appreviews/413150?json=1' : 
        'SteamReviewExample'
}

dir = 'Reviews'
if not os.path.exists(dir):
    os.mkdir(dir)
for url in steam_reviews:
    print("Downloading " + steam_reviews[url] + " ...")
    response = requests.get(url)
    title = re.sub( r'\s+' , '_' , steam_reviews[url])
    if response: 
        response.encoding = 'utf-8'
        full_text = remove_pg_boilerplate(response.text)
        path = os.path.join( dir , f'{title}.txt')
        out = open(path , 'w' , encoding = 'utf-8')
        
        out.write( full_text.strip())
        out.close()
        
        print('Done')

Downloading SteamReviewTest ...
Done


## Create a list of all the files in the corpus

In [3]:

import os
from os.path import isfile , join
import pandas as pd
texts = []
dir = 'Corpus' 

for file in os.listdir(dir):
    if re.search( r'csv$' , file ):
        path = os.path.join( dir , file )
        texts.append(path)




In [4]:
## create text analysis functions
def get_title(path):
    title = os.path.basename(path)
    if re.search( r'csv$' , title ):
        # Remove csv extension
        title = title[ :title.index('.csv') ]
        # remove commas and dots
        title = re.sub( r'[.,]' , '' , title )
    return title


## create data for all  the texts in the corpus

In [5]:
out = open( 'data.csv' , 'w' , encoding = 'utf-8' )

pos_tags = ['JJ' , 'MD' , 'JJR' , 'JJS' , 'VBD']

## Header of the CSV file
out.write('title,tokens,sentences,ttr')

for t in pos_tags:
    out.write(f',{t}')
out.write('\n')

for text in texts:
    
    data = dict()
    print( f'Analysing {text} ...')
    
    ## Get the title, based on the filename
    title = get_title( text )
    
    ## read the full text
    fh = open( text, encoding = 'utf-8')
    full_text = fh.read()
    
    ## count the number of sentences
    sentences = sent_tokenize(full_text)
    data['nr_sentences'] = len(sentences)
    
    # dictionary to count the POS tags
    freq_pos = dict()    
    
    # variables for the calculation of type-token ratio
    ttr_cap = 3000
    freq_ttr = dict()
 
    # token count is initalised at 0
    data['nr_tokens'] = 0

    for s in sentences:
        words = word_tokenize(s)
        words = remove_punctuation(words)
        
        tags = pos_tag(words)
        # Each tag consists of two values: 
        # [0]: the word and [1] the POS tag
        for word_tag in tags:
            word = word_tag[0]
            tag = word_tag[1]
            
            # count the tokens
            data['nr_tokens'] += 1
            
            # place tokens in dictionary freq_ttr
            # only if the word count is less than ttr_cap
            # The nr of items in the dictionary eventually equals the nr of types
            if data['nr_tokens'] <= ttr_cap:
                freq_ttr[ word ] = freq_ttr.get( word , 0 ) + 1
                
            ## Count frequencies of all the POS tags
            freq_pos[ tag ] = freq_pos.get( tag ,0) +1
            
    pos_tags = ['JJ' , 'MD' , 'JJR' , 'JJS' , 'VBD']
    for t in pos_tags:
        data[t] = freq_pos.get(t,0)
                
    # Calculate TTR: number of items in freq_ttr dictionary
    # divided by ttr_cap
    data['ttr'] = len( freq_ttr ) / ttr_cap
    
    # write the results to a CSV file    
    out.write( f"{title},{data['nr_tokens']},{data['nr_sentences']},{data['ttr']}" )
    for t in pos_tags:
        out.write( f",{data[t] / data['nr_tokens'] }"  )
    out.write('\n')
    
out.close()
print('Done!')

Analysing Corpus\steamreviews_all.csv ...
Done!


### Word Frequency

Firstly, I want to see which words feature the most per game, with each csv file representing a single video game on Steam. 

For some reason when running this code with multiple csv files, it gives the Key error: 'recommended'. I have no idea what this means. 

In [6]:
import os
from os.path import join
from nltk.corpus import stopwords
import pandas as pd
import csv 
import seaborn as sns

# Excluding stopwords.
stopwords = stopwords.words('english')
#Creating a csv file to save these results to. 
out = open( 'wordfreqSR.csv' , 'w' , encoding = 'utf-8' )

#Adding this function for later use. 
#Header for this csv file, this does not work for some reason. 


for text in texts:
    print( f'Analysing {text} ...')
    #out.write('Title,Word,Frequency')
    #out.write('\n')
    #Make a dictionary containing most frequent words. 
    freq = dict()
    
    #Get the title, based on the filename
    title = get_title( text )
    
    #Make sure the relevant column is read.
    us = pd.read_csv(text , usecols = ['user_review'] )
    
    #Converting user_review column into a list
    uslist = list(us['user_review'])
    
    #Converting the list into string.
    usstr = ' '.join(str(e) for e in uslist)
    
    #Tokenize!
    words = word_tokenize(usstr)
    words = remove_punctuation(words)
    
    #And now for the frequency analysis
    out = open( 'wordfreqSR.csv' , 'w' , encoding = 'utf-8' )
    for w in words: 
        w = w.lower()
        if w not in stopwords:
            freq[w] = freq.get( w, 0) + 1
        for f in freq: 
            word = 'occurrence'
            if freq[f] > 1:
                word+= 's'
        #Note: Keep the print commented out, as I accidentally broke this file by adding over 5 million lines of output. 
        #print( f' "{f}" => {freq[f]} {word}')
        #Save results of analysis to CSV file that was created earlier. Note: It also takes foreign characters as words. 
        out.write( f"{title},{f},{freq[f]}" )
        out.write('\n')
#out.close()
print('Done!')
        


Analysing Corpus\steamreviews_all.csv ...
Done!


In [23]:
#Since adding a header has not worked in the previous cell, this cell serves that purpose.
#Read the original file
file = pd.read_csv("wordfreqSR.csv")

#Create a header list, and add it to the csv file. 
headerList = ['Title', 'Word', 'Occurence']
file.to_csv('wordfreqSR.csv', header=headerList, index=False)

file2.sort_values(['Occurence'], axis=0, ascending=False, inplace=True)
#Doesn't work for some reason.
file2.drop_duplicates(subset='Word')
print(file2)


                   Title        Word  Occurence
43797   steamreviews_all      pootis       1143
43820   steamreviews_all      pootis       1143
43819   steamreviews_all      pootis       1143
43817   steamreviews_all      pootis       1143
43816   steamreviews_all      pootis       1143
...                  ...         ...        ...
225279  steamreviews_all  activision          1
225256  steamreviews_all  activision          1
225257  steamreviews_all  activision          1
225258  steamreviews_all  activision          1
227424  steamreviews_all   multitude          1

[227425 rows x 3 columns]


In [33]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style = 'white')
sns.scatterplot(data = file2, x = "Word" , y = "Occurence")
plt.show()

  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)


RuntimeError: In draw_glyphs_to_bitmap: Could not convert glyph to bitmap (error code 0x62)

<Figure size 432x288 with 1 Axes>