<h3>General Imports / Formatting</h3>

In [1]:
import sys
import sklearn
import numpy as py
import pandas as pd

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# total data points (rows) in dataset; hard-coded rather than derived from
# the loaded frame -- TODO confirm it still matches len(gamesData)
TOTAL_ROWS = 83560

# NOTE: numpy is imported under the alias `py` in this notebook (not the usual `np`).
# legacy='1.25' makes numpy print float scalars as plain values (e.g. 3.0)
# instead of the typed repr (e.g. np.float64(3.0))
py.set_printoptions(legacy='1.25')

<h3>Load Data in Pandas Dataframe</h3>
<b>  Includes some minor preprocessing, such as:</b>
<br>     > Target feature swapping
<br>     > NULL acknowledgement
<br>     > Column Swapping
<br>     > >=90% Data Loss Column Dropping
<br>     > ...

In [2]:
def fetchGamesData(path):
    """Load the games dataset from a CSV source.

    Parameters
    ----------
    path : str or file-like
        Passed straight through to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``pd.read_csv`` with its default settings.
    """
    gamesFrame = pd.read_csv(path)
    return gamesFrame

# load the dataset; the relative path is resolved against the notebook's
# current working directory
gamesData = fetchGamesData("games_fixed.csv")

# confirm all features are present in dataset since prints will truncate
print(gamesData.columns.tolist())

# preview the first rows (the cell's displayed output)
gamesData.head()

['AppID', 'Name', 'Release date', 'Estimated owners', 'Peak CCU', 'Required age', 'Price', 'Discount', 'DLC count', 'About the game', 'Supported languages', 'Full audio languages', 'Reviews', 'Header image', 'Website', 'Support url', 'Support email', 'Windows', 'Mac', 'Linux', 'Metacritic score', 'Metacritic url', 'User score', 'Positive', 'Negative', 'Score rank', 'Achievements', 'Recommendations', 'Notes', 'Average playtime forever', 'Average playtime two weeks', 'Median playtime forever', 'Median playtime two weeks', 'Developers', 'Publishers', 'Categories', 'Genres', 'Tags', 'Screenshots', 'Movies']


Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,Discount,DLC count,About the game,...,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags,Screenshots,Movies
0,20200,Galactic Bowling,"Oct 21, 2008",0 - 20000,0,0,19.99,0,0,Galactic Bowling is an exaggerated and stylize...,...,0,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
1,655370,Train Bandit,"Oct 12, 2017",0 - 20000,0,0,0.99,0,0,THE LAW!! Looks to be a showdown atop a train....,...,0,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
2,1732930,Jolt Project,"Nov 17, 2021",0 - 20000,0,0,4.99,0,0,Jolt Project: The army now has a new robotics ...,...,0,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
3,1355720,Henosis™,"Jul 23, 2020",0 - 20000,0,0,5.99,0,0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,...,0,0,0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
4,1139950,Two Weeks in Painland,"Feb 3, 2020",0 - 20000,0,0,0.0,0,0,ABOUT THE GAME Play as a hacker who has arrang...,...,0,0,0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...


In [3]:
# move target feature to end of dataset
def moveTargetToEnd(dataset, target='Estimated owners', newName='Estimate owners'):
    """Move the target column to the last position of ``dataset`` (in place).

    The column is removed from wherever it currently sits and re-inserted
    after all remaining columns under the name ``newName``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to reorder. It is modified in place and also returned.
    target : str, optional
        Name of the target column as it appears in the raw data.
    newName : str, optional
        Name given to the column once moved.
        NOTE(review): the default 'Estimate owners' (no 'd') looks like a
        typo of 'Estimated owners'; it is kept because code later in the
        notebook may already refer to it -- confirm before renaming.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, with the target as its last column.

    Raises
    ------
    KeyError
        If neither ``target`` nor ``newName`` is a column of ``dataset``.
    """
    # When the cell is re-run the column has already been renamed; fall back
    # to the new name so the function is safe to call more than once.
    sourceName = target if target in dataset.columns else newName
    tempCol = dataset.pop(sourceName)

    # Insert after the last remaining column instead of at a hard-coded
    # position, so this works for any number of columns.
    dataset.insert(len(dataset.columns), newName, tempCol)
    return dataset

# moveTargetToEnd edits the frame it receives (pop + insert) and returns that
# same frame, so the loaded data itself is changed by this cell
gamesData = moveTargetToEnd(gamesData)

# preview to check the target now appears as the last column
gamesData.head()

Unnamed: 0,AppID,Name,Release date,Peak CCU,Required age,Price,Discount,DLC count,About the game,Supported languages,...,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags,Screenshots,Movies,Estimate owners
0,20200,Galactic Bowling,"Oct 21, 2008",0,0,19.99,0,0,Galactic Bowling is an exaggerated and stylize...,['English'],...,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...,0 - 20000
1,655370,Train Bandit,"Oct 12, 2017",0,0,0.99,0,0,THE LAW!! Looks to be a showdown atop a train....,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...,0 - 20000
2,1732930,Jolt Project,"Nov 17, 2021",0,0,4.99,0,0,Jolt Project: The army now has a new robotics ...,"['English', 'Portuguese - Brazil']",...,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...,0 - 20000
3,1355720,Henosis™,"Jul 23, 2020",0,0,5.99,0,0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...,0 - 20000
4,1139950,Two Weeks in Painland,"Feb 3, 2020",0,0,0.0,0,0,ABOUT THE GAME Play as a hacker who has arrang...,"['English', 'Spanish - Spain']",...,0,0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...,0 - 20000


In [4]:
# Removes one feature (column) from the dataset when it is present
def dropFeatureCol(dataset, col):
    """Return ``dataset`` without the column ``col``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to drop the column from; it is not modified.
    col : str
        Label of the column to remove.

    Returns
    -------
    pandas.DataFrame
        A new frame lacking ``col`` when the column exists; otherwise the
        very same ``dataset`` object, untouched.
    """
    columnIsMissing = col not in dataset.columns
    if columnIsMissing:
        # nothing to remove -- hand the input straight back
        return dataset

    return dataset.drop(col, axis = 'columns')

# 'Reviews' is ~85% NULL. That is under the 90% cut-off, but close enough
# that we are unlikely to glean anything useful from it.
gamesData = dropFeatureCol(gamesData, 'Reviews')

# 'Website' is ~60% NULL, also under the 90% cut-off. The dataset covers a
# wide spread of publishers (indie and company alike), and when a publisher
# has no website there is nothing to substitute for the missing value, so
# the column is dropped instead of being filled in.
gamesData = dropFeatureCol(gamesData, 'Website')

# Same idea as 'Website': these are links into a website (usually with a
# '/contact' or '/support' argument), again spread widely with no way to
# replace the NULL entries.
gamesData = dropFeatureCol(gamesData, 'Support url')

# Print the full column list to confirm the appropriate column(s) were
# dropped, since they may sit inside the truncated section of the dataframe
# preview.
print(gamesData.columns.tolist())

gamesData.head()

['AppID', 'Name', 'Release date', 'Peak CCU', 'Required age', 'Price', 'Discount', 'DLC count', 'About the game', 'Supported languages', 'Full audio languages', 'Header image', 'Support email', 'Windows', 'Mac', 'Linux', 'Metacritic score', 'Metacritic url', 'User score', 'Positive', 'Negative', 'Score rank', 'Achievements', 'Recommendations', 'Notes', 'Average playtime forever', 'Average playtime two weeks', 'Median playtime forever', 'Median playtime two weeks', 'Developers', 'Publishers', 'Categories', 'Genres', 'Tags', 'Screenshots', 'Movies', 'Estimate owners']


Unnamed: 0,AppID,Name,Release date,Peak CCU,Required age,Price,Discount,DLC count,About the game,Supported languages,...,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags,Screenshots,Movies,Estimate owners
0,20200,Galactic Bowling,"Oct 21, 2008",0,0,19.99,0,0,Galactic Bowling is an exaggerated and stylize...,['English'],...,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...,0 - 20000
1,655370,Train Bandit,"Oct 12, 2017",0,0,0.99,0,0,THE LAW!! Looks to be a showdown atop a train....,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...,0 - 20000
2,1732930,Jolt Project,"Nov 17, 2021",0,0,4.99,0,0,Jolt Project: The army now has a new robotics ...,"['English', 'Portuguese - Brazil']",...,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...,0 - 20000
3,1355720,Henosis™,"Jul 23, 2020",0,0,5.99,0,0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...,0 - 20000
4,1139950,Two Weeks in Painland,"Feb 3, 2020",0,0,0.0,0,0,ABOUT THE GAME Play as a hacker who has arrang...,"['English', 'Spanish - Spain']",...,0,0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...,0 - 20000


<h3>NLP</h3>
<b>This section will work to convert the "About the game" column into normalized language, allowing our machine to better pick up trends</b>

In [5]:
# install the NLP packages into the environment of the running kernel
# NOTE(review): versions are not pinned, so results may differ between
# machines -- consider pinning (e.g. `%pip install nltk==<version>`)
%pip install nltk
%pip install sentence-transformers
%pip install tf-keras
import nltk
# download the NLTK data packages requested below: 'punkt_tab', 'wordnet'
# and 'stopwords' (presumably backing the tokenizer, lemmatizer and stopword
# list imported from nltk in the next cell -- confirm)
nltk.download('punkt_tab') 
nltk.download('wordnet')
nltk.download('stopwords')

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\rscal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rscal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rscal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# specific functions needed from nltk
from nltk.tokenize import word_tokenize as tokenize # tokenizing has never been easier
from nltk.stem import WordNetLemmatizer as lemmatizer # simplify words 'running' -> 'run' etc.
from nltk.corpus import stopwords # words like 'is', 'and', etc
# from sentence_transformers import SentenceTransformer

# description - gamesData['About the game'][i]

# Stopword sets already loaded, keyed by language (None = every language
# in the stopwords corpus), so the word lists are read once, not per token.
_STOPWORD_CACHE = {}

# end-of-sentence markers used to split a description into sentences
_SENTENCE_SEPARATORS = frozenset({'!', '.', '?'})


def _stopwordSet(language=None):
    """Return the cached stopword set for ``language`` (None = all languages)."""
    if language not in _STOPWORD_CACHE:
        words = stopwords.words() if language is None else stopwords.words(language)
        _STOPWORD_CACHE[language] = frozenset(words)
    return _STOPWORD_CACHE[language]


def preprocessDescs(description, language=None, keepLemmas=False):
    """Split one description into sentences with stopwords removed.

    The text is tokenized, every token whose lemma is a stopword is
    discarded, and the surviving tokens are regrouped into sentences at the
    markers '!', '.' and '?'. The markers themselves are not emitted.

    Parameters
    ----------
    description : str
        Raw text, e.g. one entry of gamesData['About the game'].
        Anything that is not a string (None, or NaN from a NULL cell)
        produces an empty list instead of an error.
    language : str or None, optional
        Stopword list to filter with, e.g. 'english'. The default None uses
        the stopwords of every available language, as the notebook did
        originally.
    keepLemmas : bool, optional
        False (default) emits the original tokens; the lemma is used only
        for the stopword check. True emits the lemmatized tokens instead.
        NOTE(review): the section's goal is "normalized language", so True
        may be the intended behaviour -- confirm.

    Returns
    -------
    list of str
        One string per non-empty sentence, in reading order, with tokens
        joined by single spaces. No padding entries are added.
    """
    if not isinstance(description, str):
        return []

    stopwordSet = _stopwordSet(language)
    # one lemmatizer for the whole description instead of one per token
    wordLemmatizer = lemmatizer()

    # NOTE(review): the stopword match is case-sensitive, so a capitalised
    # token such as 'The' is only removed if the stopword list contains that
    # exact spelling -- confirm whether lowercasing is wanted.
    keptTokens = []
    for token in tokenize(description):
        lemma = wordLemmatizer.lemmatize(token)
        if lemma not in stopwordSet:
            keptTokens.append(lemma if keepLemmas else token)

    # Rebuild sentences: a separator closes the sentence collected so far.
    # Empty sentences (e.g. from '!!') are skipped.
    sentences = []
    currentSentence = []
    for token in keptTokens:
        if token in _SENTENCE_SEPARATORS:
            if currentSentence:
                sentences.append(' '.join(currentSentence))
                currentSentence = []
        else:
            currentSentence.append(token)

    # text that ends without a separator still forms a final sentence
    if currentSentence:
        sentences.append(' '.join(currentSentence))

    return sentences

# TODO: loop preprocessDescs over every description (TOTAL_ROWS times);
# for now only the first row is processed as a quick check.

# show the raw description followed by its preprocessed form
print(gamesData['About the game'][0])
newDescription = preprocessDescs(gamesData['About the game'][0])
print(newDescription)


Galactic Bowling is an exaggerated and stylized bowling game with an intergalactic twist. Players will engage in fast-paced single and multi-player competition while being submerged in a unique new universe filled with over-the-top humor, wild characters, unique levels, and addictive game play. The title is aimed at players of all ages and skill sets. Through accessible and intuitive controls and game-play, Galactic Bowling allows you to jump right into the action. A single-player campaign and online play allow you to work your way up the ranks of the Galactic Bowling League! Whether you have hours to play or only a few minutes, Galactic Bowling is a fast paced and entertaining experience that will leave you wanting more! Full Single-player story campaign including 11 Characters and Environments. 2 Single-player play modes including Regular and Battle Modes. Head to Head Online Multiplayer play Modes. Super Powers, Special Balls, and Whammies. Unlockable Characters, Environments, and M