##Read Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('/content/drive/MyDrive/text_analytics/Text_Analytics_Project/games_review.csv')
df.head()

Unnamed: 0,Title,Abstract,Description,Author,Publish Date,URL
0,Batman: Arkham Shadow Review - I Am Batman,Camouflaj is no stranger to superhero VR games...,It's hard to believe it's been nearly a decade...,Mark Delaney,2024-10-28,https://www.gamespot.com/reviews/batman-arkham...
1,Redacted Review - Prison Break,"With a colorful art style and roguelike hooks,...",You wouldn't know just from looking at it--wit...,Richard Wakeling,2024-10-28,https://www.gamespot.com/reviews/redacted-revi...
2,Call Of Duty: Black Ops 6 Campaign Review,Treyarch's latest entry into the Call of Duty ...,Call of Duty: Black Ops 2remains my favorite o...,Phil Hornshaw,2024-10-28,https://www.gamespot.com/reviews/call-of-duty-...
3,Dragon Age: The Veilguard Review In Progress -...,The Veilguard feels like a return to form for ...,Each new entry in the Dragon Age series is alw...,Jordan Ramée,2024-10-28,https://www.gamespot.com/reviews/dragon-age-th...
4,A Quiet Place: The Road Ahead Review - Quite A...,"As noiselessly as a survivor in its world, A Q...",A Quiet Place has quickly grown into one of th...,Mark Delaney,2024-10-21,https://www.gamespot.com/reviews/a-quiet-place...


##Data Infomation

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419 entries, 0 to 418
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         419 non-null    object
 1   Abstract      419 non-null    object
 2   Description   419 non-null    object
 3   Author        419 non-null    object
 4   Publish Date  419 non-null    object
 5   URL           419 non-null    object
dtypes: object(6)
memory usage: 19.8+ KB


In [5]:
df.shape

(419, 6)

In [6]:
df.isnull().sum()

Unnamed: 0,0
Title,0
Abstract,0
Description,0
Author,0
Publish Date,0
URL,0


In [7]:
df.iloc[0]

Unnamed: 0,0
Title,Batman: Arkham Shadow Review - I Am Batman
Abstract,Camouflaj is no stranger to superhero VR games...
Description,It's hard to believe it's been nearly a decade...
Author,Mark Delaney
Publish Date,2024-10-28
URL,https://www.gamespot.com/reviews/batman-arkham...


In [8]:
df['Description'].iloc[0]



##Text Preprocessing

In [10]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Step 1: Extract the Title and Description columns
titles = df['Title'].tolist()
descriptions = df['Description'].tolist()

# Step 2: Create documents with titles and descriptions
documents = [f"Game Title: {title}\nDescription: {desc.lower()}" for title, desc in zip(titles, descriptions)]

tokenized_documents = [word_tokenize(doc.lower()) for doc in documents]


print(f"Number of documents: {len(documents)}")

Number of documents: 419


In [12]:
from pprint import pprint
pprint(documents[0])

('Game Title: Batman: Arkham Shadow Review - I Am Batman\n'
 "Description: it's hard to believe it's been nearly a decade since the last "
 "mainline batman arkham game. since then, we've seen several arkham-adjacent "
 "projects come out, only to feel hamstrung or otherwise lacking. 2016's "
 "batman arkham vr was a neat tech demo, but it encompassed only the series' "
 'investigative elements. both traditional arkham studios, rocksteady and wb '
 'montreal, launched batman-esque co-op games in recent years, but each '
 'struggled for several, sometimes similar reasons. batman: arkham shadow '
 'stops the tailspin by authentically recapturing the essence of the arkham '
 'series in ways other recent batmanverse games disappointingly and '
 'intentionally avoided, making this the best batman game since arkham knight, '
 "even if it doesn't soar to the same heights as the series' finest moments. "
 'batman: arkham shadow is a vr-only, direct sequel to arkham origins, taking '
 'place ro

##Extractive Summarization

# LexRank

In [13]:
!pip install lexrank

Collecting lexrank
  Downloading lexrank-0.1.0-py3-none-any.whl.metadata (5.8 kB)
Collecting path.py>=10.5 (from lexrank)
  Downloading path.py-12.5.0-py3-none-any.whl.metadata (1.3 kB)
Collecting pyrsistent>=0.14.0 (from lexrank)
  Downloading pyrsistent-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting urlextract>=0.7 (from lexrank)
  Downloading urlextract-1.9.0-py3-none-any.whl.metadata (5.8 kB)
Collecting path (from path.py>=10.5->lexrank)
  Downloading path-17.0.0-py3-none-any.whl.metadata (6.4 kB)
Collecting uritools (from urlextract>=0.7->lexrank)
  Downloading uritools-4.0.3-py3-none-any.whl.metadata (4.7 kB)
Downloading lexrank-0.1.0-py3-none-any.whl (69 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.8/69.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading path.py-12.5.0-py3-none-any.whl (2.3 kB)
Downloading pyrsistent-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (117 kB)
[2K

In [16]:
from nltk.tokenize import word_tokenize, sent_tokenize
from lexrank import STOPWORDS, LexRank

lxr = LexRank(documents, stopwords=STOPWORDS['en'])
sentences = sent_tokenize(documents[0])

In [17]:
scores_count = lxr.rank_sentences(sentences,
                                   threshold=None,
                                   fast_power_method=False)

print(scores_count)

[2.73671653 0.39503839 2.01507434 1.11591691 2.76719622 2.16698598
 0.32813733 0.76911207 1.43218283 0.73246785 1.07852166 0.69712974
 1.28577161 0.3912139  0.91919967 1.         0.49319904 0.32731407
 0.80690606 1.13527953 0.30174339 1.25179128 0.90894696 0.85895394
 1.79561662 2.02273625 0.30491861 0.5797072  1.90836631 0.63899548
 1.09105421 0.91572227 2.34030818 1.75997476 1.22271131 0.87955975
 0.83546738 1.08171039 0.29857772 0.97497078 0.86687698 0.55031544
 0.50111952 0.29502459 1.34909828 0.36040409 1.11801027 0.49178779
 1.3134742  0.52626297 0.78734988 1.25388416 0.27170785 1.
 1.         2.28689849 0.39349089 0.80722759 1.49200557 0.795722
 0.43529527 0.37246538 1.24918408 0.52370681 0.78300063 0.53405203
 0.40503293 0.83124384 0.49802532 1.10131679 0.82171876 2.05315677
 0.72326837 0.55092583 1.48001182 0.33773125 1.27754395 0.80773009
 0.90327717 1.03003387 2.18865674 1.12701319 0.64275484 1.32336132
 0.43698469 0.6599156  0.7529028  0.37095234 0.26380707 1.62194774
 0.32

In [18]:
scores_count.shape

(97,)

Print high-ranked sentences.

In [21]:
summary = lxr.get_summary(sentences, summary_size=2, threshold=.3)
summary

["batman: arkham shadow stops the tailspin by authentically recapturing the essence of the arkham series in ways other recent batmanverse games disappointingly and intentionally avoided, making this the best batman game since arkham knight, even if it doesn't soar to the same heights as the series' finest moments.",
 "Game Title: Batman: Arkham Shadow Review - I Am Batman\nDescription: it's hard to believe it's been nearly a decade since the last mainline batman arkham game."]

In [22]:
# get summary with continuous LexRank
summary_cont = lxr.get_summary(sentences, threshold=0.25)
summary_cont

["Game Title: Batman: Arkham Shadow Review - I Am Batman\nDescription: it's hard to believe it's been nearly a decade since the last mainline batman arkham game."]

In [23]:
pprint(documents[0])

('Game Title: Batman: Arkham Shadow Review - I Am Batman\n'
 "Description: it's hard to believe it's been nearly a decade since the last "
 "mainline batman arkham game. since then, we've seen several arkham-adjacent "
 "projects come out, only to feel hamstrung or otherwise lacking. 2016's "
 "batman arkham vr was a neat tech demo, but it encompassed only the series' "
 'investigative elements. both traditional arkham studios, rocksteady and wb '
 'montreal, launched batman-esque co-op games in recent years, but each '
 'struggled for several, sometimes similar reasons. batman: arkham shadow '
 'stops the tailspin by authentically recapturing the essence of the arkham '
 'series in ways other recent batmanverse games disappointingly and '
 'intentionally avoided, making this the best batman game since arkham knight, '
 "even if it doesn't soar to the same heights as the series' finest moments. "
 'batman: arkham shadow is a vr-only, direct sequel to arkham origins, taking '
 'place ro

#TextRank

In [24]:
!pip install summa

Collecting summa
  Downloading summa-1.2.0.tar.gz (54 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/54.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.9/54.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: summa
  Building wheel for summa (setup.py) ... [?25l[?25hdone
  Created wheel for summa: filename=summa-1.2.0-py3-none-any.whl size=54387 sha256=ef3c0ddaa50f595f445975b5e75cd279e692e6075ac4df2fed06b568c416637d
  Stored in directory: /root/.cache/pip/wheels/4a/ca/c5/4958614cfba88ed6ceb7cb5a849f9f89f9ac49971616bc919f
Successfully built summa
Installing collected packages: summa
Successfully installed summa-1.2.0


In [26]:
sentences = documents[0]
sentences



In [28]:
from summa.summarizer import summarize

sum = summarize(sentences, ratio=0.25)
sum



In [29]:
len(sum)

4474

In [30]:
documents[0]



In [31]:
len(documents[0])

15284