<a href="https://colab.research.google.com/github/pabloisaiahs/DS-3001-Project-1/blob/main/clean_and_tokenize_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install and download SpaCy model if not already installed
!pip install spacy
!python -m spacy download en_core_web_sm

In [1]:
import pandas as pd
import spacy
import re

# Load SpaCy's English model
nlp = spacy.load("en_core_web_sm")

# Load the dataset
file_path = "truncated_data.csv"  # Update path if necessary
df = pd.read_csv(file_path)

# Display column names to confirm structure
print(df.columns)

# Use the correct column name for reviews
column_name = 'review'  # Update based on actual dataset

# Drop missing values in review column
df = df.dropna(subset=[column_name])

# Text Cleaning Function
def clean_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters and numbers
    return text

df['cleaned_text'] = df[column_name].apply(clean_text)

# Tokenization using SpaCy
df['tokens'] = df['cleaned_text'].apply(lambda text: [token.text for token in nlp(text)])

# Save cleaned data
df.to_csv("truncated_data_cleaned.csv", index=False)

print("Cleaning and tokenization complete. File saved as 'steam_reviews_cleaned.csv'.")

Index(['X', 'app_id', 'app_name', 'review_id', 'language', 'review',
       'timestamp_created', 'timestamp_updated', 'recommended',
       'votes_helpful', 'votes_funny', 'weighted_vote_score', 'comment_count',
       'steam_purchase', 'received_for_free', 'written_during_early_access',
       'author.steamid', 'author.num_games_owned', 'author.num_reviews',
       'author.playtime_forever', 'author.playtime_last_two_weeks',
       'author.playtime_at_review', 'author.last_played'],
      dtype='object')
Cleaning and tokenization complete. File saved as 'steam_reviews_cleaned.csv'.


In [2]:
df.head(10)

Unnamed: 0,X,app_id,app_name,review_id,language,review,timestamp_created,timestamp_updated,recommended,votes_helpful,...,written_during_early_access,author.steamid,author.num_games_owned,author.num_reviews,author.playtime_forever,author.playtime_last_two_weeks,author.playtime_at_review,author.last_played,cleaned_text,tokens
0,1708660,8930,Sid Meier's Civilization V,10119388,english,Great Game if you like Strategy Games. I would...,1400168834,1400168834,True,0,...,False,76561198132502300,14,1,2135,0,879.0,1416590688,great game if you like strategy games i would ...,"[great, game, if, you, like, strategy, games, ..."
1,1616223,8930,Sid Meier's Civilization V,41348992,english,Much inferior version of CIV 4. The only posit...,1523134094,1523134094,False,0,...,False,76561198012377800,137,18,3382,0,3382.0,1366354800,much inferior version of civ the only positiv...,"[much, inferior, version, of, civ, , the, onl..."
2,1699760,8930,Sid Meier's Civilization V,11281803,english,Do you like losing time? Do you like miss...,1405564142,1405564142,True,1,...,False,76561198064377700,231,4,72192,0,15114.0,1609388410,do you like losing time do you like missi...,"[ , do, you, like, losing, time, do, you, ..."
3,4437439,227300,Euro Truck Simulator 2,20641037,english,(y),1453293826,1453293826,True,0,...,False,76561198182331700,39,2,325,0,162.0,1582858855,y,[y]
4,3294571,275390,Guacamelee! Super Turbo Championship Edition,46022055,english,bom,1542833752,1542833752,True,0,...,False,76561198080521200,132,6,863,0,863.0,1524755224,bom,[bom]
5,5716437,413150,Stardew Valley,53227635,english,Good game! Farm and do other stuff. Love it!,1562098071,1562098071,True,0,...,False,76561198081613800,229,1,2920,0,2504.0,1609450155,good game farm and do other stuff love it,"[good, game, farm, and, do, other, stuff, love..."
6,1877269,381210,Dead by Daylight,65059694,english,its great. need i say more.,1584237891,1584237891,True,0,...,False,76561198156042400,199,23,2310,0,1822.0,1589077795,its great need i say more,"[its, great, need, i, say, more]"
7,2916537,242760,The Forest,46729731,english,I died at caves...,1543047361,1543047361,True,0,...,False,76561198113814200,152,13,3849,368,2249.0,1610470353,i died at caves,"[i, died, at, caves]"
8,5570322,413150,Stardew Valley,82524476,english,vloe gaems so goodp woud recomencad,1608157188,1608157188,True,0,...,False,76561198804048400,67,5,2664,0,2422.0,1608692186,vloe gaems so goodp woud recomencad,"[vloe, gaems, so, goodp, woud, recomencad]"
9,4673448,431960,Wallpaper Engine,58659053,english,This app is amazing,1575076707,1575076707,True,0,...,False,76561198009016800,3606,66,16272,0,10278.0,1607315645,this app is amazing,"[this, app, is, amazing]"
