<a href="https://colab.research.google.com/github/nicole-whitlock/games/blob/main/Data_Loading_and_preprocessingipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
from langdetect import detect, DetectorFactory
import kagglehub
import os
from google.colab import drive
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
import numpy as np

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [4]:
# Download the dataset
print("Downloading dataset...")
path = kagglehub.dataset_download("davutb/metacritic-games")
print(f"Dataset downloaded to: {path}")


def _first_matching_file(top, matches):
    """Return the path of the first file under `top` whose name satisfies `matches`.

    Args:
        top: Directory to search recursively.
        matches: Callable taking a bare file name and returning a truthy value
            when that file should be selected.

    Returns:
        The joined path of the first match, or None when nothing matches.
    """
    for root, dirs, files in os.walk(top):
        # Sorting in place steers os.walk's top-down traversal, so the chosen
        # file does not depend on the order the filesystem lists entries.
        dirs.sort()
        for file in sorted(files):
            if matches(file):
                return os.path.join(root, file)
    return None


# Find the games_reviews.csv (file name compared case-insensitively)
csv_path = _first_matching_file(path, lambda name: name.lower() == "games_reviews.csv")

if not csv_path:
    # Fall back to any CSV whose name mentions reviews or games. The extension
    # is compared case-insensitively, consistent with the exact-name check above.
    csv_path = _first_matching_file(
        path,
        lambda name: name.lower().endswith('.csv')
        and ('review' in name.lower() or 'game' in name.lower()),
    )
    if csv_path:
        print(f"Found alternative CSV file: {os.path.basename(csv_path)}")

if not csv_path:
    raise FileNotFoundError("Could not find games_reviews.csv in the downloaded dataset")

# Load the dataset
print(f"\nLoading dataset from: {csv_path}")

# Encodings are tried in order and the first one that decodes the file wins.
encodings = ['utf-8', 'latin1', 'iso-8859-1']
try:
    for encoding in encodings:
        try:
            df = pd.read_csv(csv_path, encoding=encoding)
        except UnicodeDecodeError:
            # Wrong guess for this file; move on to the next candidate.
            continue
        print(f"Successfully loaded with {encoding} encoding")
        break
    else:
        # No candidate decoded the file. Fail here instead of falling through
        # to the column listing below, where `df` would be undefined (or a
        # stale frame left over from an earlier run of this cell).
        raise ValueError(
            f"Could not decode {csv_path} with any of the encodings: {encodings}"
        )
except Exception as e:
    print(f"Error loading CSV: {e}")
    raise

print("\nDataset columns:", df.columns.tolist())



Downloading dataset...
Dataset downloaded to: /kaggle/input/metacritic-games

Loading dataset from: /kaggle/input/metacritic-games/games_reviews.csv
Successfully loaded with utf-8 encoding

Dataset columns: ['id', 'title', 'quote', 'score', 'date', 'platform', 'author', 'publicationName', 'review_type']


In [5]:
# Preview the first rows of the loaded frame.
df.head(n=5)

Unnamed: 0,id,title,quote,score,date,platform,author,publicationName,review_type
0,1300001290,The Legend of Zelda: Ocarina of Time,C’est jeu est le meilleur jeu qui ai pu existé...,100.0,2023-01-30,Nintendo 64,Filipe69,,user
1,1300001290,The Legend of Zelda: Ocarina of Time,"Marcou minha infância, e realmente é um dos me...",100.0,2022-05-04,Nintendo 64,MetalLiquido,,user
2,1300001290,The Legend of Zelda: Ocarina of Time,"an open world on the classic console, perhaps ...",100.0,2023-02-03,Nintendo 64,aaronnmp96,,user
3,1300001290,The Legend of Zelda: Ocarina of Time,This masterpiece holds a special place in the ...,100.0,2022-10-24,Nintendo 64,AmadouIraklidis,,user
4,1300001290,The Legend of Zelda: Ocarina of Time,10 out of 10 easily one of the best games of h...,100.0,2023-01-31,Nintendo 64,slushy,,user


In [6]:
# Identify the review text: take the first candidate name present in the frame.
possible_cols = ['quote']
text_col = next((name for name in possible_cols if name in df.columns), None)

if not text_col:
    # None of the expected names exist, so list what is available and ask.
    print("\nCould not identify review text column. Available columns:")
    print(df.columns.tolist())
    text_col = input("Please enter the name of the column containing review text: ")

# Function to detect English text
def is_english(text, min_length=20):
    """Return True when langdetect classifies `text` as English.

    Args:
        text: Value taken from the review column; it is converted with
            ``str()`` before detection.
        min_length: Minimum number of characters, after stripping surrounding
            whitespace, required before detection is attempted. Defaults to 20.

    Returns:
        bool: False for missing values, blank strings, texts shorter than
        ``min_length`` and texts for which detection raises an error;
        otherwise whether the detected language code equals ``'en'``.
    """
    try:
        if pd.isna(text):
            return False
        stripped = str(text).strip()
        # Skip blank and very short texts that can't be reliably detected
        if not stripped or len(stripped) < min_length:
            return False
        return detect(str(text)) == 'en'
    except Exception:
        # Best effort: a failed detection counts as "not English". Catching
        # Exception rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate, so a long-running filter can still be stopped.
        return False

# Filter for English reviews
# langdetect's detection is non-deterministic on short or ambiguous text; fixing
# the seed before the first detection keeps the selected rows stable across runs.
DetectorFactory.seed = 0
print(f"\nFiltering English reviews from column '{text_col}'...")
# astype(bool) keeps the mask boolean even when the column is empty.
english_mask = df[text_col].apply(is_english).astype(bool)
english_df = df[english_mask].copy()
# Guard the percentage against an empty input frame (would divide by zero).
english_share = len(english_df) / len(df) * 100 if len(df) else 0.0
print(f"Found {len(english_df)} English reviews ({english_share:.1f}% of total)")

# Save results
output_path = '/content/metacritic_english_reviews.csv'
english_df.to_csv(output_path, index=False)
print(f"\nSaved English reviews to {output_path}")

print("\nPreview of English reviews:")
display(english_df.head())


Filtering English reviews from column 'quote'...
Found 1487401 English reviews (89.8% of total)

Saved English reviews to /content/metacritic_english_reviews.csv

Preview of English reviews:


Unnamed: 0,id,title,quote,score,date,platform,author,publicationName,review_type
2,1300001290,The Legend of Zelda: Ocarina of Time,"an open world on the classic console, perhaps ...",100.0,2023-02-03,Nintendo 64,aaronnmp96,,user
3,1300001290,The Legend of Zelda: Ocarina of Time,This masterpiece holds a special place in the ...,100.0,2022-10-24,Nintendo 64,AmadouIraklidis,,user
4,1300001290,The Legend of Zelda: Ocarina of Time,10 out of 10 easily one of the best games of h...,100.0,2023-01-31,Nintendo 64,slushy,,user
5,1300001290,The Legend of Zelda: Ocarina of Time,Absolutely the best game ever made. Completely...,100.0,2022-06-12,Nintendo 64,Konnor1224,,user
6,1300001290,The Legend of Zelda: Ocarina of Time,the people rating this game a 0 will absolutel...,100.0,2022-11-21,Nintendo 64,Pokemandeluxe,,user


In [14]:
# Only the last bare expression of a cell is rendered, so the preview is sent
# through display() explicitly; the shape remains the cell's result.
display(english_df.head())
english_df.shape

(1487401, 9)

In [21]:
# Drop reviews that have no score. Rebinding the name instead of using
# inplace=True makes the data flow explicit.
english_df = english_df.dropna(subset=['score'])
english_df.to_csv('/content/english_reviews.csv', index=False)
# The bare expression goes last so the notebook actually renders the new shape.
english_df.shape