Run detection language for all the downloaded reviews.

### Installation

To run this notebook, it is recommended to create a virtual enviroment.

```
brew install pyenv pyenv-virtualenv protobuf@21
pyenv virtualenv 3.11 app-reviews-scraper
pyenv local app-reviews-scraper
export PATH="/opt/homebrew/opt/protobuf@21/bin:$PATH"
export CFLAGS='-I/opt/homebrew/opt/protobuf@21/include'
export LDFLAGS=-L/opt/homebrew/opt/protobuf@21/lib
pip install -r requirements.txt
```

* [CLD3 Documentation](https://github.com/google/cld3)
* [Install on MacOS with M1](https://github.com/google/cld3/issues/80#issuecomment-1466463935)
* [Introduction to Googleâ€™s Compact Language Detector v3 in Python](https://towardsdatascience.com/introduction-to-googles-compact-language-detector-v3-in-python-b6887101ae47?gi=52f72b831292)

### How to use it

`cp .env.example .env`

Adjust the file with the corresponding paths

# Detect Review Language

In [1]:
%env PATH="/opt/homebrew/opt/protobuf@21/bin:$PATH"
%env CFLAGS='-I/opt/homebrew/opt/protobuf@21/include'
%env LDFLAGS=-L/opt/homebrew/opt/protobuf@21/lib

env: PATH="/opt/homebrew/opt/protobuf@21/bin:$PATH"
env: CFLAGS='-I/opt/homebrew/opt/protobuf@21/include'
env: LDFLAGS=-L/opt/homebrew/opt/protobuf@21/lib


In [1]:
import os
import cld3 as gcld3 
import json
import pandas as pd
import re
from pydantic_settings import BaseSettings, SettingsConfigDict

ImportError: dlopen(/Users/mast6838/.pyenv/versions/3.10.14/envs/app-reviews-scraper/lib/python3.10/site-packages/cld3/_cld3.cpython-310-darwin.so, 0x0002): symbol not found in flat namespace '__ZTIN6google8protobuf11MessageLiteE'

In [None]:
class Settings(BaseSettings):
    project_name: str
    data_path: str
    app_list_path: str
    ios_reviews_path: str = "/app-store/reviews/"
    android_reviews_path: str = "/play-store/reviews/"
    model_config = SettingsConfigDict(env_file='.env')

In [None]:
settings = Settings()

In [None]:
ANDROID_PATH = f'{settings.data_path}/{settings.project_name}/{settings.android_reviews_path}'
IOS_PATH = f'{settings.data_path}/{settings.project_name}/{settings.ios_reviews_path}'
MIN_NUMBER_BYTES = 0
MAX_NUMBER_BYTES = 5000

In [None]:
with open(settings.app_list_path) as json_file:
    apps = json.load(json_file)

In [None]:
detector = gcld3.NNetLanguageIdentifier(min_num_bytes = MIN_NUMBER_BYTES, max_num_bytes = MAX_NUMBER_BYTES)

In [None]:
def FindReviewLanguage(x):
    try:
        if isinstance(x, str) and x != "":
            result = detector.FindLanguage(text=x)
            return pd.Series([result.language, result.is_reliable, result.proportion, result.probability])
        else:
            return pd.Series(["", "", "", ""])
    except Exception as ex:
        print(ex)

In [None]:
# Pandas is deprecating some params
# We need to address them at some point
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Merge all reviews into one single file
reviews_filename = f"{settings.data_path}/{settings.project_name}-lang-detection.csv"

In [None]:
# Create empty file
reviews = pd.DataFrame(columns=['text', 'score', 'date', 'os', 'app', 'store'])
reviews.to_csv(reviews_filename, index=False)
del reviews

In [None]:
for app in apps['apps']:
    try:
        # Play Store (Android)
        app_name_dashed = app["android"]["id"].replace(".", "-")
        # app_name_dashed = app["android"]["id"]
        for filename in os.listdir(os.path.join(ANDROID_PATH, app_name_dashed)):
            if filename.endswith('.csv'):
                df = pd.read_csv(os.path.join(ANDROID_PATH, app_name_dashed, filename), usecols=['content', 'score', 'at'], low_memory=False)
                df = df.rename(columns={"content": "text", "at": "date"}) 
                df['store'] = filename.split(".")[0]
                df['os'] = 'Android'
                df['app'] = app['android']['id']
                df.to_csv(reviews_filename, mode="a", header=False, index=False)

        # App Store (iOS)
        for filename in os.listdir(os.path.join(IOS_PATH, app['ios']['id'])):
            if filename.endswith('.csv'):
                df = pd.read_csv(os.path.join(IOS_PATH, app['ios']['id'], filename), usecols=['date', 'review', 'rating'], low_memory=False)
                df = df.rename(columns={"review": "text", "rating": "score"}) 
                df['store'] = re.search('(.+?)(\.[^.]*$|$)', filename).group(1)
                df['os'] = 'iOS'
                df['app'] = app['ios']['name']
                df.to_csv(reviews_filename, mode="a", header=False, index=False)
                
    except Exception as ex:
        print(f'Error {app} {filename}')
        print(ex)

In [None]:
# Create empty file
reviews = pd.DataFrame(columns=['text', 'score', 'date', 'os', 'app', 'store', 'language', 'is_reliable', 'proportion', 'probability'])
reviews.to_csv(f"{settings.data_path}/{settings.project_name}-lang-detection_processed.csv", index=False)
del reviews

In [None]:
chunksize = 10 ** 3
data = pd.read_csv(reviews_filename, low_memory=False, chunksize=chunksize)
for chunk in data:
    chunk[['language', 'is_reliable', 'proportion', 'probability']] = chunk['text'].apply(lambda x: FindReviewLanguage(x))
    chunk.to_csv(f"{settings.data_path}/{settings.project_name}-lang-detection_processed.csv", mode="a", header=False, index=False)

In [None]:
# Count

In [None]:
data = pd.read_csv(f"{settings.data_path}/{settings.project_name}-lang-detection_processed.csv", low_memory=False)

In [None]:
rows = data.shape[0]

In [None]:
count = data['language'].value_counts()

In [None]:
count.to_csv(f"{settings.data_path}/{settings.project_name}-count.csv")

In [None]:
count2 = count.div(rows)

In [None]:
count2.to_csv(f"{settings.data_path}/{settings.project_name}-count2.csv")