[Guide: Introduction to Google's Compact Language Detector v3 in Python](https://towardsdatascience.com/introduction-to-googles-compact-language-detector-v3-in-python-b6887101ae47?gi=52f72b831292)

```
 brew install protobuf
```

- [CLD3 documentation](https://github.com/google/cld3)
- [Installing on macOS with Apple M1](https://github.com/google/cld3/issues/80#issuecomment-1466463935)

# Detect Review Language

In [None]:
# Project name; it is interpolated into the data paths built in this notebook.
PROJECT = "dating"
# Base name of the JSON app-list file (without the ".json" extension).
APP_LIST = "dating-apps"

In [None]:
import os
import gcld3
import json
import pandas as pd
import re

In [None]:
# Per-store review directories for the selected project.
ANDROID_PATH = f'/Users/pietro/Downloads/dl-dating/data/{PROJECT}/play-store/reviews/'
IOS_PATH = f'/Users/pietro/Downloads/dl-dating/data/{PROJECT}/app-store/reviews/'
# Byte bounds handed to the gcld3 language identifier when it is constructed.
MIN_NUMBER_BYTES, MAX_NUMBER_BYTES = 0, 5000

In [None]:
# Load the app list for this run from its JSON file.
fpath = f"/Users/pietro/Downloads/dl-dating/data/apps/{APP_LIST}.json"
# JSON text is UTF-8 by specification, so pin the encoding instead of
# depending on whatever the platform's locale default happens to be.
with open(fpath, encoding="utf-8") as json_file:
    apps = json.load(json_file)

In [None]:
# Shared gcld3 language identifier, configured with the byte bounds above.
detector = gcld3.NNetLanguageIdentifier(
    min_num_bytes=MIN_NUMBER_BYTES,
    max_num_bytes=MAX_NUMBER_BYTES,
)

In [None]:
def FindReviewLanguage(x):
    """Detect the language of one review text with the module-level ``detector``.

    Args:
        x: The review text. Any value that is not a non-empty ``str``
            (for example a missing value) is not passed to the detector.

    Returns:
        A four-element ``pd.Series`` holding the detector result's
        ``language``, ``is_reliable``, ``proportion`` and ``probability``,
        in that order. Four empty strings are returned instead when ``x``
        is not a non-empty string or when detection raises.
    """
    if not (isinstance(x, str) and x):
        return pd.Series(["", "", "", ""])
    try:
        result = detector.FindLanguage(text=x)
        return pd.Series([result.language, result.is_reliable, result.proportion, result.probability])
    except Exception as ex:
        # Best-effort: report the failure but still return a row of the usual
        # shape. Falling through would return None, which makes a row-wise
        # ``apply`` produce a mix of Series and None instead of four columns.
        print(ex)
        return pd.Series(["", "", "", ""])

In [None]:
# Pandas is deprecating some of the parameters used in this notebook.
# TODO: address those deprecations; until then every warning is silenced.
import warnings

warnings.simplefilter('ignore')

In [None]:
# All reviews are merged into this single CSV file.
reviews_filename = f"/Users/pietro/Downloads/dl-dating/data/{PROJECT}-lang-detection.csv"

In [None]:
# Start the merged file from scratch: a frame with no rows is written so the
# file contains only the header line.
reviews = pd.DataFrame(
    columns=[
        'text', 'score', 'date',
        'os', 'app', 'store',
    ]
)
reviews.to_csv(reviews_filename, index=False)
del reviews

In [None]:
# Column order of the merged CSV. Its header row is text, score, date, os,
# app, store, and rows are appended below without a header, so every frame
# must be put in exactly this order before it is written; otherwise values
# end up under the wrong column names.
MERGED_COLUMNS = ['text', 'score', 'date', 'os', 'app', 'store']

for app in apps['apps']:
    # Reset per app so the error report below never reads an undefined name
    # or a file name left over from a previous app.
    filename = None
    try:
        # Play Store (Android): the directory name is the app id with dots
        # replaced by dashes.
        app_name_dashed = app["android"]["id"].replace(".", "-")
        # app_name_dashed = app["android"]["id"]
        android_dir = os.path.join(ANDROID_PATH, app_name_dashed)
        for filename in os.listdir(android_dir):
            if not filename.endswith('.csv'):
                continue
            df = pd.read_csv(os.path.join(android_dir, filename), usecols=['content', 'score', 'at'], low_memory=False)
            df = df.rename(columns={"content": "text", "at": "date"})
            # Everything before the first dot of the file name.
            df['store'] = filename.split(".")[0]
            df['os'] = 'Android'
            df['app'] = app['android']['id']
            df[MERGED_COLUMNS].to_csv(reviews_filename, mode="a", header=False, index=False)

        # App Store (iOS)
        ios_dir = os.path.join(IOS_PATH, app['ios']['id'])
        for filename in os.listdir(ios_dir):
            if not filename.endswith('.csv'):
                continue
            df = pd.read_csv(os.path.join(ios_dir, filename), usecols=['date', 'review', 'rating'], low_memory=False)
            df = df.rename(columns={"review": "text", "rating": "score"})
            # File name without its last extension (raw string: the pattern
            # contains a regex escape).
            df['store'] = re.search(r'(.+?)(\.[^.]*$|$)', filename).group(1)
            df['os'] = 'iOS'
            # NOTE(review): Android rows store the app id in 'app' while iOS
            # rows store the app name -- confirm this difference is intended.
            df['app'] = app['ios']['name']
            df[MERGED_COLUMNS].to_csv(reviews_filename, mode="a", header=False, index=False)

    except Exception as ex:
        # Best-effort: report the failing app/file and move on to the next app.
        print(f'Error {app} {filename}')
        print(ex)

In [None]:
# Start the processed-output file from scratch: a frame with no rows is
# written so the file contains only the header line, which includes the four
# language-detection columns.
reviews = pd.DataFrame(
    columns=[
        'text', 'score', 'date', 'os', 'app', 'store',
        'language', 'is_reliable', 'proportion', 'probability',
    ]
)
reviews.to_csv(f"/Users/pietro/Downloads/dl-dating/data/{PROJECT}-lang-detection_processed.csv", index=False)
del reviews

In [None]:
# Stream the merged reviews in fixed-size chunks, attach the detection result
# for each row's text, and append the chunk to the processed file.
chunksize = 1_000
data = pd.read_csv(reviews_filename, chunksize=chunksize, low_memory=False)
for chunk in data:
    detected = chunk['text'].apply(FindReviewLanguage)
    chunk[['language', 'is_reliable', 'proportion', 'probability']] = detected
    chunk.to_csv(
        f"/Users/pietro/Downloads/dl-dating/data/{PROJECT}-lang-detection_processed.csv",
        mode="a",
        header=False,
        index=False,
    )

In [None]:
# Count

In [None]:
# Load the processed reviews (with the language columns) back into memory.
data = pd.read_csv(
    f"/Users/pietro/Downloads/dl-dating/data/{PROJECT}-lang-detection_processed.csv",
    low_memory=False,
)

In [None]:
# Total number of rows in the processed reviews.
rows = len(data.index)

In [None]:
# Number of rows per detected language.
count = data.loc[:, 'language'].value_counts()

In [None]:
# Persist the per-language row counts.
count.to_csv(f"/Users/pietro/Downloads/dl-dating/data/{PROJECT}-count.csv")

In [None]:
# Per-language counts divided by the total row count.
count2 = count / rows

In [None]:
# Persist the per-language shares.
count2.to_csv(f"/Users/pietro/Downloads/dl-dating/data/{PROJECT}-count2.csv")