# Language Filtering

In [1]:
# import packages
import pandas as pd
from langdetect import detect_langs
import multiprocessing as mp
import tqdm
import re

In [2]:
# load the cleaned comments from disk and preview the first rows
source_csv = 'clean_data_odd.csv'
data = pd.read_csv(source_csv)
data.head()

Unnamed: 0,author,subreddit,created_utc,score,controversiality,body
0,CryptoHODLer101,AMA,1513292107,0,0,No BTC is not fiat. I get paid in Bitcoin. As ...
1,nappiestapparatus,AMA,1395194709,0,0,I think you're overestimating your hashing pow...
2,Skating2Death,AMA,1390016247,0,0,"I've heard of it, but with the high volatility..."
3,OmarJunkman,AMA,1528300155,0,0,Do you have any bitcoin?
4,evanc1411,ASU,1525380807,0,1,"BTC -? Damn, when you buy a Bitcoin you're ..."


In [3]:
# flag the comments that contain at least one alphabetical character
# comments without any will mess with language detection if left in!
body_text = data['body'].apply(str)
data['has_char'] = body_text.str.contains('[a-zA-Z]+')
data.head()

Unnamed: 0,author,subreddit,created_utc,score,controversiality,body,has_char
0,CryptoHODLer101,AMA,1513292107,0,0,No BTC is not fiat. I get paid in Bitcoin. As ...,True
1,nappiestapparatus,AMA,1395194709,0,0,I think you're overestimating your hashing pow...,True
2,Skating2Death,AMA,1390016247,0,0,"I've heard of it, but with the high volatility...",True
3,OmarJunkman,AMA,1528300155,0,0,Do you have any bitcoin?,True
4,evanc1411,ASU,1525380807,0,1,"BTC -? Damn, when you buy a Bitcoin you're ...",True


In [4]:
# count how many comments do / do not contain alphabetical characters
has_char_counts = data['has_char'].value_counts()
has_char_counts

True     1607781
False          1
Name: has_char, dtype: int64

In [5]:
# keep only the rows that were flagged as containing letters
rows_with_letters = data['has_char'] == True
data = data.loc[rows_with_letters]

# confirm that no False rows are left
data['has_char'].value_counts()

True    1607781
Name: has_char, dtype: int64

In [6]:
# filtering removed rows, so rebuild a gap-free index starting at 0
data = data.reset_index(drop=True)

## method 1
- #### filter via subreddit using R
- #### go to part 2_clean_reddit_comments.md 

## > method 2
- #### filter via language detection

```Python
# run language detection on every comment and keep the result in ['language']
data['language'] = data['body'].apply(detect_langs)

# ['english'] is 'True' when english shows up with any probability among the detected languages
language_text = data['language'].apply(str)
data['english'] = language_text.str.contains('en')

# count how many comments fell into each class
data['english'].value_counts()

# inspect a few comments that were classified as not english
data[data['english'] == False].head()

# keep only the rows where ['english'] is True
english_rows = data['english'] == True
data = data[english_rows]

# filtering removed rows, so rebuild the index
data = data.reset_index(drop=True)

# the helper columns are no longer needed
del data['language']
del data['english']
```

## > method 3
- #### method 2 but as a function

```Python
# Arguments: (df = your dataframe), (series = name of column in df as string), (language_select = two letter string of language code that you want)

def language_filter(df, series = 'body', language_select = 'en'):
    """Return the rows of ``df`` whose text is detected as ``language_select``.

    A row is kept when ``detect_langs`` lists the requested language among
    its candidates, with any probability.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to classify. It is not modified.
    series : str, default 'body'
        Name of the column in ``df`` that holds the text.
    language_select : str, default 'en'
        Two letter code of the language to keep (e.g. 'en'). Regional
        variants such as 'zh-cn' are kept when their two letter prefix
        ('zh') is requested.

    Returns
    -------
    pandas.DataFrame
        A new frame with only the matching rows, the same columns as ``df``
        and a fresh 0..n-1 index.

    Notes
    -----
    Every value in ``df[series]`` is passed to ``detect_langs`` as-is, so any
    error it raises propagates to the caller. NOTE(review): text without
    alphabetical characters is expected to be filtered out beforehand, as
    the rest of this notebook does with ['has_char'].
    """
    def _is_selected(code):
        # exact code, or a regional variant of it (e.g. 'zh' -> 'zh-cn')
        return code == language_select or code.startswith(language_select + '-')

    def _contains_language(text):
        # detect_langs returns candidate languages; compare their .lang codes
        return any(_is_selected(candidate.lang) for candidate in detect_langs(text))

    # boolean mask instead of temporary columns, so no copy of df has to be edited;
    # astype(bool) keeps the mask boolean even when df has no rows
    keep = df[series].apply(_contains_language).astype(bool)

    # select the matching rows and rebuild the index of the result
    return df.loc[keep].reset_index(drop = True)
```

```Python
# apply the filter to the comment bodies, keeping english only
data = language_filter(data, series = 'body', language_select = 'en')
data.head()
```

## > method 4
- #### parallel processing

The parallel processing package doesn't have a version of pandas `.apply`; instead, it has a version of `map` (which is a list's version of apply). This means we need to transform our data into a list and then use the map function. Below are some examples:

```Python
# list comprehension = this is the function that is going to be parallel processed
# (map calls detect_langs once for every comment in data['body'])
test_list = [i for i in map(detect_langs, data['body'])]

# same thing as list comprehension above, written as an explicit loop
test_list = []
for i in data['body']:
    # detect the languages of one comment and collect the result
    results = detect_langs(i)
    test_list.append(results)
```

In [7]:
# how many worker processes to start for the parallel run
# mp.cpu_count() reports the number of cpu cores that your computer has
num_processes = mp.cpu_count()
num_processes

8

Below, `pool.map` works just the same as `map`. Chunksize is the amount of data that each process will work on at a time. To get the progress bar, simply wrap `tqdm.tqdm` around the object that you are iterating over, which in this case is `data['body']`

In [8]:
# detect languages in parallel, showing a progress bar over the comments
if __name__ == '__main__':
    with mp.Pool(num_processes) as pool:
        # tqdm wraps the comments that pool.map iterates over
        comments = tqdm.tqdm(data['body'])
        # pool.map already returns a list, one result per comment
        data['language'] = pool.map(detect_langs, comments, chunksize = 10)

100%|██████████| 1607781/1607781 [33:00<00:00, 811.75it/s]


In [9]:
# ['english'] is 'True' when english shows up with any probability among the detected languages
detected_text = data['language'].apply(str)
data['english'] = detected_text.str.contains('en')
data.head()

Unnamed: 0,author,subreddit,created_utc,score,controversiality,body,has_char,language,english
0,CryptoHODLer101,AMA,1513292107,0,0,No BTC is not fiat. I get paid in Bitcoin. As ...,True,[en:0.999994581371662],True
1,nappiestapparatus,AMA,1395194709,0,0,I think you're overestimating your hashing pow...,True,[en:0.9999973707254454],True
2,Skating2Death,AMA,1390016247,0,0,"I've heard of it, but with the high volatility...",True,[en:0.999997406214342],True
3,OmarJunkman,AMA,1528300155,0,0,Do you have any bitcoin?,True,"[en:0.5714282300865294, tr:0.4285707383592642]",True
4,evanc1411,ASU,1525380807,0,1,"BTC -? Damn, when you buy a Bitcoin you're ...",True,"[en:0.8571382001868462, cy:0.1428609631258628]",True


In [10]:
# count how many comments were classified as english / not english
english_counts = data['english'].value_counts()
english_counts

True     1603318
False       4463
Name: english, dtype: int64

In [11]:
# keep only the rows where ['english'] is True
english_rows = data['english'] == True
data = data[english_rows]

In [12]:
# filtering removed rows, so rebuild a gap-free index starting at 0
data = data.reset_index(drop=True)

In [13]:
# the detection helper columns are no longer needed, so remove them
data = data.drop(columns=['language', 'english'])

In [None]:
# save the filtered comments; pandas DataFrames are written with to_csv
# (there is no DataFrame.write_csv method)
# index = False: the index was just reset, so it is not written as an extra unnamed column
data.to_csv('filtered_data_odd.csv', index = False)