In [2]:
import os
import pandas as pd
import csv
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
english_words_list = []

with open('english_words_corpora/english_corpus.csv', 'r') as read_obj:
    csv_reader = csv.reader(read_obj)
    list_of_csv = list(csv_reader)
    english_words_list = list_of_csv[0]      
english_words_set = set(english_words_list)


def create_relative_english_frequency_col(df):
    def calculate_relative_frequency(content):
        english_word_bools = [lemmatizer.lemmatize(word) in english_words_set for word in str(content).split() if word.isalpha()]
        if not english_word_bools:
            return 0
        return english_word_bools.count(True)/len(english_word_bools) 
    df["english_relative_frequency"] = df.cleaned_content.apply(calculate_relative_frequency)
    return df

def filter_by_english_thresholds(min_thresh, max_thresh, original_dataset, create_file = False):
    output_dir = os.path.splitext(original_dataset)[0]
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        
    with open(original_dataset, "r") as infile:
        source_df = pd.read_csv(infile, index_col=[0])
        out_df = create_relative_english_frequency_col(source_df)
        out_df = out_df[(min_thresh <= out_df["english_relative_frequency"]) & (out_df["english_relative_frequency"] < max_thresh)]
    
    if create_file: 
        with open(output_dir + f"/{min_thresh}_to_{max_thresh}_english_words.csv", "w") as outfile:
            outfile.write(out_df.to_csv())
            
    return out_df

In [4]:
for i in range(10):
    filter_by_english_thresholds(i/10, (i+1)/10, 'data/Manila/24400_tweets_over_period.csv', create_file = True)
filter_by_english_thresholds(1, 1.01, 'data/Manila/24400_tweets_over_period.csv')

Unnamed: 0,id,date,rawContent,cleaned_content,english_relative_frequency
1,2,2022-01-01 23:58:22+00:00,Solemnity of the Epiphany of the Lord (@ Chris...,solemnity of the epiphany of the lord christ t...,1.0
3,4,2022-01-01 23:57:22+00:00,Self reminder https://t.co/v40AwnBIHd,self reminder,1.0
9,10,2022-01-01 23:54:52+00:00,Day 1: AM DUTY Internal Medicine 🙏🏻😅,day 1 am duty internal medicine,1.0
14,15,2022-01-01 23:50:32+00:00,Happy New Year https://t.co/SFoi989b4N,happy new year,1.0
15,16,2022-01-01 23:50:06+00:00,Get to know @perthppe more! 🖤#ThailandComingXP...,get to know more part 1 now available on youtube,1.0
...,...,...,...,...,...
94,24295,2022-08-31 23:48:01+00:00,🙏🏻🙏🏻🙏🏻😊 https://t.co/snXEXraL1S,,1.0
95,24296,2022-08-31 23:47:49+00:00,@_WHOPPERJr_ who hates blue ❤️❤️❤️❤️❤️,who hates blue,1.0
96,24297,2022-08-31 23:47:44+00:00,@MJDZN @ayel0103 CPR ready yan,cpr ready yan,1.0
97,24298,2022-08-31 23:47:41+00:00,Hello Ber months! https://t.co/fQuU30ETz2,hello ber months,1.0
