In [2]:
import os
import pandas as pd
import csv
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
english_words_list = []

with open('english_words_corpora/english_corpus.csv', 'r') as read_obj:
    csv_reader = csv.reader(read_obj)
    list_of_csv = list(csv_reader)
    english_words_list = list_of_csv[0]      
english_words_set = set(english_words_list)


def create_relative_english_frequency_col(df):
    def calculate_relative_frequency(content):
        english_word_bools = [lemmatizer.lemmatize(word) in english_words_set for word in str(content).split() if word.isalpha()]
        if not english_word_bools:
            return 0
        return english_word_bools.count(True)/len(english_word_bools) 
    df["english_relative_frequency"] = df.cleaned_content.apply(calculate_relative_frequency)
    return df

def filter_by_english_thresholds(min_thresh, max_thresh, original_dataset, create_file = False):
    output_dir = os.path.splitext(original_dataset)[0]
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        
    with open(original_dataset, "r") as infile:
        source_df = pd.read_csv(infile, index_col=[0])
        out_df = create_relative_english_frequency_col(source_df)
        out_df = out_df[(min_thresh <= out_df["english_relative_frequency"]) & (out_df["english_relative_frequency"] < max_thresh)]
    
    if create_file: 
        with open(output_dir + f"/{min_thresh}_to_{max_thresh}_english_words.csv", "w") as outfile:
            outfile.write(out_df.to_csv())
            
    return out_df

In [4]:
for i in range(10):
    filter_by_english_thresholds(i/10, (i+1)/10, 'data/Accra/24400_tweets_over_period.csv', create_file = True)
filter_by_english_thresholds(1, 1.01, 'data/Accra/24400_tweets_over_period.csv')

Unnamed: 0,id,date,rawContent,cleaned_content,english_relative_frequency
2,3,2022-01-01 23:59:12+00:00,@kay_bee28 😂😂,,1.0
3,4,2022-01-01 23:59:07+00:00,@shattawalegh Sell Out,sell out,1.0
6,7,2022-01-01 23:58:29+00:00,Happy new year baby ❤️😍 https://t.co/xAG3pa8O86,happy new year baby,1.0
7,8,2022-01-01 23:58:16+00:00,@Airkuya Jon😆,jon,1.0
9,10,2022-01-01 23:57:45+00:00,The space way time them go close?,the space way time them go close,1.0
...,...,...,...,...,...
94,24295,2022-08-31 23:13:51+00:00,#GodDid https://t.co/x1cFRzDCVa,,1.0
95,24296,2022-08-31 23:13:46+00:00,@jordan1zz Okay sure thank you,okay sure thank you,1.0
96,24297,2022-08-31 23:13:38+00:00,@cfcjakes @okoyeKennedy4 @Harriso58693132 @Abs...,he signed this season rather,1.0
97,24298,2022-08-31 23:13:37+00:00,@1RealJunior_ I am waiting,i am waiting,1.0
