In [2]:
import os
import pandas as pd
import csv
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Only the first CSV row is used as the word list; later rows are never read.
# newline='' is what the csv module asks for so that line breaks inside quoted
# fields are handled by the reader rather than by the file object.
with open('english_words_corpora/english_corpus.csv', 'r', newline='') as read_obj:
    # Pull just the first row instead of materialising the whole file.
    english_words_list = next(csv.reader(read_obj), None)

if english_words_list is None:
    # An empty corpus would make every downstream score meaningless, so stop
    # here with a clear message rather than an IndexError.
    raise ValueError("english_words_corpora/english_corpus.csv contains no rows")

# A set gives fast membership tests in the per-word lookup done later.
english_words_set = set(english_words_list)


def create_relative_english_frequency_col(df):
    """Add an ``english_relative_frequency`` column to ``df`` and return it.

    For each row, the value in ``cleaned_content`` is split on whitespace.
    Tokens that are not purely alphabetic are ignored; the score is the share
    of the remaining tokens whose lemma is found in ``english_words_set``.
    Missing content (NaN/None) and content with no alphabetic token score 0.0.

    Args:
        df: DataFrame with a ``cleaned_content`` column. It is modified in
            place (the new column is assigned on this object).

    Returns:
        The same DataFrame object that was passed in, with the new column.
    """
    def calculate_relative_frequency(content):
        # A missing value would otherwise be stringified to "nan" and scored
        # as if the row consisted of that single word.
        if pd.api.types.is_scalar(content) and pd.isna(content):
            return 0.0
        tokens = [word for word in str(content).split() if word.isalpha()]
        if not tokens:
            # Nothing to score; also avoids dividing by zero below.
            return 0.0
        english_count = sum(
            lemmatizer.lemmatize(word) in english_words_set for word in tokens
        )
        return english_count / len(tokens)

    df["english_relative_frequency"] = df.cleaned_content.apply(calculate_relative_frequency)
    return df

def filter_by_english_thresholds(min_thresh, max_thresh, original_dataset, create_file = False):
    """Return the rows of a CSV whose English-word share lies in [min_thresh, max_thresh).

    The dataset is read with its first column as the index, scored with
    ``create_relative_english_frequency_col``, and filtered on the resulting
    ``english_relative_frequency`` column.

    Args:
        min_thresh: Inclusive lower bound on ``english_relative_frequency``.
        max_thresh: Exclusive upper bound on ``english_relative_frequency``.
        original_dataset: Path to the source CSV file.
        create_file: When true, the filtered rows are also written to
            ``<dataset path without extension>/<min>_to_<max>_english_words.csv``.
            The directory is created only in this case.

    Returns:
        The filtered DataFrame, including the ``english_relative_frequency`` column.
    """
    # Let pandas open the path itself: it decodes as UTF-8 by default instead
    # of the platform locale encoding that a bare open() would use.
    source_df = pd.read_csv(original_dataset, index_col=[0])
    scored_df = create_relative_english_frequency_col(source_df)
    frequency = scored_df["english_relative_frequency"]
    out_df = scored_df[(min_thresh <= frequency) & (frequency < max_thresh)]

    if create_file:
        # The output directory sits next to the dataset and shares its base name.
        output_dir = os.path.splitext(original_dataset)[0]
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(output_dir, exist_ok=True)
        out_path = os.path.join(output_dir, f"{min_thresh}_to_{max_thresh}_english_words.csv")
        # Writing through pandas handles the encoding and line endings itself,
        # rather than pushing a to_csv() string through a text-mode file.
        out_df.to_csv(out_path)

    return out_df

In [3]:
# Write one CSV per tenth-wide band of English-word share: [0.0, 0.1), ..., [0.9, 1.0).
kingston_tweets = 'data/Kingston/24400_tweets_over_period.csv'
for band in range(10):
    lower_bound = band / 10
    upper_bound = (band + 1) / 10
    filter_by_english_thresholds(lower_bound, upper_bound, kingston_tweets, create_file = True)
# The half-open bands above exclude a share of exactly 1; this last call covers
# [1, 1.01) without writing a file, and its result is what the cell displays.
filter_by_english_thresholds(1, 1.01, kingston_tweets)

Unnamed: 0,id,date,rawContent,cleaned_content,english_relative_frequency
2,3,2022-01-01 23:56:48+00:00,@Dark_Emperorr A who fi cook it?,a who fi cook it,1.0
3,4,2022-01-01 23:53:04+00:00,Very reassuring and genuine. A rallying cry fo...,very reassuring and genuine a rallying cry for...,1.0
5,6,2022-01-01 23:51:04+00:00,Excellent #PNPSTRONG with @MarkJGolding . http...,excellent with,1.0
6,7,2022-01-01 23:50:41+00:00,Just remembered today is not Sunday. Was looki...,just remembered today is not sunday was lookin...,1.0
10,11,2022-01-01 23:47:58+00:00,Book now ⛱️🏝️ https://t.co/BmNHqd3dPg,book now,1.0
...,...,...,...,...,...
93,24294,2022-08-31 22:37:44+00:00,Escape plan https://t.co/gb8AlfMiTC,escape plan,1.0
94,24295,2022-08-31 22:36:33+00:00,The Grey https://t.co/EsChAtTh3Y,the grey,1.0
95,24296,2022-08-31 22:34:49+00:00,@MayDenn88559924 @jayjaybrown97 Not really the...,not really the uk 5 note changed to polymer in...,1.0
97,24298,2022-08-31 22:33:17+00:00,@indramistress meeee !!!!,meeee,1.0
