In [2]:
import os
import pandas as pd
import csv
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
english_words_list = []

with open('english_words_corpora/english_corpus.csv', 'r') as read_obj:
    csv_reader = csv.reader(read_obj)
    list_of_csv = list(csv_reader)
    english_words_list = list_of_csv[0]      
english_words_set = set(english_words_list)


def create_relative_english_frequency_col(df):
    def calculate_relative_frequency(content):
        english_word_bools = [lemmatizer.lemmatize(word) in english_words_set for word in str(content).split() if word.isalpha()]
        if not english_word_bools:
            return 0
        return english_word_bools.count(True)/len(english_word_bools) 
    df["english_relative_frequency"] = df.cleaned_content.apply(calculate_relative_frequency)
    return df

def filter_by_english_thresholds(min_thresh, max_thresh, original_dataset, create_file = False):
    output_dir = os.path.splitext(original_dataset)[0]
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        
    with open(original_dataset, "r") as infile:
        source_df = pd.read_csv(infile, index_col=[0])
        out_df = create_relative_english_frequency_col(source_df)
        out_df = out_df[(min_thresh <= out_df["english_relative_frequency"]) & (out_df["english_relative_frequency"] < max_thresh)]
    
    if create_file: 
        with open(output_dir + f"/{min_thresh}_to_{max_thresh}_english_words.csv", "w") as outfile:
            outfile.write(out_df.to_csv())
            
    return out_df

In [5]:
for i in range(10):
    filter_by_english_thresholds(i/10, (i+1)/10, 'data/New Delhi/24400_tweets_over_period.csv', create_file = True)
filter_by_english_thresholds(1, 1.01, 'data/New Delhi/24400_tweets_over_period.csv')

Unnamed: 0,id,date,rawContent,cleaned_content,english_relative_frequency
0,1,2022-01-01 23:53:54+00:00,https://t.co/6G6D6MPByk,,1.0
3,4,2022-01-01 23:51:34+00:00,6. Please share as much as you can. https://t....,6 please share as much as you can,1.0
9,10,2022-01-01 23:48:25+00:00,बग्गा जी मुझे कांग्रेस ने कुछ करने ही नहीं दिय...,,1.0
11,12,2022-01-01 23:45:03+00:00,https://t.co/JAuCPf05Pc,,1.0
12,13,2022-01-01 23:44:40+00:00,3. Please share as much as you can. https://t....,3 please share as much as you can,1.0
...,...,...,...,...,...
93,24294,2022-08-31 22:22:38+00:00,@JeffRolandMagic @00point1 @OhHungryArtist The...,the colors are magnificent and purity of the w...,1.0
94,24295,2022-08-31 22:22:36+00:00,Do I really like him or do I like the idea of ...,do i really like him or do i like the idea of him,1.0
96,24297,2022-08-31 22:19:24+00:00,@ColinsDoodles @00point1 @OhHungryArtist Beaut...,beautiful work colin,1.0
98,24299,2022-08-31 22:18:34+00:00,@JeffRolandMagic @00point1 @OhHungryArtist Abs...,absolutely beautiful work jeff,1.0
