In [12]:
def generate_correct_wordset(in_filename, out_filename, mincount, encoding=None):
    ''' Generates a wordset, trimming words that do not meet the required min. occurrences

    A rough method to generate a 'correct' wordset for autocorrect: the least common inputs
    are removed and the most common ones are kept as the 'correct' ones.

    Every line of the input file is one entry. Only the trailing newline is stripped, so
    inner/leading spaces are kept and a blank line counts as the empty-string entry.
    Each kept entry is written to the output file once per occurrence, one per line, so the
    output file preserves the original counts. Entries appear in order of first occurrence
    in the input.

    :param in_filename: input filename, one entry per line
    :param out_filename: output filename (truncated/overwritten if it already exists)
    :param mincount: minimum occurrences for words to be kept in wordset
    :param encoding: text encoding used for both files; None (default) uses the
        platform default, exactly as a plain open() call does

    Returns dict of words kept in the set and their counts
    '''
    from collections import Counter

    # The input is fully consumed and closed before the output is opened, so passing the
    # same path for in_filename and out_filename trims the file in place.
    with open(in_filename, "r", encoding=encoding) as ifile:
        word_count = Counter(line.rstrip('\n') for line in ifile)

    final_wordset = {word: count for word, count in word_count.items() if count >= mincount}

    with open(out_filename, "w", encoding=encoding) as ofile:
        for word, count in final_wordset.items():
            # Repeat the line `count` times in a single write instead of looping.
            ofile.write((word + "\n") * count)

    return final_wordset

In [13]:
if __name__ == '__main__':
    # Keep only the entries that occur at least twice in input.txt, write them to the
    # pledge-semester wordset file, and show the kept entries with their counts.
    kept_wordset = generate_correct_wordset(
        "input.txt",
        "wordsets/pledge_semester.txt",
        2,
    )
    print(kept_wordset)

{'Fall 19': 20, 'Spring 18': 14, 'Spring 19': 11, 'Fall 18': 14, 'Spring 20': 16, 'Spring 17': 11, 'Fall 17': 13, 'Fall 16': 8, 'Fall 2019': 5, 'Fall 2017': 6, 'Spring 2019': 6, 'Fall 2018': 6, 'Spring 2020': 6, 'Spring 2018': 5}
