# Comparing Groups Function Building

Below, I build a function that compares two corpuses by providing statistics for each one based on its tokens that provide length, lexical diversity, unique tokens, top words used, and ratios that look at how often one word appears in corpus one compared to corpus two.

In [1]:
import nltk
from collections import Counter
from nltk.corpus import stopwords
from nltk import FreqDist

sw = stopwords.words("english")

In [2]:
def lexical_diversity(text):
    return len(set(text)) / len(text)

Below is the function that will process two different corpuses and provide some basic comparisons

In [3]:
def compare_groups(corpus_1, corpus_2, num_words, ratio_cutoff):
    
    ##Creating a dictionary for the results
    results = {'one': [],
               'two': [],
               'one_vs_two': {'ratios': []},
               'two_vs_one': {'ratios':[]}}
    
    #Opening each file
    open_corpus1 = open(corpus_1, "r", encoding = "Latin-1")
    open_corpus2 = open(corpus_2, "r", encoding = "Latin-1")
    
    read_c1 = open_corpus1.read()
    read_c2 = open_corpus2.read()
    
    tokenize_c1 = nltk.word_tokenize(read_c1)
    tokenize_c2 = nltk.word_tokenize(read_c2)
    
    ##Cleaning the tokens
    c1_clean_tokens = []
    c2_clean_tokens = []
    
    for tokens in tokenize_c1:
        if tokens not in sw and tokens.isalpha():
            c1_clean_tokens.append(tokens.lower())
            
    for tokens in tokenize_c2:
        if tokens not in sw and tokens.isalpha():
            c2_clean_tokens.append(tokens.lower())
    
    #Basic Statistics
    unique_tokens_c1= len(set(c1_clean_tokens))
    unique_tokens_c2= len(set(c2_clean_tokens))
                   
    lexical_diversity_c1= lexical_diversity(c1_clean_tokens)
    lexical_diversity_c2= lexical_diversity(c2_clean_tokens)
                   
    top_words_c1= Counter(c1_clean_tokens).most_common(num_words)
    top_words_c2= Counter(c2_clean_tokens).most_common(num_words)

    all_lengths_1 = []
    num_of_strings_1 = len(c1_clean_tokens)

    for item in c1_clean_tokens:
        string_size = len(item)
        all_lengths_1.append(string_size)
        total_size = sum(all_lengths_1)
    ave_size_1 = float(total_size) / float(num_of_strings_1)              
    
    all_lengths_2=[]
    num_of_strings_2=len(c2_clean_tokens)
    
    for item in c2_clean_tokens:
        string_size = len(item)
        all_lengths_2.append(string_size)
        total_size= sum(all_lengths_2)
    ave_size_2 = float(total_size) / float(num_of_strings_2)
    
    #Trying to generate num_words amount of words with highest one v two ratios
    ratio_cutoff_words_1=set()               
    for words in c1_clean_tokens : ##Only words that are used a certain amount of times are allowed
        if c1_clean_tokens.count(words) >= ratio_cutoff:
            
            ratio_cutoff_words_1.add(words)
    
    freq_list_1=[] 
    for word in ratio_cutoff_words_1: ##Create a new list of times each word appears divided by length of clean tokens
                 
        p1=((c1_clean_tokens.count(word))/len(c1_clean_tokens))
        item=word,p1
        freq_list_1.append(item)
           
    ratio_cutoff_words_2 = set()       
    for words in c2_clean_tokens: ##Only words that are used a certain amount of times are allowed
        if c2_clean_tokens.count(words) >= ratio_cutoff:
            
            ratio_cutoff_words_2.add(words)
    
    freq_list_2=[] 
    for word in ratio_cutoff_words_2:  ##Create a new list of times each word appears divided by length of clean tokens
               
        p2=((c2_clean_tokens.count(word))/len(c2_clean_tokens))
        item=word,p2
        freq_list_2.append(item)
        
    ##Now I have two lists that contain each corpus' words and their frequencies
    ##I need to make a new list that divides each frequency of each word
    #in corpus 1 divided by freq in corpus 2
    
    
    
    one_vs_two_ratios=[]
    
    for word1,freq1 in freq_list_1:
        for word2,freq2 in freq_list_2:
            if word1==word2:
                ratio=freq1/freq2
                pair=word2,ratio
                one_vs_two_ratios.append(pair)
        
    ##Taking num_words amount of the highest ratio words        
    one_vs_two_ratios.sort(reverse=True)
    
    cutoff_words=one_vs_two_ratios[0:num_words]
    
    results['one_vs_two']= {'ratios':cutoff_words}   
    
    
    ##Doing the same for two vs one
    
    two_vs_one_ratios=[]
    
    for word1,freq1 in freq_list_1:
        for word2,freq2 in freq_list_2:
            if word1==word2:
                ratio=freq2/freq1
                pair=word2,ratio
                two_vs_one_ratios.append(pair)
        
    ##Taking num_words amount of the highest ratio words        
    two_vs_one_ratios.sort(reverse=True)
    
    cutoff_words2=two_vs_one_ratios[0:num_words]
    
    
    ##Inserting the summary stats into the dictionary
    
    
    results['two_vs_one']= {'ratios':cutoff_words2}
                   
    results['one'] = {"tokens": [len(c1_clean_tokens)],
                     "unique_tokens": [unique_tokens_c1],
                     "avg_token_length": [ave_size_1],
                     "lexical_diversity": [lexical_diversity_c1],
                     "top_words": [top_words_c1]}
    results['two'] = {"tokens": [len(c2_clean_tokens)],
                      "unique_tokens": [unique_tokens_c2],
                      "avg_token_length": [ave_size_2],
                      "lexical_diversity": [lexical_diversity_c2],
                      "top_words": [top_words_c2]}               
   

    return(results)

# Comparing Republican/Democratic Conventions

In [4]:
ratio_cutoff=5
num_words=10

In [5]:
file_1="Combined_Republican.txt"

file_2="Combined_Democratic.txt"

In [6]:
compare_groups(file_1,file_2, num_words, ratio_cutoff)

{'one': {'tokens': [48677],
  'unique_tokens': [6882],
  'avg_token_length': [5.854797953859112],
  'lexical_diversity': [0.1413809396634961],
  'top_words': [[('i', 1177),
    ('trump', 751),
    ('president', 742),
    ('america', 426),
    ('we', 347),
    ('american', 335),
    ('people', 304),
    ('donald', 295),
    ('country', 288),
    ('the', 286)]]},
 'two': {'tokens': [43545],
  'unique_tokens': [6118],
  'avg_token_length': [5.6596624181880815],
  'lexical_diversity': [0.1404983350556895],
  'top_words': [[('i', 910),
    ('joe', 703),
    ('biden', 644),
    ('us', 448),
    ('president', 398),
    ('and', 373),
    ('we', 333),
    ('speaker', 329),
    ('people', 305),
    ('america', 253)]]},
 'one_vs_two': {'ratios': [('your', 1.2779576156530366),
   ('young', 0.5659526583606304),
   ('you', 1.059721776672287),
   ('york', 3.354638741089221),
   ('yet', 2.546084788108742),
   ('yes', 0.4472851654785628),
   ('years', 2.1876310820678797),
   ('year', 0.6890609306021103

# Comparing Democratic Convention to big.txt

In [7]:
ratio_cutoff=5
num_words=5

In [8]:
file_1="big.txt"

file_2="Combined_Democratic.txt"

In [9]:
compare_groups(file_1,file_2, num_words, ratio_cutoff)

{'one': {'tokens': [582554],
  'unique_tokens': [28560],
  'avg_token_length': [6.083523587512917],
  'lexical_diversity': [0.049025498065415396],
  'top_words': [[('i', 7632),
    ('the', 7134),
    ('said', 3463),
    ('one', 3299),
    ('may', 2549)]]},
 'two': {'tokens': [43545],
  'unique_tokens': [6118],
  'avg_token_length': [5.6596624181880815],
  'lexical_diversity': [0.1404983350556895],
  'top_words': [[('i', 910),
    ('joe', 703),
    ('biden', 644),
    ('us', 448),
    ('president', 398)]]},
 'one_vs_two': {'ratios': [('your', 1.5910738379127576),
   ('young', 0.9534239201298882),
   ('you', 1.07062758922844),
   ('york', 1.8219931113682166),
   ('yet', 2.8059412650025704)]},
 'two_vs_one': {'ratios': [('your', 0.6285063434339697),
   ('young', 1.048851385922609),
   ('you', 0.9340315998401101),
   ('york', 0.5488494955115693),
   ('yet', 0.3563866473160421)]}}