In [1]:
import re
import pandas as pd
import os
import numpy as np

In [2]:
def head_file(file_path, num_rows):
    ## num_rows has to be a 2 elements list
    with open(file_path, mode='r', newline='') as file:
        row_range=range(num_rows[0],num_rows[1]+1)
        i=0
        for line in file:
            if (i in row_range):               
                   print (line)
            i+=1
    return None

def filter_alphanumeric(word):
#\w matches any alphanumeric character
    merge_words_no_digit=''
    if (word!=''):
        all_match = re.findall('\w+', word)
        all_match = list(filter(None, all_match))
        merge_words=''
        for item in all_match:
            merge_words+=item
        ## Find All unicode, then all non digits
        merge_words=re.findall('[^(\_|\d)]', merge_words)
        for item in merge_words:
            merge_words_no_digit+=item        
    else:
        #print ("Warning, One word is empty.")
        return '' 
    return merge_words_no_digit.lower()

def return_word_list_from_file(Path_File):
    list_words=list()
    with open(Path_File,  mode='r', newline='') as file:
        for line in file:
            for word in (re.split("\s+", line.rstrip('\n'))):
                if (word !=''):
                    list_words.append(filter_alphanumeric(word))
## if different languague, above line has to be changed
    list_words = list(filter(None, list_words))
    return list_words


def return_sentence_list_from_file(Path_File):
    list_sentences=list()
    with open(Path_File,  mode='r', newline='') as file:
        for line in file:
            if (len(line)>1):
                sentence = line.rstrip('\n').rstrip('\r')
                #marked_sentence = #'<^> '+line.rstrip('\n').rstrip('\r')+' </s>'
                list_sentences.append(sentence)
    list_sentences = list(filter(None, list_sentences))
    return list_sentences

def return_vocabulary_from_sentence_list(sentence_list):
    total_vocabulary=set({'^','$'})
    for sentence in sentence_list:
        word_list = re.split("\s+", sentence.rstrip('\n'))
        for word in word_list:
            filtered_word = filter_alphanumeric(word)
            if(filtered_word!=''):
                total_vocabulary.add(filtered_word)
    return sorted(list(total_vocabulary))

In [39]:
def return_unigram_counts(sentence_list, vocabulary):
    count_matrix =np.zeros((len(vocabulary)))
    count_matrix += len(vocabulary) ######## Add-one smoothing
    ## Set value for sentence start <s>
    count_matrix[vocabulary.index('^')] += len(sentence_list)
    count_matrix[vocabulary.index('$')] += len(sentence_list)
    for tem_sentence in sentence_list:
        word_list = re.split("\s+", tem_sentence.rstrip('\n'))
        for word in word_list:
            filtered_word = filter_alphanumeric(word)
            if(filtered_word in vocabulary):
                word_index = vocabulary.index(filtered_word)
                count_matrix[word_index]+=1
    return count_matrix

def Return_c_from_Good_Turing_Smoothing(unigram_count_matrix):
    N_1=0
    N_2=0
    N_words = unigram_count_matrix.sum()
    for x in unigram_count_matrix:
        if(x==1):
            N_1+=1
        elif(x==2):
            N_2+=1
        else:
            continue
    c_for_zero = 1.0*N_1/N_words
    c_for_once  = 1.0*(1+1)*(N_2/N_1)
    return c_for_zero


def return_bigram_word_counts(sentence_list, vocabulary):
    count_matrix=np.zeros((len(vocabulary),len(vocabulary)))
    #count_matrix+=1  ######## Add-one smoothing
    num_word=0
    for tem_sentence in sentence_list:
        word_list = re.split("\s+", tem_sentence.rstrip('\n'))
        word_list = list(filter(None, word_list))
        
        for i in range(0,len(word_list)-1):
            num_word+=1
            if (i==0):
                first_word = filter_alphanumeric(word_list[0])
                if(first_word in vocabulary):
                    count_matrix[vocabulary.index('^'), vocabulary.index(first_word)] += 1
            tem_bigram_count_pairs = word_list[i:i+2]
            first_word=filter_alphanumeric(tem_bigram_count_pairs[0])
            second_word=filter_alphanumeric(tem_bigram_count_pairs[1])
            
            if((first_word in vocabulary) & (second_word in vocabulary)):
                first_digit=vocabulary.index(first_word)
                second_digit=vocabulary.index(second_word)
                count_matrix[first_digit,second_digit]+=1
    print ("Number of Words: "+ str(num_word))
    return count_matrix

def return_Trigram_word_counts(sentence_list, vocabulary):
    count_matrix=np.zeros((len(vocabulary),len(vocabulary)))
    #count_matrix+=1  ######## Add-one smoothing
    num_word=0
    for tem_sentence in sentence_list:
        word_list = re.split("\s+", tem_sentence.rstrip('\n'))
        word_list = list(filter(None, word_list))
        
        for i in range(0,len(word_list)-2):
            num_word+=1
            if (i==0):
                first_word = filter_alphanumeric(word_list[0])
                second_word = filter_alphanumeric(word_list[1])
                if((first_word in vocabulary) & ((second_word in vocabulary))):
                    count_matrix[vocabulary.index(first_word), vocabulary.index(second_word)] += 1
            tem_trigram_count_chain = word_list[i:i+3]
            pre_two_words=tem_trigram_count_chain[0:2]
            
            third_word=filter_alphanumeric(tem_trigram_count_chain[2])
            if((third_word in vocabulary)):
                first_digit=vocabulary.index(pre_two_words)
                second_digit=vocabulary.index(third_word)
                count_matrix[first_digit,second_digit]+=1
    print ("Number of Words: "+ str(num_word))
    return count_matrix


def Norm_Bigram(unigram_counts, bigram_counts, vocabulary):
    #c_0 = Return_c_from_Good_Turing_Smoothing(unigram_counts)
    c_0=1.0
    norm_bigram = np.zeros((len(vocabulary),len(vocabulary))) 
    for i in range(len(vocabulary)):
        norm_bigram[i,:]=(bigram_counts[i,:]+c_0)/(unigram_counts[i])
    return norm_bigram

def Norm_Trigram(unigram_counts, bigram_counts, trigram_counts, vocabulary):
    #c_0 = Return_c_from_Good_Turing_Smoothing(unigram_counts)
    norm_trigram = np.zeros((len(vocabulary),len(vocabulary)))
    for i in range(len(vocabulary)):
            norm_trigram[i,:]=(norm_trigram[i,:]+c_0)/(bigram_counts[i])
    return norm_trim
    

In [49]:
def return_Probability_of_Sentence_Word(sentence, norm_bigram, vocabulary, p_unseen):
    word_list = re.split("\s+", sentence.rstrip('\n'))
    Log_Probability_sentence = 0 
    ## Filter Null Element
    word_list = list(filter(None, word_list))
    
    for i in range(0,len(word_list)-1):
        if (i==0):
            first_word=filter_alphanumeric(word_list[0])
            if(first_word in vocabulary):
                Log_Probability_sentence += np.log(norm_bigram[vocabulary.index('^'), vocabulary.index(first_word)])
            else:
                ## assuming new word as a probability of 1/V
                ## Good Turing Smoothing for zero count
                Log_Probability_sentence += np.log(p_unseen) # np.log(1/len(vocabulary)**2) #
        tem_bigram_count_pairs = word_list[i:i+2]
        first_word=filter_alphanumeric(tem_bigram_count_pairs[0])
        second_word=filter_alphanumeric(tem_bigram_count_pairs[1])
        if((first_word in vocabulary) & (second_word in vocabulary)):
            first_index=vocabulary.index(first_word)
            second_index=vocabulary.index(second_word)
            prob_word_pair=norm_bigram[first_index, second_index]
            if (prob_word_pair!=0):
                Log_Probability_sentence += np.log(prob_word_pair)
            else:
                Log_Probability_sentence += np.log(p_unseen)
            #print (first_word + " "+ second_word )
            #print (Log_Probability_sentence)
        else:
            ## assuming new word as a probability of 1/V
            Log_Probability_sentence += np.log(p_unseen) #np.log(1/len(vocabulary)**2)
    return np.e**Log_Probability_sentence

def return_Probability_of_Sentence_trigram_Word(sentence, norm_bigram, vocabulary, p_unseen):
    word_list = re.split("\s+", sentence.rstrip('\n'))
    Log_Probability_sentence = 0 
    ## Filter Null Element
    word_list = list(filter(None, word_list))
    
    for i in range(0,len(word_list)-2):
        if (i==0):
            first_word=filter_alphanumeric(word_list[0])
            second_word=filter_alphanumeric(word_list[1])
            if ((first_word in vocabulary) & (second_word in vocabulary)):
                Log_Probability_sentence += np.log(norm_bigram[vocabulary.index('^'), vocabulary.index(first_word)])
                Log_Probability_sentence += np.log(norm_bigram[vocabulary.index(first_word),vocabulary.index(second_word)])
            else:
                Log_Probability_sentence += 2*np.log(p_unseen) # np.log(1/len(vocabulary)**2) #
        tem_trigram_count_pairs = word_list[i:i+3]
        first_word=filter_alphanumeric(tem_trigram_count_pairs[0])
        second_word=filter_alphanumeric(tem_trigram_count_pairs[1])
        third_word=filter_alphanumeric(tem_trigram_count_pairs[2])
        if((first_word in vocabulary) & (second_word in vocabulary) & (third_word in vocabulary)):
            first_index=vocabulary.index(first_word)
            second_index=vocabulary.index(second_word)
            third_index=vocabulary.index(third_word)                                                  
            Log_Probability_sentence += np.log(norm_bigram[first_index,second_index])    
            Log_Probability_sentence += np.log(norm_bigram[second_index,third_index])       
        else:
            Log_Probability_sentence += 2*np.log(p_unseen) #np.log(1/len(vocabulary)**2)
    return np.e**Log_Probability_sentence


def return_probability(Path_Test_Data, norm_bigram, vocabulary):
    count_matrix=list()
    P_unseen=norm_bigram.min()
    with open(Path_Test_Data,  mode='r', newline='') as file:
        i=0
        for line in file:
            results= return_Probability_of_Sentence_trigram_Word(line, norm_bigram, vocabulary, P_unseen)
            count_matrix.append(results)
            i+=1
    return count_matrix



In [109]:
## Main
def test_with_word_model(path_train_data, path_test_data):  
    ## first generate sentence list
    sentence_list = return_sentence_list_from_file(path_train_data)
    ## building vocabulary
    vocabulary = return_vocabulary_from_sentence_list(sentence_list)

    raw_unigram_word_counts = return_unigram_counts(sentence_list, vocabulary)
    raw_bigram_word_counts  = return_bigram_word_counts(sentence_list,vocabulary)   
    norm_bigram_word_counts = Norm_Bigram(raw_unigram_word_counts, raw_bigram_word_counts, vocabulary)
    
    # test data output
    prob = return_probability(path_test_data, norm_bigram_word_counts, vocabulary)
    return prob

def Save_Results(En_prob,FR_prob,GR_prob, NAME):
    data_prob_output = np.transpose([En_prob,FR_prob,GR_prob])
    index_LANG=['EN','FR','GR']
    #out_index=list()
    file1 = open('Results_'+ NAME + '_Model.txt',"w")
    file1.writelines(['ID','LANG'])
    i=1
    for prob_row in data_prob_output[:]:
        index_l = list(prob_row).index(max(prob_row))
        #out_index.append([i,index_LANG[index_l]])
        file1.writelines([str(i),index_LANG[index_l]]) 
        i+=1
    file1.close() #to change file access modes
    
    #pd.DataFrame(out_index,columns=['ID','LANG']).set_index('ID').to_csv('Results_'+ NAME + '_Model.txt', sep="\t")
    print("Output Results can be found at the current directory!")
    print("Output Name is: "+ 'Results_'+ NAME + '_Model.txt')
    return data_prob_output

In [None]:
file1 = open("myfile.txt","w") 
L = ["ID\t","LANG\n"]  
  
# \n is placed to indicate EOL (End of Line) 

file1.writelines(["a\t",'b']) 
file1.close() #to change file access modes 

In [110]:
Path_Data='Data/'
Train_Data_Set=['EN.txt', 'FR.txt', 'GR.txt']
Path_test_data = Path_Data+'LangID.test.txt'
All_prob=list()
for name_train in Train_Data_Set:
    Path_train_data = Path_Data+name_train
    All_prob.append(test_with_word_model(Path_train_data, Path_test_data))


NAME='Word_Trigram'
output = Save_Results(All_prob[0],All_prob[1],All_prob[2], NAME)

Number of Words: 30142
Number of Words: 34062
Number of Words: 26913
Output Results can be found at the current directory!
Output Name is: Results_Word_Trigram_Model.txt


In [10]:
Path_Data='Data/'
Train_Data_Set=['EN.txt', 'FR.txt', 'GR.txt']
Path_Train_Data=Path_Data+Train_Data_Set[0]
Path_test_data = Path_Data+'LangID.test.txt'

sentence_list=return_sentence_list_from_file(Path_Train_Data)
vocabulary = return_vocabulary_from_sentence_list(sentence_list)

In [None]:
vocabulary[2:4]

In [5]:
Path_Data='Data/'
Train_Data_Set=['EN.txt', 'FR.txt', 'GR.txt']
Path_Train_Data=Path_Data+Train_Data_Set[0]
Path_test_data = Path_Data+'LangID.test.txt'

sentence_list=return_sentence_list_from_file(Path_Train_Data)
vocabulary = return_vocabulary_from_sentence_list(sentence_list)

raw_unigram_word_counts =return_unigram_counts(sentence_list,vocabulary)
raw_bigram_word_counts  =return_bigram_word_counts(sentence_list,vocabulary)
raw_trigram_word_counts =return_Trigram_word_counts(sentence_list,vocabulary)
#norm_bigram_word_counts = Norm_Bigram(raw_unigram_word_counts, raw_bigram_word_counts, vocabulary)

norm_trigram_word_counts = Norm_Trigram(raw_unigram_word_counts, raw_bigram_word_counts, raw_trigram_word_counts, vocabulary)

#Return_c_from_Good_Turing_Smoothing(raw_unigram_word_counts)


Number of Words: 30142


MemoryError: Unable to allocate array with shape (4402, 4402, 4402) and data type float64

In [64]:
Path='Data/'
xx = os.listdir(Path)
print (xx)
df_letter = pd.read_csv(Path+'Results_Letter_Bigram_Model.txt', sep="\t").rename(columns={'LANG':'LANG_letter'})
df_Word = pd.read_csv(Path+'Results_Word_Bigram_Model.txt', sep="\t").rename(columns={'LANG':'LANG_Word'})
df_Word_Good = pd.read_csv(Path+'Results_Word_Bigram_Good_Turing_Smoothing_Model.txt', sep="\t").rename(columns={'LANG':'LANG_Word_Good_Turing'})
df_reference = pd.read_csv(Path+'LangID.gold.txt', sep=" ").rename(columns={'LANG':'LANG_ref'})
df_trigram = pd.read_csv(Path+'Results_Word_Trigram_Model.txt', sep="\t").rename(columns={'LANG':'LANG_Trigram'})
df_reference.head()

['EN.txt', 'FR.txt', 'GR.txt', 'LangID.gold.txt', 'LangID.test.txt', 'Letter_Bigram_Model.py', 'Results_Letter_Bigram_Model.txt', 'Results_Word_Bigram_Good_Turing_Smoothing_Model.txt', 'Results_Word_Bigram_Model.txt', 'Results_Word_Trigram_Model.txt', 'run_all_python_scripts_for_HW1.sh', 'Word_Bigram_Model.py', 'Word_Bigram_Model_bp.py', 'Word_Bigram_Model_Good_Turing_Smoothing.py', 'Word_Trigram_Model.py']


Unnamed: 0,ID,LANG_ref
0,1.0,EN
1,2.0,EN
2,3.0,EN
3,4.0,EN
4,5.0,EN


In [65]:
df_sum = df_reference.merge(df_Word.merge(df_Word_Good.merge(df_letter.merge(df_trigram, on='ID'), on='ID'), on='ID'), on='ID')
df_sum.head()

Unnamed: 0,ID,LANG_ref,LANG_Word,LANG_Word_Good_Turing,LANG_letter,LANG_Trigram
0,1.0,EN,EN,EN,EN,EN
1,2.0,EN,EN,GR,EN,EN
2,3.0,EN,EN,GR,GR,EN
3,4.0,EN,EN,EN,EN,EN
4,5.0,EN,EN,EN,EN,EN


In [66]:
print (len(df_sum[df_sum['LANG_ref']==df_sum['LANG_letter']])/150.0)

print (len(df_sum[df_sum['LANG_ref']==df_sum['LANG_Word']])/150.0)

print (len(df_sum[df_sum['LANG_ref']==df_sum['LANG_Word_Good_Turing']])/150.0)

print (len(df_sum[df_sum['LANG_ref']==df_sum['LANG_Trigram']])/150.0)

0.8666666666666667
0.9733333333333334
0.92
0.7733333333333333


In [96]:
#4402**3/1024**3
np.zeros((4402,4402,4402))

MemoryError: Unable to allocate array with shape (4402, 4402, 4402) and data type float64

In [93]:
Accuracy=[1.0, len(df_sum[df_sum['LANG_ref']==df_sum['LANG_letter']])/150.0, len(df_sum[df_sum['LANG_ref']==df_sum['LANG_Word']])/150.0,
        len(df_sum[df_sum['LANG_ref']==df_sum['LANG_Word_Good_Turing']])/150.0, len(df_sum[df_sum['LANG_ref']==df_sum['LANG_Trigram']])/150.0]
Accuracy
#df.append({'foo':1, 'bar':2}, ignore_index=True)
df_sum.set_index('ID').append({'LANG_ref':1.0, 'LANG_Word':len(df_sum[df_sum['LANG_ref']==df_sum['LANG_Word']])/150.0 , 
                               'LANG_Word_Good_Turing':len(df_sum[df_sum['LANG_ref']==df_sum['LANG_Word_Good_Turing']])/150.0 ,
                               'LANG_Trigram':len(df_sum[df_sum['LANG_ref']==df_sum['LANG_Trigram']])/150.0,
                               'LANG_letter':len(df_sum[df_sum['LANG_ref']==df_sum['LANG_letter']])/150.0 
                              },ignore_index=True).to_csv('All_Models_Summary.txt', sep='\t', index=None)

#### Here I stop