### Library

In [None]:
import scipy
import nltk
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split 
from sklearn.metrics import log_loss,accuracy_score
from sklearn.metrics import mean_squared_error, r2_score

from processing import accessfile, one, Word_count, table_count, dataFrame_create, distance_between_nouns, distance_between_DET, Word_verbtype_count, tagging, tagging_dataFrame

### File

In [None]:
# File handed to accessfile() at the end of this cell.
filename = 'Odam0829.xml'

# Text titles, kept in three groups; text_list below is their concatenation
# in this exact order, and the later cells index their results by it.
title_list1 = [
    'AFFlores Teneraca1',
    'ELAN 01',
    'ELAN 02',
    'ELAN 03',
    'ELAN 04',
    'ELAN 11',
    'ELAN 14',
    'ELAN 15',
]
title_list2 = [
    "Gu a'lhich ja'tkam",
    "Gu bhiich kulierdam",
    "Gu Bib",
    "Gu chio'ñ gux chuk t+t+'kam",
    "Gu J+b++lh Gio Gu Tanoolh",
    "Gu joob nat bh+m gu tai",
    "Gu Kooxi'",
]
# NOTE(review): '&amp;' is kept verbatim in the title below -- confirm that the
# lookup done by one() really expects the entity-escaped form.
title_list3 = [
    "Gu mamra'n nat mai' ja iobu",
    "Gu naks+r",
    "gu tur",
    "Jix Chuumñigam (AD &amp; GG 2013: 58)",
    "Nat tum sur",
    'Historia de Charcos',
    'Teneraca 5 Familia',
    'Teneraca 7 Varios Cuentos',
]
text_list = [*title_list1, *title_list2, *title_list3]

title_tagging = [
    'AFFlores TAGGING',
    'Gu joob TAGGING',
]

# The pair returned here is what every later cell passes to one(...).
body, namespace = accessfile(filename)

### Word

In [None]:
# Build DET_list: one Word_count(..., "Interlin Word Gloss es", 'DET') result
# per title, in text_list order.
# NOTE(review): presumably a count of determiners per text -- confirm against
# processing.Word_count.
print('DET')
DET_list = []
for title in text_list:
    text = one(body, namespace, title)
    det_count = Word_count(text, namespace, "Interlin Word Gloss es", 'DET')
    DET_list.append(det_count)
    print(title, ':', det_count)

In [None]:
# Build sus_list: one Word_count(..., "Interlin Word POS", 'sus') result per
# title, in text_list order (printed under the heading 'Noun').
print('Noun')
sus_list = []
for title in text_list:
    text = one(body, namespace, title)
    noun_count = Word_count(text, namespace, "Interlin Word POS", 'sus')
    sus_list.append(noun_count)
    print(title, ':', noun_count)

In [None]:
# Build pro_list: one table_count result per title for the
# 'Interlin Morpheme Gloss es' values '1pl' and '2pl'.
# These values are subtracted per text when verb_type_list is computed.
print('1pl, 2pl')
pro_list = []
for title in text_list:
    text = one(body, namespace, title)
    pronoun_count = table_count(
        text, namespace, 'Interlin Morpheme Gloss es', ['1pl', '2pl']
    )
    pro_list.append(pronoun_count)
    print(title, ':', pronoun_count)

In [None]:
# Build verb_list: for every title, the sum of the Word_count results for the
# "Interlin Word POS" values 'v' and 'cop'.
verb_list = []
for title in text_list:
    text = one(body, namespace, title)
    plain_verbs = Word_count(text, namespace, "Interlin Word POS", 'v')
    copulas = Word_count(text, namespace, "Interlin Word POS", 'cop')
    total = plain_verbs + copulas
    verb_list.append(total)
    print(title, ':', total)

In [None]:
# Multiplier for each of the first seven entries returned by
# Word_verbtype_count. The original comment lists the entries as
# vi, vt, cop, v.ctrl, vb, v, Verb.
# NOTE(review): the multipliers look like "arguments per verb type" -- confirm.
verb_type_weights = (1, 2, 1, 2, 3, 1, 1)

# Build verb_type_list: weighted sum of the verb-type results minus the
# matching pro_list entry, one value per title.
verb_type_list = []
for i, title in enumerate(text_list):
    text = one(body, namespace, title)
    type_counts = Word_verbtype_count(text, namespace, "Interlin Word POS")
    weighted = sum(
        type_counts[k] * weight for k, weight in enumerate(verb_type_weights)
    )
    total = weighted - pro_list[i]
    verb_type_list.append(total)
    print(title, ':', total)

In [None]:
# Build post_list: one table_count result per title for the
# 'Interlin Morpheme POS' value 'post'.
post_list = []
for title in text_list:
    text = one(body, namespace, title)
    post_count = table_count(text, namespace, 'Interlin Morpheme POS', ['post'])
    post_list.append(post_count)
    print(title, ':', post_count)

### Text Statistics

In [None]:
# Natural log of every per-text value, still as plain lists.
# A zero entry evaluates to -inf here (NumPy emits a RuntimeWarning).
log_noun_list = list(map(np.log, sus_list))
log_verb_list = list(map(np.log, verb_list))

# One line per series, with the text titles as the x positions.
for series, series_label in ((log_noun_list, 'Nouns'), (log_verb_list, 'Verbs')):
    plt.plot(text_list, series, label=series_label)
plt.xticks(fontsize=8, rotation=90)
plt.legend()

In [None]:
# Grouped bar chart: the sus_list and verb_list values side by side per text.
x = np.arange(len(text_list))  # one bar group per text
width = 0.35  # width of a single bar; each pair is centred on its tick

fig, ax = plt.subplots()
rects1 = ax.bar(x - width / 2, sus_list, width, label='Noun')
rects2 = ax.bar(x + width / 2, verb_list, width, label='Verb')

ax.set_xlabel('Text')
# Same y-label as the DET/sus chart, which plots sus_list on the same axis.
ax.set_ylabel('count')
ax.set_title('Sus and Verb Plot')
ax.set_xticks(x)
# Style the tick labels BEFORE tight_layout: it adjusts the margins once, when
# called, so the rotated labels must already be in place for it to fit them.
ax.set_xticklabels(text_list, fontsize=8, rotation=90)
ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Grouped bar chart: the DET_list and sus_list values side by side per text.
x = np.arange(len(text_list))  # one bar group per text
width = 0.35  # width of a single bar; each pair is centred on its tick

fig, ax = plt.subplots()
rects1 = ax.bar(x - width / 2, DET_list, width, label='DET')
rects2 = ax.bar(x + width / 2, sus_list, width, label='sus')

ax.set_xlabel('Text')
ax.set_ylabel('count')
ax.set_title('DET and sus Plot')
ax.set_xticks(x)
# Style the tick labels BEFORE tight_layout: it adjusts the margins once, when
# called, so the rotated labels must already be in place for it to fit them.
ax.set_xticklabels(text_list, fontsize=8, rotation=90)
ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Grouped bar chart: sus_list ('Noun') next to verb_type_list ('ExpNoun')
# for every text.
x = np.arange(len(text_list))  # one bar group per text
width = 0.35  # width of a single bar; each pair is centred on its tick

fig, ax = plt.subplots()
rects1 = ax.bar(x - width / 2, sus_list, width, label='Noun')
rects2 = ax.bar(x + width / 2, verb_type_list, width, label='ExpNoun')

ax.set_xlabel('Text')
# NOTE(review): the bars show the two series themselves, not their
# difference -- confirm that 'Diff' is the intended y-label.
ax.set_ylabel('Diff')
ax.set_title('Noun vs ExpNoun')
ax.set_xticks(x)
# Style the tick labels BEFORE tight_layout: it adjusts the margins once, when
# called, so the rotated labels must already be in place for it to fit them.
ax.set_xticklabels(text_list, fontsize=8, rotation=90)
ax.legend()

fig.tight_layout()
plt.show()

### Distance

In [None]:
# Pool the distance_between_nouns results of all texts into one flat list,
# printing each text's own result along the way.
dist_text_list = []
for title in text_list:
    text = one(body, namespace, title)
    noun_distances = distance_between_nouns(text, namespace)
    dist_text_list.extend(noun_distances)
    print(title, ':', noun_distances)

In [None]:
from collections import Counter

def remove_outliers(data):
    """Return the values of *data* that lie inside the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of *data* and ``IQR = Q3 - Q1``.
    Values equal to a fence are kept, and the input order is preserved.

    Args:
        data: Iterable of numbers. It is copied into a list first, so a
            one-shot iterator is accepted as well.

    Returns:
        A new list with the in-range values; ``[]`` when *data* is empty.
    """
    values = list(data)
    if not values:
        # np.percentile has no meaningful result for an empty sample.
        return []

    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    return [x for x in values if lower_bound <= x <= upper_bound]

# Drop outlying distances, then tally how often each remaining value occurs.
filtered_dist = remove_outliers(dist_text_list)
counter = Counter(filtered_dist)

# Distinct values and their counts as parallel lists, in matching order.
elements = [*counter]
frequencies = [counter[value] for value in elements]

# Bar chart of the frequency of each distinct value.
plt.figure(figsize=(10, 6))
plt.bar(elements, frequencies, color='skyblue')
plt.title('Distribution of Elements in the List (Outliers Removed)')
plt.xlabel('Elements')
plt.ylabel('Frequency')
plt.xticks(elements)  # one tick per distinct value
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

### Data Processing for Modeling