# Corpus YouTube

## 1. Lectura de datos!

In [1]:
## abrimos los archivos

import glob

lista_files = glob.glob('YTessays/*.txt')
corpus = []

for file in lista_files:
    with open(file, 'r', encoding='utf8') as f:
        corpus+=[f.read()]

In [2]:
## número de textos

len(corpus)

24

In [3]:
corpus[0][:250]

'("Faceshopping" by Sophie)\n♪ My face is the front of shop ♪\n♪ My face is the real shop front ♪\n♪ My shop is the face I front ♪\n♪ I\'m real when I shop my face ♪\n♪ Artificial bloom ♪\n♪ Hydroponic skin ♪\n♪ Chemical release ♪\n♪ Synthesize the real ♪\n(upb'

## 2. Preprocesamiento de los datos

In [4]:
## librerias

import spacy
import nltk
import string
from nltk import sent_tokenize
import numpy as np

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pauba\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
## función para remover () y []

def a(test_str):
    ret = ''
    skip1c = 0
    skip2c = 0
    for i in test_str:
        if i == '[':
            skip1c += 1
        elif i == '(':
            skip2c += 1
        elif i == ']' and skip1c > 0:
            skip1c -= 1
        elif i == ')'and skip2c > 0:
            skip2c -= 1
        elif skip1c == 0 and skip2c == 0:
            ret += i
    return ret

In [7]:
## aplicamos la función "a" a cada texto del corpus

corpus = [a(c) for c in corpus]

In [8]:
corpus[0][:250]

"\n♪ My face is the front of shop ♪\n♪ My face is the real shop front ♪\n♪ My shop is the face I front ♪\n♪ I'm real when I shop my face ♪\n♪ Artificial bloom ♪\n♪ Hydroponic skin ♪\n♪ Chemical release ♪\n♪ Synthesize the real ♪\n - Hey guys, it's Natalie,\nwel"

In [9]:
## dividimos por salto de linea cada texto

corpus = [c.replace('♪','').split('\n') for c in corpus]

In [10]:
corpus[0][:5]

['',
 ' My face is the front of shop ',
 ' My face is the real shop front ',
 ' My shop is the face I front ',
 " I'm real when I shop my face "]

In [11]:
## eliminamos strings vacíos (cada texto)

corpus = [[sentence.strip() for sentence in c if len(sentence)>0] for c in corpus]

In [12]:
## volvemos a juntar las oraciones

corpus = [' '.join(c) for c in corpus]

In [13]:
corpus[0][:250]

"My face is the front of shop My face is the real shop front My shop is the face I front I'm real when I shop my face Artificial bloom Hydroponic skin Chemical release Synthesize the real - Hey guys, it's Natalie, welcome back to my channel. Today I'm"

In [14]:
## dividimos en oraciones cada texto

sentence_list = []

for texto in corpus:
    sentences = sent_tokenize(texto)
    sentence_list+=[sentences]

In [15]:
len(sentence_list)

24

In [16]:
sentence_list[0][:5]

["My face is the front of shop My face is the real shop front My shop is the face I front I'm real when I shop my face Artificial bloom Hydroponic skin Chemical release Synthesize the real - Hey guys, it's Natalie, welcome back to my channel.",
 "Today I'm gonna do a makeup tutorial, as always, but first, story time, story time, story time!",
 "I just wanna be upfront with you guys and let you know that I've had some facial surgery.",
 "I'm always gonna be honest with you guys about this kind of thing because you mean so much to me.",
 "Like you've been here with me since the beginning, and you've seen my story, my whole entire journey, this journey I've been on as a transgender woman."]

In [17]:
## filtramos por textos con al menos 2 oraciones

sentence_list = [s for s in sentence_list if len(s)>1]

In [18]:
len(sentence_list)

23

In [19]:
#numero de oraciones por texto

numero_oraciones = sum([len(s) for s in sentence_list])
promedio_oraciones_por_texto = np.mean([len(s) for s in sentence_list])

In [20]:
numero_oraciones, promedio_oraciones_por_texto

(7355, 319.7826086956522)

In [21]:
## tokens y types

tokens = []

for texto in sentence_list:
    for sentence in texto:
        tokens+=sentence.split(' ')

tokens = [w.lower() for w in tokens]

In [22]:
## número de tokens y types en el corpus

len(tokens),len(set(tokens))

(142029, 17593)

In [23]:
# número de oraciones por texto
num_sen = [len(s) for s in sentence_list]
print(num_sen)

[322, 1041, 915, 73, 170, 128, 259, 264, 170, 333, 272, 367, 369, 424, 310, 425, 222, 317, 173, 318, 122, 201, 160]


## análisis!

### 1. Nominalizaciones
El trabajo aquí es con listas de oraciones. Extraemos los sustantivos :)

In [24]:
!pip install spacy
!spacy download en



In [25]:
oraciones_nlp = []

In [26]:
## identificamos el lema y el pos

import spacy
nlp = spacy.load("en_core_web_sm")

for sentence_text in sentence_list:
    if len(sentence_text)>1:
        sent = []
        for sentence in sentence_text:
            doc = nlp(sentence)
            sent += [(token.lemma_.lower(),token.pos_) for token in doc]
        oraciones_nlp += [sent]

In [27]:
len(oraciones_nlp)

23

In [28]:
def number_tokens_types(oraciones):
    number=[pair[0] for pair in oraciones]
    return len(number),len(set(number)),len(set(number))/len(number)*100

In [29]:
## número de tokens y types para los 23 textos 

for oraciones in oraciones_nlp:
    print(number_tokens_types(oraciones))

(6448, 1301, 20.176799007444167)
(20874, 2303, 11.032863849765258)
(16843, 2277, 13.518969304755684)
(2089, 425, 20.34466251795117)
(2998, 529, 17.6450967311541)
(2792, 576, 20.630372492836678)
(8217, 1496, 18.20615796519411)
(7271, 1251, 17.2053362673635)
(3523, 660, 18.734033494181094)
(6488, 1253, 19.31257706535142)
(5241, 1013, 19.328372448006107)
(8595, 1507, 17.53344968004654)
(8649, 1441, 16.660885651520406)
(8296, 1256, 15.139826422372227)
(7519, 1532, 20.37504987365341)
(10654, 1488, 13.966585320067582)
(6639, 1086, 16.357885223678263)
(9705, 1423, 14.662545079855743)
(3206, 796, 24.828446662507798)
(8006, 1033, 12.902822882837873)
(4441, 867, 19.52263003827967)
(4947, 926, 18.718415201132)
(3566, 837, 23.471676948962422)


In [30]:
def number_nouns(oraciones):
    number = [pair for pair in oraciones if pair[1]=='NOUN']
    return len(number)

In [31]:
## número de nouns tokens para los 23 textos 

for oraciones in oraciones_nlp:
    print(number_nouns(oraciones))

1043
2891
2848
312
382
382
1292
1095
438
968
804
1296
1389
1486
1526
1729
1077
1594
504
953
618
579
474


In [32]:
## la pregunta es: ¿De este número de nouns cuántos son nominalizaciones?

In [33]:
## reglas de nominalization

no_nom = ['thing', 'things', 'somethings', 'something', 'anything', 'everything', 'nothing','original', 'special', 'normal', 'version', 'tutorial', 'moment', 'comment', 'criminal', 'morning', 'tradition', 'notification','question', 'element', 'quality']
terminacion = ['ment','ments','tions', 'tion','sions', 'sion', 'ibility','ibilities', 'ity','ities', 'ness','nesses', 'al','als','ings', 'ing'] 

In [34]:
def nominalization(oraciones):
    nom = []
    for pair in oraciones:
        if pair[0] not in no_nom:
            if pair[1]=='NOUN':
                for END in terminacion:
                    if pair[0].endswith(END):
                        nom+=[pair[0]]
    return nom

In [35]:
nom_list = []

for oraciones in oraciones_nlp:
    nom_list += [nominalization(oraciones)]

In [36]:
nom_list[0]

['beginning',
 'feminization',
 'feminization',
 'contouring',
 'incision',
 'fragment',
 'incision',
 'reconstruction',
 'incision',
 'exhibiting',
 'anticipation',
 'encouragement',
 'jackal',
 'vanity',
 'feminization',
 'reassignment',
 'identity',
 'argument',
 'reality',
 'loathing',
 'ideal',
 'transition',
 'removal',
 'thinking',
 'realness',
 'painting',
 'femininity',
 'aspiration',
 'intensity',
 'artificial',
 'skinmaxing',
 'business',
 'discussion',
 'masculinization',
 'looksmaxing',
 'deal',
 'arousal',
 'evolution',
 'contouring',
 'ingenuity',
 'individuality',
 'femininity',
 'exploitation',
 'bartering',
 'judgment',
 'exaggeration',
 'delusion',
 'embellishment',
 'darkness',
 'doctoring',
 'invention',
 'darkness',
 'aging',
 'ritual',
 'cleansing',
 'corporation',
 'kindness',
 'saving',
 'collection',
 'facial',
 'futility',
 'solution',
 'spiral',
 'obsession',
 'advertising',
 'relation',
 'awareness',
 'contemplation',
 'revolution',
 'revolution',
 'critiqu

In [37]:
num_nom_list = []

for oraciones in oraciones_nlp:
    num_nom_list += [len(nominalization(oraciones)), len(nominalization(oraciones))/number_nouns(oraciones)*100]

In [38]:
## porcentaje de nominalizaciones con respecto al total de tokens sustantivos para cada texto

num_nom_list

[84,
 8.053691275167784,
 377,
 13.04047042545832,
 338,
 11.867977528089888,
 38,
 12.179487179487179,
 34,
 8.900523560209423,
 26,
 6.806282722513089,
 147,
 11.377708978328172,
 81,
 7.397260273972603,
 41,
 9.360730593607306,
 115,
 11.8801652892562,
 78,
 9.701492537313433,
 135,
 10.416666666666668,
 136,
 9.791216702663787,
 234,
 15.746971736204577,
 253,
 16.579292267365663,
 164,
 9.485251590514748,
 95,
 8.82079851439183,
 125,
 7.841907151819323,
 62,
 12.3015873015873,
 104,
 10.912906610703043,
 83,
 13.430420711974108,
 75,
 12.953367875647666,
 43,
 9.071729957805907]

## 2. Academic World List

In [39]:
#abriendo la lista de palabras academicas
with open('AcademicWordList.txt') as f:
    AWL=f.read()

In [40]:
#dividir por salto de linea
AWL= AWL.split('\n')

In [41]:
##eliminar espacios en blanco
AWL = [palabra.split(' ') for palabra in AWL if len(palabra)>0]
AWL = [item for sublist in AWL for item in sublist]

In [42]:
AWL.remove('-')
AWL.remove('comments')

In [43]:
texto = list(zip(*oraciones_nlp[0]))[0]

In [44]:
texto[:10]

('my', 'face', 'be', 'the', 'front', 'of', 'shop', 'my', 'face', 'be')

In [57]:
len(tokens)

142029

In [58]:
# función separadora de academic words
def academic(tokens):
    aca_list=[]
    for word in tokens:
        if word in AWL:
            aca_list+=[word]
    return aca_list

In [59]:
oraciones_nlp[0][:10]

[('my', 'PRON'),
 ('face', 'NOUN'),
 ('be', 'AUX'),
 ('the', 'DET'),
 ('front', 'NOUN'),
 ('of', 'ADP'),
 ('shop', 'NOUN'),
 ('my', 'PRON'),
 ('face', 'NOUN'),
 ('be', 'VERB')]

In [60]:
oraciones_nlp[:10]

[[('my', 'PRON'),
  ('face', 'NOUN'),
  ('be', 'AUX'),
  ('the', 'DET'),
  ('front', 'NOUN'),
  ('of', 'ADP'),
  ('shop', 'NOUN'),
  ('my', 'PRON'),
  ('face', 'NOUN'),
  ('be', 'VERB'),
  ('the', 'DET'),
  ('real', 'ADJ'),
  ('shop', 'NOUN'),
  ('front', 'NOUN'),
  ('my', 'PRON'),
  ('shop', 'NOUN'),
  ('be', 'VERB'),
  ('the', 'DET'),
  ('face', 'NOUN'),
  ('i', 'PRON'),
  ('front', 'NOUN'),
  ('i', 'PRON'),
  ('be', 'VERB'),
  ('real', 'ADJ'),
  ('when', 'ADV'),
  ('i', 'PRON'),
  ('shop', 'VERB'),
  ('my', 'PRON'),
  ('face', 'NOUN'),
  ('artificial', 'PROPN'),
  ('bloom', 'NOUN'),
  ('hydroponic', 'PROPN'),
  ('skin', 'NOUN'),
  ('chemical', 'PROPN'),
  ('release', 'NOUN'),
  ('synthesize', 'PROPN'),
  ('the', 'DET'),
  ('real', 'ADJ'),
  ('-', 'PUNCT'),
  ('hey', 'INTJ'),
  ('guy', 'NOUN'),
  (',', 'PUNCT'),
  ('it', 'PRON'),
  ('be', 'VERB'),
  ('natalie', 'PROPN'),
  (',', 'PUNCT'),
  ('welcome', 'VERB'),
  ('back', 'ADP'),
  ('to', 'ADP'),
  ('my', 'PRON'),
  ('channel', 'NOUN

In [61]:
aca_words_percentage = []

aca_words_per_text = []

for oracion in oraciones_nlp:
        texto = list(zip(*oracion))[0]
        aca_words = academic(texto)
        aca_words_per_text+=[aca_words]
        aca_words_percentage += [len(aca_words)/len(texto)*100]

In [62]:
aca_words_percentage

[1.271712158808933,
 1.446775893455974,
 1.733657899424093,
 1.9626615605552895,
 0.733822548365577,
 0.6446991404011462,
 1.2413289521723256,
 1.2515472424700866,
 2.8952597218279874,
 1.495067817509248,
 2.0415951154359857,
 1.6055846422338569,
 1.5839981500751534,
 3.483606557377049,
 3.1653145365075144,
 2.543645579125211,
 2.123813827383642,
 2.215352910870685,
 2.6512788521522146,
 1.1241568823382464,
 1.3960819635217292,
 1.4958560743885183,
 1.261918115535614]

## READABILITY


In [63]:
pip install py-readability-metrics

Note: you may need to restart the kernel to use updated packages.


In [64]:
a = ((1,2),(3,4))
print(a)
b,c = zip(*a)
print(b)

((1, 2), (3, 4))
(1, 3)


In [65]:
##Flesch Kincaid Grade Level

from readability import Readability

#----
#El zip() devuelve dos valores, asi que lo tienes que guardar en dos variables antes de hacerles cualquier cosa. No se con cual de los dos valores
#te querias quedar, asi que voy a usar el primero
#Otra cosa es que tu funcion pedia string, asi que lo pase a string
#En el print fk.grade_lever, daba error, asi que asumi que era en realidad fk.grade_level y lo cambie
#----

#text = list(zip(*oracion))[0]

#----
text, aux = (zip(*oracion))
text = str(text)
#----

r=Readability(text)
fk=r.flesch_kincaid()

print(fk.score)
print(fk.grade_level)

7.363546576879912
7


## Passive Voice
Esto debería separarme oraciones que están en voz pasiva

In [66]:
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz


Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz (13.7 MB)
Note: you may need to restart the kernel to use updated packages.


In [67]:
import spacy
spacy.cli.download("en_core_web_md")

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_md')


In [68]:
import spacy
import pandas as pd

# function to check the type of sentence
def checkForSentType(inputSentence):   
    # running the model on sentence
    getDocFile = nlp(inputSentence)
    
    # getting the syntactic dependency 
    getAllTags = [token.dep_ for token in getDocFile]
    
    # checking for 'agent' tag
    checkPassiveTest = any(['agent' in sublist for sublist in getAllTags])
    
    # checking for 'nsubjpass' tag
    checkPassiveTestTwo = any(['nsubjpass' in sublist for sublist in getAllTags])
    return checkPassiveTest or checkPassiveTestTwo

# Spacy model imported
nlp = spacy.load('en_core_web_md')

# reading the list of test sentences
dfs = pd.read_csv(glob.glob('YTessays/*.txt'))
sentences = dfs.values.tolist()

finalResult = []

# checking each sentence for its type
for sentence in sentences:
    result = checkForSentType(str(sentence))
    if(result):
        finalResult.append('Passive Sentence')
    else:
        finalResult.append('Active Sentence')
        
# storing the result in a new file and converting to csv
newDf = pd.DataFrame({'Sentences':sentences,'Answers':finalResult})

newDf.to_csv('Sentence_Identified.csv')

ValueError: Invalid file path or buffer object type: <class 'list'>