# Corpus YouTube

## 1. Lectura de datos!

In [1]:
## abrimos los archivos

import glob

lista_files = glob.glob('YTessays/*.txt')
corpus = []

for file in lista_files:
    with open(file, 'r', encoding='utf8') as f:
        corpus+=[f.read()]

In [2]:
## número de textos

len(corpus)

24

In [3]:
corpus[0][:250]

'("Faceshopping" by Sophie)\n♪ My face is the front of shop ♪\n♪ My face is the real shop front ♪\n♪ My shop is the face I front ♪\n♪ I\'m real when I shop my face ♪\n♪ Artificial bloom ♪\n♪ Hydroponic skin ♪\n♪ Chemical release ♪\n♪ Synthesize the real ♪\n(upb'

## 2. Preprocesamiento de los datos

In [4]:
## librerias

import spacy
import nltk
import string
from nltk import sent_tokenize
import numpy as np

2022-02-25 21:49:19.769085: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-25 21:49:19.769129: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/cerdamara/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
## función para remover () y []

def a(test_str):
    ret = ''
    skip1c = 0
    skip2c = 0
    for i in test_str:
        if i == '[':
            skip1c += 1
        elif i == '(':
            skip2c += 1
        elif i == ']' and skip1c > 0:
            skip1c -= 1
        elif i == ')'and skip2c > 0:
            skip2c -= 1
        elif skip1c == 0 and skip2c == 0:
            ret += i
    return ret

In [7]:
## aplicamos la función "a" a cada texto del corpus

corpus = [a(c) for c in corpus]

In [8]:
corpus[0][:250]

"\n♪ My face is the front of shop ♪\n♪ My face is the real shop front ♪\n♪ My shop is the face I front ♪\n♪ I'm real when I shop my face ♪\n♪ Artificial bloom ♪\n♪ Hydroponic skin ♪\n♪ Chemical release ♪\n♪ Synthesize the real ♪\n - Hey guys, it's Natalie,\nwel"

In [9]:
## dividimos por salto de linea cada texto

corpus = [c.replace('♪','').split('\n') for c in corpus]

In [10]:
corpus[0][:5]

['',
 ' My face is the front of shop ',
 ' My face is the real shop front ',
 ' My shop is the face I front ',
 " I'm real when I shop my face "]

In [11]:
## eliminamos strings vacíos (cada texto)

corpus = [[sentence.strip() for sentence in c if len(sentence)>0] for c in corpus]

In [12]:
## volvemos a juntar las oraciones

corpus = [' '.join(c) for c in corpus]

In [13]:
corpus[0][:250]

"My face is the front of shop My face is the real shop front My shop is the face I front I'm real when I shop my face Artificial bloom Hydroponic skin Chemical release Synthesize the real - Hey guys, it's Natalie, welcome back to my channel. Today I'm"

In [14]:
## dividimos en oraciones cada texto

sentence_list = []

for texto in corpus:
    sentences = sent_tokenize(texto)
    sentence_list+=[sentences]

In [15]:
len(sentence_list)

24

In [16]:
sentence_list[0][:5]

["My face is the front of shop My face is the real shop front My shop is the face I front I'm real when I shop my face Artificial bloom Hydroponic skin Chemical release Synthesize the real - Hey guys, it's Natalie, welcome back to my channel.",
 "Today I'm gonna do a makeup tutorial, as always, but first, story time, story time, story time!",
 "I just wanna be upfront with you guys and let you know that I've had some facial surgery.",
 "I'm always gonna be honest with you guys about this kind of thing because you mean so much to me.",
 "Like you've been here with me since the beginning, and you've seen my story, my whole entire journey, this journey I've been on as a transgender woman."]

In [17]:
## filtramos por textos con al menos 2 oraciones

sentence_list = [s for s in sentence_list if len(s)>1]

In [18]:
len(sentence_list)

23

In [19]:
#numero de oraciones por texto

numero_oraciones = sum([len(s) for s in sentence_list])
promedio_oraciones_por_texto = np.mean([len(s) for s in sentence_list])

In [20]:
numero_oraciones, promedio_oraciones_por_texto

(7355, 319.7826086956522)

In [21]:
## tokens y types

tokens = []

for texto in sentence_list:
    for sentence in texto:
        tokens+=sentence.split(' ')

tokens = [w.lower() for w in tokens]

In [22]:
## número de tokens y types en el corpus

len(tokens),len(set(tokens))

(142029, 17593)

In [23]:
# número de oraciones por texto
num_sen = [len(s) for s in sentence_list]
print(num_sen)

[322, 1041, 915, 73, 170, 128, 259, 264, 170, 333, 272, 367, 369, 424, 310, 425, 222, 317, 173, 318, 122, 201, 160]


## análisis!

### 1. Nominalizaciones
El trabajo aquí es con listas de oraciones. Extraemos los sustantivos :)

In [24]:
!pip install spacy
!spacy download en

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
2022-02-25 21:49:35.526380: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-25 21:49:35.526436: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [25]:
oraciones_nlp = []

In [26]:
## identificamos el lema y el pos

import spacy
nlp = spacy.load("en_core_web_sm")

for sentence_text in sentence_list:
    if len(sentence_text)>1:
        sent = []
        for sentence in sentence_text:
            doc = nlp(sentence)
            sent += [(token.lemma_.lower(),token.pos_) for token in doc]
        oraciones_nlp += [sent]

In [27]:
len(oraciones_nlp)

23

In [28]:
def number_tokens_types(oraciones):
    number=[pair[0] for pair in oraciones]
    return len(number),len(set(number)),len(set(number))/len(number)*100

In [29]:
## número de tokens y types para los 23 textos 

for oraciones in oraciones_nlp:
    print(number_tokens_types(oraciones))

(6448, 1297, 20.114764267990072)
(20874, 2296, 10.999329309188465)
(16843, 2280, 13.536780858516892)
(2089, 422, 20.20105313547152)
(2998, 526, 17.545030020013343)
(2792, 572, 20.487106017191977)
(8217, 1491, 18.14530850675429)
(7271, 1246, 17.136569935359645)
(3523, 653, 18.53533919954584)
(6488, 1249, 19.250924784217016)
(5241, 1010, 19.27113146346117)
(8595, 1502, 17.475276323443865)
(8649, 1442, 16.672447681812926)
(8296, 1247, 15.031340405014465)
(7519, 1529, 20.335150950924323)
(10654, 1479, 13.882110005631688)
(6639, 1079, 16.252447657779786)
(9705, 1426, 14.693456980937661)
(3206, 794, 24.76606363069245)
(8006, 1025, 12.802897826630028)
(4441, 860, 19.36500788110786)
(4947, 920, 18.597129573478878)
(3566, 834, 23.38754907459338)


In [30]:
def number_nouns(oraciones):
    number = [pair for pair in oraciones if pair[1]=='NOUN']
    return len(number)

In [31]:
## número de nouns tokens para los 23 textos 

for oraciones in oraciones_nlp:
    print(number_nouns(oraciones))

1031
2832
2727
320
381
383
1282
1077
433
956
804
1283
1375
1469
1511
1711
1075
1522
508
947
627
573
478


In [32]:
## la pregunta es: ¿De este número de nouns cuántos son nominalizaciones?

In [33]:
## reglas de nominalization

## reglas de nominalization

no_nom = ['thing', 'things', 'somethings', 'something', 'anything', 'everything', 'nothing','original', 'special', 'normal', 'version', 'tutorial', 'moment', 'comment', 'criminal', 'morning', 'tradition', 'notification','question', 'element', 'quality']
terminacion = ['ment','ments','tions', 'tion','sions', 'sion', 'ibility','ibilities', 'ity','ities', 'ness','nesses', 'al','als','ings', 'ing'] 

In [34]:
def nominalization(oraciones):
    nom = []
    for pair in oraciones:
        if pair[0] not in no_nom:
            if pair[1]=='NOUN':
                for END in terminacion:
                    if pair[0].endswith(END):
                        nom+=[pair[0]]
    return nom

In [35]:
nom_list = []

for oraciones in oraciones_nlp:
    nom_list += [nominalization(oraciones)]

In [36]:
nom_list[0]

['beginning',
 'feminization',
 'feminization',
 'contouring',
 'contouring',
 'incision',
 'fragment',
 'incision',
 'reconstruction',
 'incision',
 'anticipation',
 'encouragement',
 'jackal',
 'vanity',
 'feminization',
 'reassignment',
 'identity',
 'argument',
 'reality',
 'ideal',
 'transition',
 'removal',
 'grooming',
 'thinking',
 'realness',
 'painting',
 'femininity',
 'aspiration',
 'aging',
 'intensity',
 'artificial',
 'skinmaxing',
 'business',
 'discussion',
 'masculinization',
 'looksmaxing',
 'deal',
 'arousal',
 'evolution',
 'contouring',
 'ingenuity',
 'resourcefulness',
 'individuality',
 'femininity',
 'exploitation',
 'bartering',
 'judgment',
 'exaggeration',
 'delusion',
 'embellishment',
 'darkness',
 'invention',
 'darkness',
 'aging',
 'ritual',
 'cleansing',
 'corporation',
 'kindness',
 'saving',
 'collection',
 'facial',
 'futility',
 'solution',
 'spiral',
 'obsession',
 'advertising',
 'relation',
 'awareness',
 'contemplation',
 'revolution',
 'revolu

In [37]:
num_nom_list = []

for oraciones in oraciones_nlp:
    num_nom_list += [len(nominalization(oraciones)), len(nominalization(oraciones))/number_nouns(oraciones)*100]

In [38]:
## porcentaje de nominalizaciones con respecto al total de tokens sustantivos para cada texto

num_nom_list

[83,
 8.050436469447138,
 366,
 12.923728813559322,
 326,
 11.954528786211954,
 38,
 11.875,
 34,
 8.923884514435695,
 26,
 6.7885117493472595,
 148,
 11.54446177847114,
 83,
 7.706592386258125,
 37,
 8.545034642032332,
 113,
 11.820083682008368,
 76,
 9.45273631840796,
 131,
 10.210444271239282,
 135,
 9.818181818181818,
 234,
 15.929203539823009,
 250,
 16.545334215751158,
 168,
 9.818819403857393,
 96,
 8.930232558139535,
 126,
 8.278580814717477,
 62,
 12.204724409448819,
 102,
 10.770855332629356,
 83,
 13.237639553429027,
 75,
 13.089005235602095,
 40,
 8.368200836820083]

## 2. Academic World List

In [39]:
#abriendo la lista de palabras academicas
with open('AcademicWordList.txt') as f:
    AWL=f.read()

In [40]:
#dividir por salto de linea
AWL= AWL.split('\n')

In [41]:
##eliminar espacios en blanco
AWL = [palabra.split(' ') for palabra in AWL if len(palabra)>0]
AWL = [item for sublist in AWL for item in sublist]

In [42]:
AWL.remove('-')
AWL.remove('comments')

In [43]:
texto = list(zip(*oraciones_nlp[0]))[0]

In [44]:
texto[:10]

('my', 'face', 'be', 'the', 'front', 'of', 'shop', 'my', 'face', 'be')

In [45]:
len(tokens)

142029

In [46]:
# función separadora de academic words
def academic(tokens):
    aca_list=[]
    for word in tokens:
        if word in AWL:
            aca_list+=[word]
    return aca_list

In [47]:
oraciones_nlp[0][:10]

[('my', 'PRON'),
 ('face', 'NOUN'),
 ('be', 'AUX'),
 ('the', 'DET'),
 ('front', 'NOUN'),
 ('of', 'ADP'),
 ('shop', 'NOUN'),
 ('my', 'PRON'),
 ('face', 'NOUN'),
 ('be', 'AUX')]

In [48]:
oraciones_nlp[:10]

[[('my', 'PRON'),
  ('face', 'NOUN'),
  ('be', 'AUX'),
  ('the', 'DET'),
  ('front', 'NOUN'),
  ('of', 'ADP'),
  ('shop', 'NOUN'),
  ('my', 'PRON'),
  ('face', 'NOUN'),
  ('be', 'AUX'),
  ('the', 'DET'),
  ('real', 'ADJ'),
  ('shop', 'NOUN'),
  ('front', 'NOUN'),
  ('my', 'PRON'),
  ('shop', 'NOUN'),
  ('be', 'AUX'),
  ('the', 'DET'),
  ('face', 'NOUN'),
  ('i', 'PRON'),
  ('front', 'VERB'),
  ('i', 'PRON'),
  ('be', 'AUX'),
  ('real', 'ADJ'),
  ('when', 'SCONJ'),
  ('i', 'PRON'),
  ('shop', 'VERB'),
  ('my', 'PRON'),
  ('face', 'NOUN'),
  ('artificial', 'PROPN'),
  ('bloom', 'NOUN'),
  ('hydroponic', 'ADJ'),
  ('skin', 'NOUN'),
  ('chemical', 'PROPN'),
  ('release', 'NOUN'),
  ('synthesize', 'PROPN'),
  ('the', 'DET'),
  ('real', 'ADJ'),
  ('-', 'PUNCT'),
  ('hey', 'ADJ'),
  ('guy', 'NOUN'),
  (',', 'PUNCT'),
  ('it', 'PRON'),
  ('be', 'AUX'),
  ('natalie', 'PROPN'),
  (',', 'PUNCT'),
  ('welcome', 'VERB'),
  ('back', 'ADP'),
  ('to', 'ADP'),
  ('my', 'PRON'),
  ('channel', 'NOUN'),
 

In [49]:
aca_words_percentage = []

aca_words_per_text = []

for oracion in oraciones_nlp:
        texto = list(zip(*oracion))[0]
        aca_words = academic(texto)
        aca_words_per_text+=[aca_words]
        aca_words_percentage += [len(aca_words)/len(texto)*100]

In [50]:
aca_words_percentage

[1.2872208436724566,
 1.4324039474944907,
 1.739595084011162,
 1.9626615605552895,
 0.7004669779853235,
 0.6446991404011462,
 1.2413289521723256,
 1.2515472424700866,
 2.8952597218279874,
 1.495067817509248,
 2.0415951154359857,
 1.6172193135543922,
 1.5839981500751534,
 3.483606557377049,
 3.152014895597819,
 2.534259433076779,
 2.0786262991414373,
 2.1947449768160743,
 2.6512788521522146,
 1.1116662503122658,
 1.3960819635217292,
 1.4958560743885183,
 1.261918115535614]

## READABILITY
Aquí deberían darme números entre 0 y 100

In [51]:
pip install py-readability-metrics

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Note: you may need to restart the kernel to use updated packages.


In [52]:
a = ((1,2),(3,4))
print(a)
b,c = zip(*a)
print(b)

((1, 2), (3, 4))
(1, 3)


In [53]:
##Flesch Kincaid Grade Level

from readability import Readability

#----
#El zip() devuelve dos valores, asi que lo tienes que guardar en dos variables antes de hacerles cualquier cosa. No se con cual de los dos valores
#te querias quedar, asi que voy a usar el primero
#Otra cosa es que tu funcion pedia string, asi que lo pase a string
#En el print fk.grade_lever, daba error, asi que asumi que era en realidad fk.grade_level y lo cambie
#----

#text = list(zip(*oracion))[0]

#----
text, aux = (zip(*oracion))
text = str(text)
#----

r=Readability(text)
fk=r.flesch_kincaid()

print(fk.score)
print(fk.grade_level)

7.280973061503293
7


## Passive Voice
Esto debería separarme oraciones que están en voz pasiva

In [54]:
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz


  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz (13.7 MB)
[K     |████████████████████████████████| 13.7 MB 999 kB/s eta 0:00:01
[?25hCollecting spacy<3.1.0,>=3.0.0
  Using cached spacy-3.0.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.5 MB)
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-sm: filename=en_core_web_sm-3.0.0-py3-none-any.whl size=13704321 sha256=2a52cfeaca784405d246a720019c75e34a0edffb83071b287be7287e8b16dc65
  Stored in directory: /home/cerdamara/.cache/pip/wheels/8b/21/c1/257748af7399fdaf1b2afc39c92fb839c436f42e67b656ff7e
Successfully built en-core-web-sm
[31mE

In [55]:
import spacy
spacy.cli.download("en_core_web_md")

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


Collecting spacy<3.3.0,>=3.2.0
  Using cached spacy-3.2.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.2 MB)


ERROR: en-core-web-sm 3.0.0 has requirement spacy<3.1.0,>=3.0.0, but you'll have spacy 3.2.2 which is incompatible.


Installing collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.0.7
    Uninstalling spacy-3.0.7:
      Successfully uninstalled spacy-3.0.7
Successfully installed spacy-3.2.2
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [56]:
import spacy
import pandas as pd

# function to check the type of sentence
def checkForSentType(inputSentence):   
    # running the model on sentence
    getDocFile = nlp(inputSentence)
    
    # getting the syntactic dependency 
    getAllTags = [token.dep_ for token in getDocFile]
    
    # checking for 'agent' tag
    checkPassiveTest = any(['agent' in sublist for sublist in getAllTags])
    
    # checking for 'nsubjpass' tag
    checkPassiveTestTwo = any(['nsubjpass' in sublist for sublist in getAllTags])
    return checkPassiveTest or checkPassiveTestTwo

# Spacy model imported
nlp = spacy.load('en_core_web_md')

# reading the list of test sentences
dfs = pd.read_csv(glob.glob('YTessays/*.txt'))
sentences = dfs.values.tolist()

finalResult = []

# checking each sentence for its type
for sentence in sentences:
    result = checkForSentType(str(sentence))
    if(result):
        finalResult.append('Passive Sentence')
    else:
        finalResult.append('Active Sentence')
        
# storing the result in a new file and converting to csv
newDf = pd.DataFrame({'Sentences':sentences,'Answers':finalResult})

newDf.to_csv('Sentence_Identified.csv')