# Corpus YouTube

## 1. Lectura de datos!

In [1]:
## abrimos los archivos

import glob

lista_files = glob.glob('youtube/*.txt')
corpus = []

for file in lista_files:
    with open(file, 'r', encoding='utf8') as f:
        corpus+=[f.read()]

In [2]:
## número de textos

len(corpus)

24

In [3]:
corpus[0][:250]

'("Faceshopping" by Sophie)\n♪ My face is the front of shop ♪\n♪ My face is the real shop front ♪\n♪ My shop is the face I front ♪\n♪ I\'m real when I shop my face ♪\n♪ Artificial bloom ♪\n♪ Hydroponic skin ♪\n♪ Chemical release ♪\n♪ Synthesize the real ♪\n(upb'

## 2. Preprocesamiento de los datos

In [4]:
## librerias

import spacy
import nltk
import string
from nltk import sent_tokenize
import numpy as np

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jxver\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
## función para remover () y []

def a(test_str):
    ret = ''
    skip1c = 0
    skip2c = 0
    for i in test_str:
        if i == '[':
            skip1c += 1
        elif i == '(':
            skip2c += 1
        elif i == ']' and skip1c > 0:
            skip1c -= 1
        elif i == ')'and skip2c > 0:
            skip2c -= 1
        elif skip1c == 0 and skip2c == 0:
            ret += i
    return ret

In [7]:
## aplicamos la función "a" a cada texto del corpus

corpus = [a(c) for c in corpus]

In [8]:
corpus[0][:250]

"\n♪ My face is the front of shop ♪\n♪ My face is the real shop front ♪\n♪ My shop is the face I front ♪\n♪ I'm real when I shop my face ♪\n♪ Artificial bloom ♪\n♪ Hydroponic skin ♪\n♪ Chemical release ♪\n♪ Synthesize the real ♪\n - Hey guys, it's Natalie,\nwel"

In [9]:
## dividimos por salto de linea cada texto

corpus = [c.replace('♪','').split('\n') for c in corpus]

In [10]:
corpus[0][:5]

['',
 ' My face is the front of shop ',
 ' My face is the real shop front ',
 ' My shop is the face I front ',
 " I'm real when I shop my face "]

In [11]:
## eliminamos strings vacíos (cada texto)

corpus = [[sentence.strip() for sentence in c if len(sentence)>0] for c in corpus]

In [12]:
## volvemos a juntar las oraciones :)

corpus = [' '.join(c) for c in corpus]

In [13]:
corpus[0][:250]

"My face is the front of shop My face is the real shop front My shop is the face I front I'm real when I shop my face Artificial bloom Hydroponic skin Chemical release Synthesize the real - Hey guys, it's Natalie, welcome back to my channel. Today I'm"

In [14]:
## dividimos en oraciones cada texto

sentence_list = []

for texto in corpus:
    sentences = sent_tokenize(texto)
    sentence_list+=[sentences]

In [15]:
len(sentence_list)

24

In [16]:
sentence_list[0][:5]

["My face is the front of shop My face is the real shop front My shop is the face I front I'm real when I shop my face Artificial bloom Hydroponic skin Chemical release Synthesize the real - Hey guys, it's Natalie, welcome back to my channel.",
 "Today I'm gonna do a makeup tutorial, as always, but first, story time, story time, story time!",
 "I just wanna be upfront with you guys and let you know that I've had some facial surgery.",
 "I'm always gonna be honest with you guys about this kind of thing because you mean so much to me.",
 "Like you've been here with me since the beginning, and you've seen my story, my whole entire journey, this journey I've been on as a transgender woman."]

In [17]:
## filtramos por textos con al menos 2 oraciones

sentence_list = [s for s in sentence_list if len(s)>1]

In [55]:
len(sentence_list)

23

In [56]:
#numero de oraciones por texto

numero_oraciones = sum([len(s) for s in sentence_list])
promedio_oraciones_por_texto = np.mean([len(s) for s in sentence_list])

In [57]:
numero_oraciones, promedio_oraciones_por_texto

(7355, 319.7826086956522)

In [58]:
## tokens y types

tokens = []

for texto in sentence_list:
    for sentence in texto:
        tokens+=sentence.split(' ')

tokens = [w.lower() for w in tokens]

In [59]:
## número de tokens y types en el corpus

len(tokens),len(set(tokens))

(142029, 17593)

In [60]:
# número de oraciones por texto
num_sen = [len(s) for s in sentence_list]
print(num_sen)

[322, 1041, 915, 73, 170, 128, 259, 264, 170, 333, 272, 367, 369, 424, 310, 425, 222, 317, 173, 318, 122, 201, 160]


## análisis!

### 1. Nominalizaciones
El trabajo aquí es con listas de oraciones. Extraemos los sustantivos :)

In [23]:
!pip install spacy
!spacy download en

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[!] As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the full
pipeline package name 'en_core_web_sm' instead.
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [61]:
oraciones_nlp = []

In [62]:
## identificamos el lema y el pos

import spacy
nlp = spacy.load("en_core_web_sm")

for sentence_text in sentence_list:
    if len(sentence_text)>1:
        sent = []
        for sentence in sentence_text:
            doc = nlp(sentence)
            sent += [(token.lemma_.lower(),token.pos_) for token in doc]
        oraciones_nlp += [sent]

In [63]:
len(oraciones_nlp)

23

In [64]:
def number_tokens_types(oraciones):
    number = []
    for oracion in oraciones:
        number+=[pair[0] for pair in oracion]
    return len(number),len(set(number)), len(set(number))/len(number)*100

In [65]:
## número de tokens y types para los 23 textos 

for oraciones in oraciones_nlp:
    print(number_tokens_types(oraciones))

(12896, 50, 0.38771712158808935)
(41748, 54, 0.12934751365334868)
(33686, 58, 0.17217835302499554)
(4178, 37, 0.8855911919578746)
(5996, 40, 0.66711140760507)
(5584, 45, 0.8058739255014327)
(16434, 53, 0.32250212973104536)
(14542, 52, 0.35758492642002476)
(7046, 45, 0.6386602327561737)
(12976, 50, 0.38532675709001235)
(10482, 48, 0.45792787635947335)
(17190, 53, 0.30831878999418266)
(17298, 43, 0.2485836512891664)
(16592, 48, 0.28929604628736744)
(15038, 52, 0.3457906636520814)
(21308, 55, 0.25811901633189416)
(13278, 54, 0.4066877541798464)
(19410, 56, 0.28851107676455434)
(6412, 50, 0.7797878976918279)
(16012, 50, 0.31226580064951287)
(8882, 47, 0.5291600990767845)
(9894, 56, 0.5659995957145745)
(7132, 48, 0.6730229949523275)


In [71]:
def number_nouns(oraciones):
    number = [pair for pair in oraciones if pair[1]=='NOUN']
    return len(number)

In [72]:
## número de nouns tokens para los 23 textos 

for oraciones in oraciones_nlp:
    print(number_nouns(oraciones))

1031
2832
2727
320
381
383
1282
1077
433
956
804
1283
1375
1469
1511
1711
1075
1522
508
947
627
573
478


In [73]:
## la pregunta es: ¿De este número de nouns cuántos son nominalizaciones?

In [74]:
## reglas de nominalization

## reglas de nominalization
## puse las palabras lematizadas, en ese caso, solo es necesario poner las palabras en singular

no_nom = ['thing', 'things', 'somethings', 'something', 'anything', 'everything', 'nothing','original', 'special', 'normal', 'version', 'tutorial', 'moment', 'comment', 'criminal', 'morning', 'tradition', 'notification','question', 'element', 'quality']
terminacion = ['ment','ments','tions', 'tion','sions', 'sion', 'ibility','ibilities', 'ity','ities', 'ness','nesses', 'al','als','ings', 'ing'] 

In [77]:
def nominalization(oraciones):
    nom = []
    for pair in oraciones:
        if pair[0] not in no_nom:
            if pair[1]=='NOUN':
                for END in terminacion:
                    if pair[0].endswith(END):
                        nom+=[pair[0]]
    return nom

In [78]:
nom_list = []

for oraciones in oraciones_nlp:
    nom_list += [nominalization(oraciones)]

In [79]:
nom_list[0]

['beginning',
 'feminization',
 'feminization',
 'contouring',
 'contouring',
 'incision',
 'fragment',
 'incision',
 'reconstruction',
 'incision',
 'anticipation',
 'encouragement',
 'jackal',
 'vanity',
 'feminization',
 'reassignment',
 'identity',
 'argument',
 'reality',
 'ideal',
 'transition',
 'removal',
 'grooming',
 'thinking',
 'realness',
 'painting',
 'femininity',
 'aspiration',
 'aging',
 'intensity',
 'artificial',
 'skinmaxing',
 'business',
 'discussion',
 'masculinization',
 'looksmaxing',
 'deal',
 'arousal',
 'evolution',
 'contouring',
 'ingenuity',
 'resourcefulness',
 'individuality',
 'femininity',
 'exploitation',
 'bartering',
 'judgment',
 'exaggeration',
 'delusion',
 'embellishment',
 'darkness',
 'invention',
 'darkness',
 'aging',
 'ritual',
 'cleansing',
 'corporation',
 'kindness',
 'saving',
 'collection',
 'facial',
 'futility',
 'solution',
 'spiral',
 'obsession',
 'advertising',
 'relation',
 'awareness',
 'contemplation',
 'revolution',
 'revolu

In [80]:
num_nom_list = []

for oraciones in oraciones_nlp:
    num_nom_list += [len(nominalization(oraciones)), len(nominalization(oraciones))/number_nouns(oraciones)*100]

In [81]:
## porcentaje de nominalizaciones con respecto al total de tokens sustantivos para cada texto

num_nom_list

[83,
 8.050436469447138,
 366,
 12.923728813559322,
 326,
 11.954528786211954,
 38,
 11.875,
 34,
 8.923884514435695,
 26,
 6.7885117493472595,
 148,
 11.54446177847114,
 83,
 7.706592386258125,
 37,
 8.545034642032332,
 113,
 11.820083682008368,
 76,
 9.45273631840796,
 131,
 10.210444271239282,
 135,
 9.818181818181818,
 234,
 15.929203539823009,
 250,
 16.545334215751158,
 168,
 9.818819403857393,
 96,
 8.930232558139535,
 126,
 8.278580814717477,
 62,
 12.204724409448819,
 102,
 10.770855332629356,
 83,
 13.237639553429027,
 75,
 13.089005235602095,
 40,
 8.368200836820083]

## 2. Academic World List

In [82]:
#abriendo la lista de palabras academicas
with open('AcademicWordList.txt') as f:
    AWL=f.read()

In [83]:
#dividir por salto de linea
AWL= AWL.split('\n')

In [84]:
##eliminar espacios en blanco
AWL = [palabra.split(' ') for palabra in AWL if len(palabra)>0]
AWL = [item for sublist in AWL for item in sublist]

In [85]:
AWL.remove('-')
AWL.remove('comments')

In [86]:
texto = list(zip(*oraciones_nlp[0]))[0]

In [87]:
texto[:10]

('my', 'face', 'be', 'the', 'front', 'of', 'shop', 'my', 'face', 'be')

In [88]:
len(tokens)

142029

# PROBLEMAS AQUÍ

In [89]:
def academic(tokens):
    aca_list=[]
    for word in tokens:
        if word in AWL:
            aca_list+=[word]
    return aca_list

In [90]:
oraciones_nlp[0][:10]

[('my', 'PRON'),
 ('face', 'NOUN'),
 ('be', 'AUX'),
 ('the', 'DET'),
 ('front', 'NOUN'),
 ('of', 'ADP'),
 ('shop', 'NOUN'),
 ('my', 'PRON'),
 ('face', 'NOUN'),
 ('be', 'AUX')]

In [91]:
texto = list(zip(*oracion))[0]

In [92]:
aca_words_percentage = []

aca_words_per_text = []

for oracion in oraciones_nlp:
        texto = list(zip(*oracion))[0]
        aca_words = academic(texto)
        aca_words_per_text+=[aca_words]
        aca_words_percentage += [len(aca_words)/len(texto)*100]

In [93]:
aca_words_percentage

[1.163151364764268,
 1.1497556769186548,
 1.4605474084189278,
 1.8669219722355195,
 0.5670446964643095,
 0.5372492836676217,
 0.9492515516611902,
 1.0314949800577637,
 2.7249503264263413,
 1.155980271270037,
 1.2020606754436176,
 1.1634671320535195,
 1.1793270898369754,
 2.941176470588235,
 2.3673360819257883,
 2.1869720292847754,
 1.6418135261334537,
 1.4940752189592994,
 2.2145976294447913,
 0.7869098176367724,
 0.990767845079937,
 1.1926420052557105,
 1.0936623667975323]

In [94]:
def number_aca_list(palabras):
    number_aca = []
    for palabra in palabras:
        number_aca+=[pair[0] for pair in palabra]
    return len(number_aca), len(number_aca)/len(tokens)*100

In [96]:
for oraciones in oraciones_nlp:
    print(number_aca_list(oraciones))

(12896, 9.07983580818002)
(41748, 29.393997000612547)
(33686, 23.71769145737842)
(4178, 2.941652761055841)
(5996, 4.221673038604791)
(5584, 3.931591435551894)
(16434, 11.570876370318738)
(14542, 10.238754057269993)
(7046, 4.960958677453196)
(12976, 9.13616233304466)
(10482, 7.380182920389498)
(17190, 12.103162030289589)
(17298, 12.179202838856854)
(16592, 11.682121256926402)
(15038, 10.587978511430764)
(21308, 15.002569897696949)
(13278, 9.348794964408677)
(19410, 13.666223095283359)
(6412, 4.514570967900921)
(16012, 11.27375395165776)
(8882, 6.253652423096691)
(9894, 6.966182962634392)
(7132, 5.0215096916826845)
