Integrantes: Lina Beltrán (lbeltra1@eafit.edu.co), Jose Muñoz (jjmunozm@eafit.edu.co)

Materia: Almacenamiento y Recuperación de Información.

Universidad EAFIT - Maestría en Ciencia de Datos y Analítica

Trabajo 2 Unidad 2 recuperación de texto

# Configurar Pyspark y Drive

In [None]:
# Google Drive setup: imports first, then the mount itself.
from google.colab import drive

# Directory-handling helpers
from os import listdir
from os.path import isfile, join

# Mount Drive under /content/gdrive, forcing a remount if it is already mounted.
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [None]:
# Input and output directories (both live under the same project folder on Drive).

project_dir = "/content/gdrive/MyDrive/EAFIT/Maestria/alm-recu-info/Trabajo2/"
path_in = project_dir + "dataset/"
path_out = project_dir + "OutPySpark/"

In [None]:
# Se comprueba acceso y contenido del path_in

!ls '/content/gdrive/MyDrive/EAFIT/Maestria/alm-recu-info/Trabajo2/dataset/'

wiki-multiple-files  wiki-single-file.txt


In [None]:
# instalación de java y spark

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install pyspark



In [None]:
# Point the JAVA_HOME and SPARK_HOME environment variables at the Java and Spark installs.
import os

os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.2.1-bin-hadoop3.2",
    }
)

In [None]:
# Initialise findspark so that Spark can be reached from this Python environment.
import findspark
findspark.init()

# Spark Datasets / DataFrames entry point
from pyspark.sql import SparkSession, Row

# Local session using all available cores ("local[*]"); reuses a session if one exists.
session_builder = SparkSession.builder.master("local[*]")
spark = session_builder.getOrCreate()
sc = spark.sparkContext

# Procesamiento de datos usando Pyspark



---



In [None]:
# Load the multi-file dataset: wholeTextFiles yields one (path, content) pair per file,
# which is then turned into a two-column DataFrame.

multi_file_glob = path_in + "wiki-multiple-files/*.txt"
MyRDD = sc.wholeTextFiles(multi_file_glob)
df = MyRDD.toDF(schema=['filename', 'content'])

In [None]:
# Quick look at the DataFrame: schema, first 10 untruncated rows, and row count.

df.printSchema()
df.show(n=10, truncate=False)
df.count()

root
 |-- filename: string (nullable = true)
 |-- content: string (nullable = true)

+-------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

4396

# Tokenización

In [44]:
# Libraries for feature extraction (tokenizers, vectorizers, ...), UDFs and Spark SQL types

from pyspark.ml.feature import *
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

In [None]:
# Strip punctuation characters from `content` using a regular expression.
from pyspark.sql.functions import regexp_replace

# Character class of the punctuation marks to delete.
REGEX = '[_():;,.!?\\-=]'

# Each match is replaced by the empty string; the cleaned text overwrites `content`.
content_no_punct = regexp_replace(df['content'], REGEX, '')
df = df.withColumn('content', content_no_punct)
df.show(10, False)

+-------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [56]:
# Simple tokenization

# `col` and `rand` are imported explicitly here: elsewhere in the notebook they
# only come into scope through a wildcard import in a later cell, so without
# this line the cell raises NameError on a fresh kernel (Restart & Run All).
from pyspark.sql.functions import col, rand

tokenized_df = Tokenizer(inputCol='content', outputCol='tokens').transform(df)
tokenized_df.printSchema()

# UDF returning the number of elements in a token array.
countTokens = udf(lambda t: len(t), IntegerType())
countTokensDF = tokenized_df.withColumn('Token_Count', countTokens(col('tokens')))
# Show 10 rows in random order (rand() is unseeded, so the sample changes between runs).
countTokensDF.orderBy(rand()).show(10)

tokenized_df.select(['content', 'tokens']).show(20, False)

root
 |-- filename: string (nullable = true)
 |-- content: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)

+--------------------+--------------------+--------------------+-----------+
|            filename|             content|              tokens|Token_Count|
+--------------------+--------------------+--------------------+-----------+
|file:/content/gdr...|@@14590514 Somdet...|[@@14590514, somd...|        576|
|file:/content/gdr...|@@2340514  The Za...|[@@2340514, , the...|        506|
|file:/content/gdr...|@@24462514 ' ' Ha...|[@@24462514, ', '...|         85|
|file:/content/gdr...|@@24053514 TransA...|[@@24053514, tran...|        313|
|file:/content/gdr...|@@39312514 Sandhu...|[@@39312514, sand...|         86|
|file:/content/gdr...|@@1813514 The fre...|[@@1813514, the, ...|       1232|
|file:/content/gdr...|@@7735514 width 6...|[@@7735514, width...|        526|
|file:/content/gdr...|@@24702514 The Lo...|[@@24702514, the,

In [62]:
# Refined tokenization: split `content` on runs of non-word characters
# (the pattern \W+ acts as the delimiter because gaps=True) and keep only
# tokens with a minimum length of 3.

from pyspark.sql.functions import *


regex_tokenizer = RegexTokenizer(
    inputCol='content',
    outputCol="tokensRefined",
    minTokenLength=3,
    pattern="\\W+",
    gaps=True,
)
wordTokenizer_df = regex_tokenizer.transform(tokenized_df)

# UDF returning the number of refined tokens per document.
countTokensRefined = udf(lambda s: len(s), IntegerType())
refined_count_col = countTokensRefined(col('tokensRefined'))
refinedCountDF = wordTokenizer_df.withColumn('token_count_Refined', refined_count_col)

# Display: schema, then every column plus the token counts before and after refinement.

wordTokenizer_df.printSchema()
wordTokenizer_df.select(
    "*",
    countTokens(col('tokens')).alias('Token_Count'),
    refined_count_col.alias('token_count_Refined'),
).show(10)

root
 |-- filename: string (nullable = true)
 |-- content: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tokensRefined: array (nullable = true)
 |    |-- element: string (containsNull = true)

+--------------------+--------------------+--------------------+--------------------+-----------+-------------------+
|            filename|             content|              tokens|       tokensRefined|Token_Count|token_count_Refined|
+--------------------+--------------------+--------------------+--------------------+-----------+-------------------+
|file:/content/gdr...|@@1514 Albert of ...|[@@1514, albert, ...|[1514, albert, pr...|       2273|               1527|
|file:/content/gdr...|@@19514 # Events ...|[@@19514, #, even...|[19514, events, 1...|       5508|               3437|
|file:/content/gdr...|@@185514 Spiritua...|[@@185514, spirit...|[185514, spiritua...|       1430|                949|
|file:/content/gdr...|@@529551

# Remoción de StopWords

In [67]:
# Stop-word removal: filter `tokensRefined` into `tokensRefined2`
# using StopWordsRemover with its default word list.

stopwords_remover = StopWordsRemover(inputCol='tokensRefined', outputCol='tokensRefined2')
stopword_removal = stopwords_remover.transform(wordTokenizer_df)

# UDF returning the number of tokens left after stop-word removal.
countTokensRefinedSW = udf(lambda a: len(a), IntegerType())
sw_count_col = countTokensRefinedSW(col('tokensRefined2'))
refinedCountSWDF = stopword_removal.withColumn('token_count_Refined2', sw_count_col)

# Display: schema, then the three token columns side by side with their sizes.

stopword_removal.printSchema()
stopword_removal.select(
    'tokens',
    'tokensRefined',
    'tokensRefined2',
    countTokens(col('tokens')).alias('Token_Count'),
    countTokensRefined(col('tokensRefined')).alias('token_count_Refined'),
    sw_count_col.alias('token_count_Refined2'),
).show(20)

root
 |-- filename: string (nullable = true)
 |-- content: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tokensRefined: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tokensRefined2: array (nullable = true)
 |    |-- element: string (containsNull = true)

+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+
|              tokens|       tokensRefined|      tokensRefined2|Token_Count|token_count_Refined|token_count_Refined2|
+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+
|[@@1514, albert, ...|[1514, albert, pr...|[1514, albert, pr...|       2273|               1527|                1049|
|[@@19514, #, even...|[19514, events, 1...|[19514, events, 1...|       5508|               3437|                3113|
|[@@185514, spirit...|[185514, spiritua...|[185514, spi

# Count Vectorizer

In [71]:
from pyspark.ml.feature import CountVectorizer

# Bag-of-words term counts over the stop-word-free tokens, vocabulary capped at 1000 terms.
countVec = CountVectorizer(
    vocabSize=1000,
    inputCol='tokensRefined2',
    outputCol='features',
)
count_vec_fitted = countVec.fit(stopword_removal)
countVec_df = count_vec_fitted.transform(stopword_removal)
countVec_df.printSchema()

countVec_df.select('tokensRefined2', 'features').show(20)

root
 |-- filename: string (nullable = true)
 |-- content: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tokensRefined: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tokensRefined2: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)

+--------------------+--------------------+
|      tokensRefined2|            features|
+--------------------+--------------------+
|[1514, albert, pr...|(1000,[0,1,2,4,6,...|
|[19514, events, 1...|(1000,[0,1,2,3,6,...|
|[185514, spiritua...|(1000,[0,1,3,4,6,...|
|[5295514, narrato...|(1000,[2,3,4,5,8,...|
|[5297514, lisy, l...|(1000,[3,5,12,26,...|
|[5299514, indian,...|(1000,[4,12,18,24...|
|[5307514, many, 5...|(1000,[2,3,18,21,...|
|[5322514, british...|(1000,[11,15,31,8...|
|[5323514, femalew...|(1000,[0,1,2,3,5,...|
|[5325514, gluvian...|(1000,[3,18,43,13...|
|[5332514, birthpl...|(1000,[0,2,

In [74]:
# Fit the CountVectorizer on the stop-word-free tokens and keep its vocabulary list.
bow = countVec.fit(stopword_removal).vocabulary

# Print the vocabulary size and then the vocabulary itself, one per line.
print(len(bow), bow, sep="\n")

1000
['also', 'first', 'new', 'one', 'two', 'small', 'time', 'may', 'born', 'years', 'national', 'school', 'city', 'world', 'state', 'university', 'later', 'season', 'united', 'used', 'american', 'many', 'year', 'made', 'south', 'states', 'part', 'team', 'three', 'known', 'album', 'second', 'music', 'county', 'film', 'amp', 'became', 'name', 'ampndash', 'including', 'series', 'history', 'however', 'north', 'john', 'people', 'war', 'league', 'early', 'area', 'played', 'group', 'since', 'family', 'life', 'work', 'high', 'well', 'march', 'district', 'number', 'former', 'released', 'several', 'following', 'house', 'game', 'government', 'club', '2010', '2006', 'town', '2008', '2011', 'college', 'york', 'band', 'four', 'west', 'called', 'named', '2007', 'career', '2012', 'july', 'september', 'international', 'center', 'village', 'place', 'line', 'song', 'won', 'november', 'january', 'april', 'end', 'august', 'river', 'company', 'station', 'style', 'october', 'day', '2009', 'home', 'use', 'us

# HashingTF

In [78]:
from pyspark.ml.feature import HashingTF

# Use the same dimensionality as the CountVectorizer vocabulary.
k = len(bow)

# Term frequencies via the hashing trick over the stop-word-free tokens.
hasher = HashingTF(
    numFeatures=k,
    inputCol='tokensRefined2',
    outputCol='TF_Features',
)
hasher_df = hasher.transform(stopword_removal)

hasher_df.printSchema()

hasher_df.select('tokensRefined2', 'TF_Features').show(20)


root
 |-- filename: string (nullable = true)
 |-- content: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tokensRefined: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tokensRefined2: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- TF_Features: vector (nullable = true)

+--------------------+--------------------+
|      tokensRefined2|         TF_Features|
+--------------------+--------------------+
|[1514, albert, pr...|(1000,[0,4,5,7,9,...|
|[19514, events, 1...|(1000,[0,1,2,3,5,...|
|[185514, spiritua...|(1000,[0,5,7,9,12...|
|[5295514, narrato...|(1000,[1,3,5,7,9,...|
|[5297514, lisy, l...|(1000,[35,85,108,...|
|[5299514, indian,...|(1000,[15,21,27,4...|
|[5307514, many, 5...|(1000,[21,22,58,7...|
|[5322514, british...|(1000,[6,147,180,...|
|[5323514, femalew...|(1000,[1,4,5,7,17...|
|[5325514, gluvian...|(1000,[12,58,137,...|
|[5332514, birthpl...|(1000,[1

# TF-IDF

In [79]:
from pyspark.ml.feature import IDF

# Fit IDF weights on the hashed term frequencies and rescale them into TF-IDF features.
idf_model = IDF(inputCol='TF_Features', outputCol='TF_IDF_Features').fit(hasher_df)

# Keep the transformed DataFrame in TF_IDF. DataFrame.show() returns None, so
# chaining .show() onto the assignment would leave TF_IDF bound to None.
TF_IDF = idf_model.transform(hasher_df)
TF_IDF.show(20)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|            filename|             content|              tokens|       tokensRefined|      tokensRefined2|         TF_Features|     TF_IDF_Features|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|file:/content/gdr...|@@1514 Albert of ...|[@@1514, albert, ...|[1514, albert, pr...|[1514, albert, pr...|(1000,[0,4,5,7,9,...|(1000,[0,4,5,7,9,...|
|file:/content/gdr...|@@19514 # Events ...|[@@19514, #, even...|[19514, events, 1...|[19514, events, 1...|(1000,[0,1,2,3,5,...|(1000,[0,1,2,3,5,...|
|file:/content/gdr...|@@185514 Spiritua...|[@@185514, spirit...|[185514, spiritua...|[185514, spiritua...|(1000,[0,5,7,9,12...|(1000,[0,5,7,9,12...|
|file:/content/gdr...|@@5295514 narrato...|[@@5295514, narra...|[5295514, narrato...|[5295514, narrato...|