 Preparação de Dados

In [1]:
!pip install pyspark



In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler, MinMaxScaler, PCA
from pyspark.ml.clustering import KMeans
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import ClusteringEvaluator

In [3]:
spark = SparkSession.builder.getOrCreate()

In [4]:
# Leia o arquivo ‘videos-tratados.snappy.parquet' no dataframe 'df_video'
df_video = spark.read.option('header', 'true').parquet('videos-comments-tratados.snappy.parquet')

In [5]:
# Adicione a coluna 'Month' com o valor do mês da coluna "Published At"

from pyspark.sql.functions import month

df_video = df_video.withColumn('Month', month('Published At'))

df_video.select('Published At', 'Month').show(5)

+------------+-----+
|Published At|Month|
+------------+-----+
|  2022-08-23|    8|
|  2022-08-23|    8|
|  2022-08-23|    8|
|  2022-08-23|    8|
|  2022-08-23|    8|
+------------+-----+
only showing top 5 rows


In [6]:
# Adicione a coluna "Keyword Index" com a transformação da coluna 'keyword' para valores numéricos

from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="keyword", outputCol="Keyword Index")
df_video = indexer.fit(df_video).transform(df_video)

df_video.select('keyword', 'Keyword Index').show(5)

+-------+-------------+
|keyword|Keyword Index|
+-------+-------------+
|   tech|         17.0|
|   tech|         17.0|
|   tech|         17.0|
|   tech|         17.0|
|   tech|         17.0|
+-------+-------------+
only showing top 5 rows


In [10]:
from pyspark.sql.functions import col

# Convertendo as colunas para o tipo integer (numérico)
df_video = df_video \
    .withColumn("Likes", col("Likes").cast("integer")) \
    .withColumn("Views", col("Views").cast("integer")) \
    .withColumn("Year", col("Year").cast("integer")) \
    .withColumn("Month", col("Month").cast("integer")) \
    .withColumn("Keyword Index", col("Keyword Index").cast("integer"))

# Após isso, suas colunas estarão no formato certo para uso no VectorAssembler
df_video.printSchema()  # Para conferir os tipos das colunas

root
 |-- Video ID: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- Comments: integer (nullable = true)
 |-- Views: integer (nullable = true)
 |-- Interaction: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Comment: string (nullable = true)
 |-- Sentiment: integer (nullable = true)
 |-- Likes Comment: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Keyword Index: integer (nullable = true)



In [11]:
# Adicione a coluna "Features Normal" com os dados normalizados da coluna Features, lembrando que para normalizar a coluna não pode conter valores nulos

from pyspark.ml.feature import VectorAssembler

# Lista dos campos que vão compor o vetor 'Features'
feature_columns = ['Likes', 'Views', 'Year', 'Month', 'Keyword Index']

# Criando o objeto VectorAssembler
assembler = VectorAssembler(inputCols=feature_columns, outputCol='Features')

# Transformando o DataFrame com o assembler
df_video = assembler.transform(df_video)

df_video.select('Features').show(5, truncate=False)

+---------------------------------+
|Features                         |
+---------------------------------+
|[3407.0,135612.0,2022.0,8.0,17.0]|
|[3407.0,135612.0,2022.0,8.0,17.0]|
|[3407.0,135612.0,2022.0,8.0,17.0]|
|[3407.0,135612.0,2022.0,8.0,17.0]|
|[3407.0,135612.0,2022.0,8.0,17.0]|
+---------------------------------+
only showing top 5 rows


In [13]:
from pyspark.ml.feature import Normalizer

# Removendo linhas que possuam valores nulos na coluna 'Features', se necessário
df_video = df_video.na.drop(subset=['Features'])

# Criando o objeto Normalizer
normalizer = Normalizer(inputCol="Features", outputCol="Features Normal", p=2.0)

# Aplicando normalização
df_video = normalizer.transform(df_video)

# Visualizando
df_video.select('Features', 'Features Normal').show(5, truncate=False)

+---------------------------------+--------------------------------------------------------------------------------------------------------+
|Features                         |Features Normal                                                                                         |
+---------------------------------+--------------------------------------------------------------------------------------------------------+
|[3407.0,135612.0,2022.0,8.0,17.0]|[0.02511243093431334,0.9995735203592899,0.014903826049070024,5.896667081728991E-5,1.2530417548674107E-4]|
|[3407.0,135612.0,2022.0,8.0,17.0]|[0.02511243093431334,0.9995735203592899,0.014903826049070024,5.896667081728991E-5,1.2530417548674107E-4]|
|[3407.0,135612.0,2022.0,8.0,17.0]|[0.02511243093431334,0.9995735203592899,0.014903826049070024,5.896667081728991E-5,1.2530417548674107E-4]|
|[3407.0,135612.0,2022.0,8.0,17.0]|[0.02511243093431334,0.9995735203592899,0.014903826049070024,5.896667081728991E-5,1.2530417548674107E-4]|
|[3407.0,1356

In [14]:
# Adicione a coluna "Features PCA" com a redução de 5 características para 1, utilizando o modelo PCA

from pyspark.ml.feature import PCA

# Configurando o PCA para reduzir de 5 para 1 dimensão
pca = PCA(k=1, inputCol="Features", outputCol="Features PCA")
pca_model = pca.fit(df_video)

# Transformando o dataframe
df_video = pca_model.transform(df_video)

# Visualizando o resultado
df_video.select('Features', 'Features PCA').show(5, truncate=False)

+---------------------------------+---------------------+
|Features                         |Features PCA         |
+---------------------------------+---------------------+
|[3407.0,135612.0,2022.0,8.0,17.0]|[-135636.63188203107]|
|[3407.0,135612.0,2022.0,8.0,17.0]|[-135636.63188203107]|
|[3407.0,135612.0,2022.0,8.0,17.0]|[-135636.63188203107]|
|[3407.0,135612.0,2022.0,8.0,17.0]|[-135636.63188203107]|
|[3407.0,135612.0,2022.0,8.0,17.0]|[-135636.63188203107]|
+---------------------------------+---------------------+
only showing top 5 rows


In [15]:
# Separando o DataFrame em 80% para treino e 20% para teste
train_df, test_df = df_video.randomSplit([0.8, 0.2], seed=42)

# Verificando o tamanho dos conjuntos
print("Treinamento:", train_df.count())
print("Teste:", test_df.count())

Treinamento: 14789
Teste: 3620


In [17]:
# Crie um modelo de regressão linear para estimar o valor do campo "Comments", utilizando a "Features Normal" e avalie o modelo

from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Definindo o modelo de regressão linear
lr = LinearRegression(featuresCol="Features Normal", labelCol="Comments")

# Treinando o modelo
lr_model = lr.fit(train_df)

# Fazendi previsões no conjunto de teste
predictions = lr_model.transform(test_df)

# Avaliando o modelo, por exemplo, usando RMSE
evaluator = RegressionEvaluator(
    labelCol="Comments",
    predictionCol="prediction",
    metricName="rmse"
)
rmse = evaluator.evaluate(predictions)
print("RMSE no conjunto de teste:", rmse)


RMSE no conjunto de teste: 43345.23343236093


In [18]:
# Salve o dataframe df_video como 'videos-preparados-parquet' no formato parquet


df_video.write.mode('overwrite').parquet('videos-preparados-parquet')

In [20]:
spark.stop()