In [1]:
import os
import re
from unicodedata import normalize

import nltk
import pandas as pd
from pyspark.ml.feature import Tokenizer, StopWordsRemover, NGram
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, explode
from pyspark.sql.types import BooleanType, ArrayType, StringType


In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("SPARK TRAB - QUESTION 1 D")
    .getOrCreate()
)

In [3]:
# Read the tab-separated tweet file. It has no header row, so Spark assigns
# the positional column names _c0, _c1, ...
df_orig = (
    spark.read
    .option("header", "false")
    .option("delimiter", "\t")
    .csv("./data/debate-tweets.tsv")
)

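## Objetivo

Identificar os trigramas (sequências de três palavras) mais frequentes nos tweets que mencionam "Aécio".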

In [4]:
# Keep only the first two positional columns and rename them in one step.
df = df_orig.select(
    col("_c0").alias("id"),
    col("_c1").alias("content"),
)

df.show()

+------------------+--------------------+
|                id|             content|
+------------------+--------------------+
|522394422710136832|@anacddd verdade,...|
|522394422806581248|              Que ñ*|
|522394422731100160| Vou quebrar a Bruna|
|522394422810783745|agora vou p segun...|
|522394423137943553|Me sinto tão bem ...|
|522394423188271104|Eu estou aqui, de...|
|522394423238606848|Quando vai embora...|
|522394423528022016|@paynecaralhudo k...|
|522394423632875521|Conceição da Barr...|
|522394424010362881| @Maniavato te amo ♥|
|522394424048091138|Alg me curtindo rs ♡|
|522394424010358784|@MiiluAA No, porq...|
|522394423741906944|#EMABiggestFansJu...|
|522394424568213505|@raizabatista dev...|
|522394424920506368|Me senti ate d fe...|
|522394424811458560|qual o sentido de...|
|522394425029574656|I'm at Lava Rápid...|
|522394425121841153|Fica comentando m...|
|522394425461579777|"odeio que me man...|
|522394425960701952|CAMAMTEBABILONFRA...|
+------------------+--------------

In [5]:
def extract_aecio(text):
    """Tell whether ``text`` mentions "Aecio" as a whole word.

    Accents are stripped first (NFKD decomposition, then every non-ASCII
    character is dropped), and the match is case-insensitive, so "Aécio",
    "aecio" and "AÉCIO" are all recognised.

    Args:
        text: The string to inspect, or None (a Spark UDF receives None for
            null cells).

    Returns:
        bool: True when the name occurs as a standalone word, False
        otherwise, including for None input.
    """
    if text is None:
        # normalize() raises TypeError on None; a null row cannot mention the name.
        return False

    text_sem_acento = normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
    # The text is pure ASCII at this point, so only the plain "e" branch of
    # the character class can actually match.
    return re.search(r'\bA[eé]cio\b', text_sem_acento, re.IGNORECASE) is not None

def remove_punctuation(text):
    """Replace each character that is neither a word character nor whitespace
    with a single space.

    Args:
        text: The string to clean.

    Returns:
        str: ``text`` with the same length, where every such character has
        become a space. Underscores count as word characters and are kept.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub(' ', text)

In [6]:
# Expose the Python predicate to Spark as a UDF that returns a boolean.
extract_aecio_udf = udf(extract_aecio, BooleanType())

# Flag every row with the predicate result.
df = df.withColumn("hasAecio", extract_aecio_udf(col("content")))

# Keep only the flagged rows, with the helper column dropped again.
df_aecio = df.where(col("hasAecio")).select("id", "content")

In [7]:
# Expose remove_punctuation to Spark as a string-returning UDF.
remove_punctuation_udf = udf(remove_punctuation, StringType())

# Add a punctuation-free copy of the text, leaving the original column intact.
df_aecio = df_aecio.withColumn(
    "content_clean",
    remove_punctuation_udf(df_aecio["content"]),
)

In [8]:
# Split the cleaned text into a "words" array column.
tokenizer = Tokenizer(
    inputCol="content_clean",
    outputCol="words",
)
df_aecio = tokenizer.transform(df_aecio)

In [9]:
def remove_empty_tokens(tokens, min_length=3):
    """Drop tokens that are shorter than ``min_length`` characters.

    Despite the name, this removes more than empty strings: with the default
    threshold it also discards one- and two-character tokens.

    Args:
        tokens: Iterable of string tokens, or None.
        min_length: Minimum number of characters a token needs to be kept.
            Defaults to 3, the value that was previously hard-coded.

    Returns:
        list: The tokens that are long enough, in their original order.
        None input yields an empty list instead of raising.
    """
    if tokens is None:
        return []
    return [token for token in tokens if len(token) >= min_length]

In [10]:
# Portuguese stop-word list taken from NLTK's stopwords corpus.
stopwordList = nltk.corpus.stopwords.words('portuguese')

# Write the stop-word-free tokens to a new "filtered" column.
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    stopWords=stopwordList,
)
df_aecio = remover.transform(df_aecio)

remove_empty_tokens_udf = udf(remove_empty_tokens, ArrayType(StringType()))

# Overwrite "filtered" with the result of the token-length filter.
df_aecio = df_aecio.withColumn(
    "filtered",
    remove_empty_tokens_udf(df_aecio["filtered"]),
)

In [11]:
from functools import reduce

def generate_ngrams(wordsData, ngram_range):
    """Build n-grams of several sizes from the "filtered" column.

    For every n in the inclusive range, an ``NGram`` transformer adds an
    "ngram" column to ``wordsData``; the per-size results are then stacked
    with ``union``. Each input row therefore appears once per n in the result.

    Args:
        wordsData: DataFrame with a "filtered" array-of-strings column.
        ngram_range: ``(min_n, max_n)`` pair, both ends inclusive.

    Returns:
        DataFrame: Union of the transformed frames, one per n.

    Raises:
        ValueError: If ``min_n`` is below 1 or greater than ``max_n``.
            Previously an inverted range failed inside ``reduce`` with an
            unrelated-looking TypeError about an empty sequence.
    """
    min_n, max_n = ngram_range[0], ngram_range[1]
    if min_n < 1 or min_n > max_n:
        raise ValueError(
            "ngram_range must satisfy 1 <= min_n <= max_n, got %r" % (ngram_range,)
        )

    ngrams_data = [
        NGram(n=n, inputCol="filtered", outputCol="ngram").transform(wordsData)
        for n in range(min_n, max_n + 1)
    ]
    return reduce(lambda df1, df2: df1.union(df2), ngrams_data)

In [12]:
# Disabled alternative: build 2- to 5-grams in one go.
#   ngram_range = (2, 5)
#   df_aecio = generate_ngrams(df_aecio, ngram_range)

# Build trigrams from the filtered tokens into a new "ngram" column.
ngram = NGram(
    n=3,
    inputCol="filtered",
    outputCol="ngram",
)
df_aecio = ngram.transform(df_aecio)

df_aecio.show()  # OK

+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                id|             content|       content_clean|               words|            filtered|               ngram|
+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|522394798914015233|@KennedyAncar @Dr...| KennedyAncar  Dr...|[, kennedyancar, ...|[kennedyancar, dr...|[kennedyancar dri...|
|522394968611360768|O site do TSE-MG ...|O site do TSE MG ...|[o, site, do, tse...|[site, tse, ficou...|[site tse ficou, ...|
|522394979403325440|Olha isso @RealKa...|Olha isso  RealKa...|[olha, isso, , re...|[olha, realkajuru...|[olha realkajuru ...|
|522395075972976643|@Indianara_m tamb...| Indianara_m tamb...|[, indianara_m, t...|[indianara_m, que...|[indianara_m quer...|
|522395717797969920|@BlogdoNoblat gan...| BlogdoNoblat gan...|[, blogdonoblat, ...|[blogdonoblat, ga...|[blogdonoblat 

In [13]:
# One row per n-gram: flatten the "ngram" array column into "sentences".
ngrams = df_aecio.select(explode("ngram").alias("sentences"))

# Count each distinct n-gram and keep the 100 most frequent. The secondary
# sort on the n-gram text makes the selection deterministic when several
# n-grams tie on count at the cutoff.
top_results = (
    ngrams.groupBy("sentences")
    .count()
    .orderBy(col("count").desc(), col("sentences").asc())
    .limit(100)
)

In [14]:
# Bring the (at most 100-row) result to the driver as a pandas DataFrame.
dataframe_pd = top_results.toPandas()

output_path = './outcome/Q1/result_sentences_aecio.csv'
# to_csv does not create missing parent directories, so create them first;
# otherwise the export fails on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
dataframe_pd.to_csv(output_path, index=False)

In [15]:
# important_sentences_df.show()  # disabled: important_sentences_df is not defined in the cells shown here

In [16]:
# Shut down the Spark session; Spark cells above must be re-run from the
# session-creation cell before they can be used again.
spark.stop()

In [17]:
# TODO: reduce the number of features
# TODO: check whether the result can be shown without FINAL_FEATURES, and leave it at that