# Análise exploratória de dados: tweets sobre Covid-19 e textos do PubMed

## 1. Instalação dos pacotes necessários para EDA

In [None]:
# Executar este bloco caso os pacotes ainda não estejam instalados
# install.packages("tidyverse")
# install.packages("magrittr")
# install.packages("tm")
# install.packages("wordcloud")

## 2. Carregando os pacotes

In [None]:
library(wordcloud)
library(magrittr)
library(tm)
library(tidyverse)
library(tidyr)
library(cluster)

## 3. Leitura dos dados

In [None]:
# Load the PubMed text file as a character vector, one element per line.
covid <- readLines(con = "PM-COVID-4064-TM.txt")

In [None]:
# Load the PubMed text file as a data frame with one row per line, shaped
# the way tm::DataframeSource expects it: columns `doc_id` and `text`.
#
# `quote = ""` turns quote handling off; with read.delim's default
# (`quote = "\""`) a double quote inside the text could merge the following
# lines into a single field. `stringsAsFactors = FALSE` keeps the text as
# character on R versions older than 4.0 as well.
covid_df <- read.delim(
  "PM-COVID-4064-TM.txt",
  header = FALSE,
  sep = "\n",
  quote = "",
  stringsAsFactors = FALSE
)
colnames(covid_df) <- "text"

# seq_len() returns an empty sequence when the file yields no rows, whereas
# seq.int(0) counts down (1, 0) and the assignment would fail.
covid_df$doc_id <- seq_len(nrow(covid_df))

# Select by name rather than position so the column order is explicit.
covid_df <- covid_df[, c("doc_id", "text")]

In [None]:
# Import the tweets dataset from its CSV file.
tweets_covid <- read_csv(file = "covid19_tweets.csv")

In [None]:
# Compact overview of the columns and their types.
tweets_covid %>%
  glimpse()

In [None]:
# Narrow the tweets table to the location, date, text and source columns.
tweets_covid <- select(tweets_covid, user_location, date, text, source)

In [None]:
# Give the retained columns Portuguese names (new_name = old_name).
tweets_covid <- rename(
  tweets_covid,
  localizacao = user_location,
  data_hora_publicacao = date,
  texto = text,
  origem = source
)

In [None]:
# First five rows of the tweets table.
head(tweets_covid, n = 5)

In [None]:
# Last five rows of the tweets table.
tail(tweets_covid, n = 5)

In [None]:
# PubMed text read line by line: wrap the character vector in a tm source
# and build a corpus from it.
covid_source <- VectorSource(x = covid)
covid_corpus <- VCorpus(x = covid_source)

# PubMed text read as a data frame: same two steps through DataframeSource.
covid_df_source <- DataframeSource(x = covid_df)
covid_df_corpus <- VCorpus(x = covid_df_source)

In [None]:
# Tweets: use the `texto` column as the document source, then build the
# corpus from that source.
tweets_covid_source <- VectorSource(x = tweets_covid[["texto"]])
tweets_covid_corpus <- VCorpus(x = tweets_covid_source)

In [None]:
# Summaries of both PubMed corpora, followed by the second document of the
# line-based corpus: first the document object, then its text content.
print(covid_corpus)
print(covid_df_corpus)
covid_corpus[[2]]
covid_corpus[[2]][["content"]]

In [None]:
# Summary of the tweets corpus, its second document, and the text content
# of its fifth document.
print(tweets_covid_corpus)
tweets_covid_corpus[[2]]
tweets_covid_corpus[[5]][["content"]]

## 4. Limpeza dos Dados

In [None]:
# Display the stop-word lists for English ("en"), Portuguese ("pt") and
# Spanish ("es").
stopwords(kind = "en")
stopwords(kind = "pt")
stopwords(kind = "es")

In [None]:
# Words to strip from the corpora: three hand-picked terms followed by the
# English stop-word list.
new_stops <- c(
  "COVID",
  "the",
  "The",
  stopwords(kind = "en")
)

In [None]:
# Normalise and clean the PubMed corpus.
#
# Each step takes the result of the previous step as its input. Starting
# every call from `covid_corpus` again would throw away the earlier steps
# and leave only the whitespace clean-up applied.
covid_clean <- tm_map(covid_corpus, removeWords, words = new_stops)
covid_clean <- tm_map(covid_clean, removePunctuation)
# Done last so the gaps left by the removed words and punctuation collapse.
covid_clean <- tm_map(covid_clean, stripWhitespace)

# Spot-check the cleaned text of the second document.
covid_clean[[2]]$content

In [None]:
# Normalise and clean the tweets corpus.

# Delete every run of non-whitespace characters that starts with "http".
removerURL <- function(texto) {
  gsub("http[^[:space:]]*", "", texto)
}

# The steps run in sequence, each on the output of the one before:
# stop words, URLs, punctuation, digits, then surplus whitespace.
tweets_covid_corpus <- tweets_covid_corpus %>%
  tm_map(removeWords, words = new_stops) %>%
  tm_map(content_transformer(removerURL)) %>%
  tm_map(content_transformer(removePunctuation)) %>%
  tm_map(content_transformer(removeNumbers)) %>%
  tm_map(content_transformer(stripWhitespace))

# Spot-check the cleaned text of the second tweet.
tweets_covid_corpus[[2]]$content

In [None]:
# Document-term matrix of the cleaned PubMed corpus, built with tm's default
# control settings, plus a dense copy for use with base matrix functions.
covid_dtm <- DocumentTermMatrix(x = covid_clean)
covid_dtm
covid_m1 <- as.matrix(x = covid_dtm)

In [None]:
# Term-document matrix of the same cleaned corpus (the orientation that the
# later rowSums() call works on), plus a dense copy.
covid_tdm <- TermDocumentMatrix(x = covid_clean)
covid_tdm
covid_m2 <- as.matrix(x = covid_tdm)

## 5. Visualização dos Dados

In [None]:
# Total count of each term across all documents, most frequent first.
term_frequency_covid <- sort(rowSums(covid_m2), decreasing = TRUE)

# Ranks 1-100.
head(term_frequency_covid, 100)

# Ranks 101-200. Dropping the first 100 and taking the head avoids showing
# rank 100 twice and avoids NA padding when there are fewer than 200 terms.
head(term_frequency_covid[-seq_len(100)], 100)

In [None]:
# Term/frequency table that feeds the word clouds.
term <- names(term_frequency_covid)
num <- term_frequency_covid
# stringsAsFactors = FALSE keeps `term` as character on R < 4.0 too.
word_freqs <- data.frame(term, num, stringsAsFactors = FALSE)

# Seed the RNG so the word placement is the same on every run, matching the
# seed used by the other word-cloud cell of this notebook.
set.seed(1234)
wordcloud(word_freqs$term, word_freqs$num, max.words = 100, colors = "red")

In [None]:
# Per-term totals taken from the document-term matrix (terms are columns),
# ordered from most to least frequent.
freq <- colSums(as.matrix(covid_dtm))
freq <- sort(freq, decreasing = TRUE)

# Bar chart of the first ten entries; las = 2 draws the axis labels
# perpendicular to the axis.
barplot(freq[seq_len(10)], col = "lightgreen", las = 2)

In [None]:
# Hierarchical clustering of terms.

# Drop terms whose sparsity exceeds the 0.80 threshold.
dtmss <- removeSparseTerms(covid_dtm, 0.80)
dtmss

# Transpose so that terms are the observations, then cluster on Euclidean
# distance with complete linkage.
d <- dist(t(dtmss), method = "euclidean")
fit <- hclust(d = d, method = "complete")
fit

# NOTE(review): hang = 1 is kept from the original; a negative value such as
# hang = -1 is the usual choice to align labels at the bottom - confirm intent.
plot(fit, hang = 1)

# Ask for up to 6 clusters, capped by what the tree can provide:
# rect.hclust() only accepts k between 2 and (number of terms - 1), so a
# hard-coded 6 fails when few terms survive the sparsity filter.
n_terms <- length(fit$order)
n_clusters <- min(6L, n_terms - 1L)
groups <- cutree(fit, k = n_clusters)
if (n_clusters >= 2L) {
  rect.hclust(fit, k = n_clusters, border = "red")
}

In [None]:
# K-means clustering of terms, using the term-to-term distance matrix as the
# input data, followed by a cluster plot.
d <- dist(t(dtmss), method = "euclidean")

# kmeans() picks its starting centres at random; seeding makes the cluster
# assignment and the plot reproducible between runs.
set.seed(1234)
kfit <- kmeans(d, 2)

# TRUE/FALSE spelled out: T and F are ordinary variables that can be
# reassigned.
clusplot(
  as.matrix(d),
  kfit$cluster,
  color = TRUE,
  shade = TRUE,
  labels = 2,
  lines = 0
)

In [None]:
# Coloured word cloud of up to 200 terms. The seed fixes the random layout;
# random.order = FALSE plots the most frequent words first.
set.seed(1234)
wordcloud(
  words = word_freqs$term,
  freq = word_freqs$num,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)