In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

import pandas as pd

from collections import Counter
import string

from database import engine
from models import Insights


In [None]:


# Load the dataset.
# The path uses forward slashes on purpose: they resolve on Windows, macOS and
# Linux alike, whereas a backslash separator inside a normal string literal is
# Windows-only and is parsed as an escape sequence ('\d' is not a valid one).
file_path = 'source/dataset_instagram-post-scraper_2024-02-08_14-54-37-568.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe to understand its structure
data.head()


In [None]:
# Build a single lowercase, punctuation-free string out of every caption.
text_data = (
    ' '.join(data['caption'].dropna().astype(str))  # missing captions are skipped, the rest coerced to str
    .lower()
    .translate(str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'))  # delete ASCII punctuation
)

# Stop words (mostly Portuguese function words and verb forms, plus a few
# social-media fillers such as 'link', 'bio' and 'tbt') excluded from the
# word clouds and word counts. Repeated entries are harmless in a set.
custom_stopwords = {
    'de', 'a', 'o', 'que', 'e', 'do', 'da', 'em', 'um', 'para', 'com', 'não', 'uma', 'os', 'no', 'se', 'na', 'por',
    'mais', 'as', 'dos', 'como', 'mas', 'foi', 'ao', 'ele', 'das', 'tem', 'à', 'seu', 'sua', 'ou', 'ser', 'quando',
    'muito', 'nos', 'já', 'eu', 'também', 'só', 'pelo', 'pela', 'até', 'isso', 'ela', 'entre', 'era', 'depois',
    'sem', 'mesmo', 'aos', 'ter', 'seus', 'quem', 'nas', 'me', 'esse', 'eles', 'estão', 'você', 'tinha', 'foram',
    'essa', 'num', 'nem', 'suas', 'meu', 'às', 'minha', 'têm', 'numa', 'pelos', 'elas', 'há', 'seja', 'qual', 'será',
    'nós', 'tenho', 'lhe', 'deles', 'essas', 'esses', 'pelas', 'este', 'fosse', 'dele', 'tu', 'te', 'vocês', 'vos',
    'lhes', 'meus', 'minhas', 'teu', 'tua', 'teus', 'tuas', 'nosso', 'nossa', 'nossos', 'nossas', 'dela', 'delas',
    'esta', 'estes', 'estas', 'aquele', 'aquela', 'aqueles', 'aquelas', 'isto', 'aquilo', 'estou', 'está', 'estamos',
    'estão', 'estive', 'esteve', 'estivemos', 'estiveram', 'estava', 'estávamos', 'estavam', 'estivera', 'estivéramos',
    'esteja', 'estejamos', 'estejam', 'estivesse', 'estivéssemos', 'estivessem', 'estiver', 'estivermos', 'estiverem',
    'hei', 'há', 'havemos', 'hão', 'houve', 'houvemos', 'houveram', 'houvera', 'houvéramos', 'haja', 'hajamos', 'hajam',
    'houvesse', 'houvéssemos', 'houvessem', 'houver', 'houvermos', 'houverem', 'houverei', 'houverá', 'houveremos',
    'houverão', 'houveria', 'houveríamos', 'houveriam', 'sou', 'somos', 'são', 'era', 'éramos', 'eram', 'fui', 'foi',
    'fomos', 'foram', 'fora', 'fôramos', 'seja', 'sejamos', 'sejam', 'fosse', 'fôssemos', 'fossem', 'for', 'formos',
    'forem', 'serei', 'será', 'seremos', 'serão', 'seria', 'seríamos', 'seriam', 'tenho', 'tem', 'temos', 'tém', 'tinha',
    'tínhamos', 'tinham', 'tive', 'teve', 'tivemos', 'tiveram', 'tivera', 'tivéramos', 'tenha', 'tenhamos', 'tenham',
    'tivesse', 'tivéssemos', 'tivessem', 'tiver', 'tivermos', 'tiverem', 'terei', 'terá', 'teremos', 'terão', 'teria',
    'teríamos', 'teriam', 'é', 'todo', 'dia', 'hoje', 'sobre', 'deu', 'sempre', 'sobre', 'toda', 'todos', 'dia', 'dias', 'tá',
    'todas', 'hoje', 'agora', 'aqui', 'vamos', 'vai', 'tudo', 'vamo', 'vem', 'aí', 'além', 'alem', 'link', 'bio', 'pra', 'junto',
    'pode', 'fazer', 'outra', 'ainda', 'assim', 'nesse', 'onde', 'precisa', 'ontem', 'muita', 'cada', 'dessa', 'tbt', 'quer',
    'outro', 'nessa', 'vez', 'desse', 'pois', 'desde',
}

# Render a word cloud from the combined caption text, leaving out the custom stop words
wordcloud = WordCloud(
    stopwords=custom_stopwords,
    background_color='white',
    width=800,
    height=400,
).generate(text_data)

# Show the rendered cloud as an image on a 10x5 inch figure
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # axes carry no information for a word cloud
plt.show()

In [None]:
# Function to clean and tokenize text
def clean_and_tokenize(text):
    """Split ``text`` into lowercase tokens without punctuation or stop words.

    ASCII punctuation (``string.punctuation``) is deleted first, the remainder
    is split on whitespace and each token is lowercased. Tokens contained in
    the module-level ``stopwords`` collection are left out.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the remaining lowercase tokens, in their original order.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    lowered_tokens = (token.lower() for token in text.translate(punctuation_table).split())
    return [token for token in lowered_tokens if token not in stopwords]

# Stop-word lookup read by clean_and_tokenize: a set copy of 'custom_stopwords'
stopwords = set(custom_stopwords)

# Merge every non-missing caption into one space-separated string
all_captions = ' '.join(data['caption'].dropna().astype(str))

# Tokenize the merged text, then tally how often each token appears
tokens = clean_and_tokenize(all_captions)
word_counts = Counter()
word_counts.update(tokens)

# Keep the 20 most frequent words together with their counts
top_words = word_counts.most_common(20)

# Print the top 20 words and their counts
print("Top 20 words in the entire database:")
for word, count in top_words:
    print(f"{word}: {count}")


In [None]:
# Group the dataset by 'ownerUsername'
grouped_data = data.groupby('ownerUsername')

# Translation table that deletes ASCII punctuation; it is the same for every
# user, so it is built once instead of on each iteration.
punctuation_table = str.maketrans('', '', string.punctuation)

# Draw one word cloud per username
for username, group in grouped_data:
    # Captions of the current username, without missing values, as strings
    captions = group['caption'].dropna().astype(str)

    # One lowercase, punctuation-free string per username
    text_data = ' '.join(captions).lower().translate(punctuation_table)

    # Nothing left at all after preprocessing
    if not text_data.strip():
        print(f"No words to plot for {username}.")
        continue

    try:
        wordcloud = WordCloud(stopwords=custom_stopwords, background_color='white', width=800, height=400).generate(text_data)
    except ValueError:
        # Non-blank text can still contain no plottable word (for example only
        # stop words or digits); WordCloud raises ValueError in that case.
        # Report it and move on instead of aborting the whole loop.
        print(f"No words to plot for {username}.")
        continue

    # Plot the word cloud on its own figure
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for {username}')
    plt.axis('off')
    plt.show()



In [None]:
from collections import Counter
import string
import pandas as pd 

# 'custom_stopwords' holds the words to exclude; copy it into the 'stopwords' set read by clean_and_tokenize
stopwords = set(custom_stopwords)

# Função para limpar e tokenizar o texto
def clean_and_tokenize(text):
    """Return the lowercase tokens of ``text`` that are not stop words.

    The text is stripped of ASCII punctuation (``string.punctuation``) and
    split on whitespace. Each token is lowercased and kept only when it is
    not in the module-level ``stopwords`` collection.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase tokens in the order they occur in ``text``.
    """
    stripped = text.translate(str.maketrans('', '', string.punctuation))
    kept = []
    for raw_word in stripped.split():
        normalized = raw_word.lower()
        if normalized not in stopwords:
            kept.append(normalized)
    return kept

# One row per (user, word) pair is collected here
results = []

# Walk through each username group
for username, group in grouped_data:
    # Tally the tokens of every non-missing caption of this user
    word_counts = Counter()
    for caption in group['caption'].dropna().astype(str):
        word_counts.update(clean_and_tokenize(caption))

    # Keep the user's 10 most frequent words and their counts
    top_words = word_counts.most_common(10)

    # Append one result row per top word
    results.extend(
        {'Usuário': username, 'Palavra': word, 'Contagem': count}
        for word, count in top_words
    )

# Build a pandas DataFrame from the collected rows
df_results = pd.DataFrame(results)

# Save the DataFrame to an Excel file
df_results.to_excel('out/resultados_palavras.xlsx', index=False)


In [None]:
import base64
from io import BytesIO
import matplotlib.pyplot as plt
from wordcloud import WordCloud

import html  # standard library; escapes usernames before they are embedded in markup

# Collect the page as a list of fragments and join them once at the end.
# The charset declaration matches the UTF-8 encoding used when writing the file.
html_parts = ['<html><head><meta charset="utf-8"><title>Word Clouds</title></head><body>']

# Translation table that deletes ASCII punctuation; identical for every user
punctuation_table = str.maketrans('', '', string.punctuation)

for username, group in grouped_data:
    captions = group['caption'].dropna().astype(str)
    text_data = ' '.join(captions).lower().translate(punctuation_table)

    # Nothing left at all after preprocessing
    if not text_data.strip():
        print(f"No words to plot for {username}.")
        continue

    try:
        wordcloud = WordCloud(stopwords=custom_stopwords, background_color='white', width=800, height=400).generate(text_data)
    except ValueError:
        # Non-blank text can still contain no plottable word (for example only
        # stop words or digits); WordCloud raises ValueError in that case.
        # Skip this user instead of losing the whole report.
        print(f"No words to plot for {username}.")
        continue

    # Render the plot into an in-memory PNG
    buffer = BytesIO()
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for {username}')
    plt.axis('off')
    plt.savefig(buffer, format='png')
    plt.close()  # Close the plot to free up memory

    # Encode the image as base64 so it can be inlined as a data URI
    img_str = base64.b64encode(buffer.getvalue()).decode()

    # The username comes from the CSV, so escape it before putting it in markup
    safe_username = html.escape(str(username))
    html_parts.append(f'<h1>Word Cloud for {safe_username}</h1>')
    html_parts.append(f'<img src="data:image/png;base64,{img_str}"/><br/>')

# Finish the HTML document
html_parts.append("</body></html>")
html_content = ''.join(html_parts)

# Write the HTML content to a file; the explicit encoding keeps the output
# independent of the platform's default locale encoding
with open('out/word_clouds.html', 'w', encoding='utf-8') as f:
    f.write(html_content)


In [None]:
from docx import Document
from docx.shared import Inches
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import io

# Create a new Word document
doc = Document()

# Translation table that deletes ASCII punctuation; identical for every user
punctuation_table = str.maketrans('', '', string.punctuation)

for username, group in grouped_data:
    captions = group['caption'].dropna().astype(str)
    text_data = ' '.join(captions).lower().translate(punctuation_table)

    # Nothing left at all after preprocessing
    if not text_data.strip():
        print(f"No words to plot for {username}.")
        continue

    try:
        wordcloud = WordCloud(stopwords=custom_stopwords, background_color='white', width=800, height=400).generate(text_data)
    except ValueError:
        # Non-blank text can still contain no plottable word (for example only
        # stop words or digits); WordCloud raises ValueError in that case.
        # Skip this user instead of losing the whole document.
        print(f"No words to plot for {username}.")
        continue

    # Save the plot to a bytes object to avoid writing to disk.
    # A fresh buffer is created per user, so it never needs clearing afterwards.
    buf = io.BytesIO()
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for {username}')
    plt.axis('off')
    plt.savefig(buf, format='png')
    plt.close()  # Close the figure to free memory

    # Rewind so the picture is read from the start of the buffer
    buf.seek(0)

    # Heading, picture, then a page break so each user gets a separate page
    doc.add_heading(f'Word Cloud for {username}', level=1)
    doc.add_picture(buf, width=Inches(6))
    doc.add_page_break()

# Save the document
doc.save('out/word_clouds.docx')
