In [None]:
import pickle
import sqlite3

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from nltk.tokenize import TweetTokenizer
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
# Fetch the NLTK stop-word corpus; a later cell reads it through
# nltk.corpus.stopwords.words('portuguese').
nltk.download('stopwords')

In [None]:
# Load the comment bodies of each subreddit from the local SQLite file.
# The subreddit name is bound as a parameter: a double-quoted "value" in
# SQLite is resolved as an identifier first and only falls back to a string
# literal when no such column exists, so the inline form was fragile.
SUBREDDIT_COMMENTS_QUERY = """
select Comment.body, Subreddit.name from Comment
INNER JOIN Subreddit ON Comment.subreddit_id = Subreddit.id
WHERE Subreddit.name = ?
"""

conn = sqlite3.connect('Reddit.db')
try:
    brasildob = pd.read_sql_query(SUBREDDIT_COMMENTS_QUERY, conn, params=("BrasildoB",))
    brasil = pd.read_sql_query(SUBREDDIT_COMMENTS_QUERY, conn, params=("brasil",))
    brasilivre = pd.read_sql_query(SUBREDDIT_COMMENTS_QUERY, conn, params=("brasilivre",))
finally:
    # No later cell uses the connection, so release the file handle here.
    conn.close()

In [None]:
# Stack the three subreddits and balance the classes by randomly discarding
# comments from the larger ones. The sampler is seeded so that a
# Restart & Run All selects the same comments every time.
bigdata = pd.concat([brasil, brasilivre, brasildob])
X, y = RandomUnderSampler(random_state=42).fit_resample(
    # fit_resample expects a 2-D feature matrix, hence the single-column reshape.
    bigdata["body"].to_numpy().reshape(-1, 1),
    bigdata["name"],
)

In [None]:
# Rebuild the working frame from the resampled arrays: the single-column
# feature matrix is flattened back into a 1-D "body" column and the class
# labels become the "subreddit" column.
bigdata = pd.DataFrame(
    {
        "body": X.flatten(),
        "subreddit": y,
    }
)
# Optional down-sampling for quick experiments (disabled):
# bigdata = bigdata.sample(n=500, random_state=42)

In [None]:
# Fit a TF-IDF vectoriser on the comment bodies using the tweet tokenizer and
# keep a token -> IDF weight lookup. Note that the dictionary stores the
# corpus-level IDF values (vect.idf_), not per-document TF-IDF scores.
vect = TfidfVectorizer(tokenizer=TweetTokenizer().tokenize)
vect.fit(bigdata.body)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate get_feature_names_out().
feature_names = (
    vect.get_feature_names_out()
    if hasattr(vect, "get_feature_names_out")
    else vect.get_feature_names()
)
tfidf_dict = dict(zip(feature_names, vect.idf_))

In [None]:
from gensim.models import Word2Vec

# Tokenizer and Portuguese stop-word set. The stop-word filter itself is
# currently disabled (see the commented line below), so stop_words is built
# but not applied in this cell.
tknzr = TweetTokenizer()
stop_words = set(nltk.corpus.stopwords.words('portuguese'))

# Lower-case every comment before tokenising.
texts = bigdata["body"].str.lower()
#texts = texts.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

# One token list per comment; this rebinds X, which previously held the
# resampled feature matrix.
X = texts.map(tknzr.tokenize)

# Train 100-dimensional word vectors; min_count=1 keeps every token that
# occurs at least once.
emb = Word2Vec(
    X,
    vector_size=100,
    min_count=1,
    workers=8,
)

In [None]:
# Collect every vocabulary word and its embedding vector, in vocabulary order.
labels = list(emb.wv.key_to_index)
tokens = [emb.wv[word] for word in labels]

# Reduce the word vectors to 50 dimensions with PCA, then project them to 2-D
# with t-SNE for plotting. Both steps are seeded so the projection is
# reproducible between runs.
pca = PCA(n_components=50, random_state=42)
pca_values = pca.fit_transform(tokens)
tsne = TSNE(perplexity=40, n_components=2, random_state=42)
new_values = tsne.fit_transform(pca_values)

In [None]:
# 2-D t-SNE projection of the word vectors. x and y are passed by keyword:
# seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=new_values[:, 0], y=new_values[:, 1])

In [None]:
def embedding_transform(X, tfidf, w2v):
    """Turn tokenised documents into weighted-average word-embedding vectors.

    For every word of a document the vector ``w2v.wv[word]`` is scaled by
    ``tfidf[word]``; the scaled vectors of the document are then averaged
    component-wise.

    Parameters
    ----------
    X : iterable of iterables
        One sequence of tokens per document.
    tfidf : mapping
        Token -> scalar weight. Tokens missing from it are skipped.
    w2v : object exposing ``.wv``
        ``w2v.wv[word]`` must return the word's vector and raise ``KeyError``
        for unknown words; such words are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per document (in input order, default integer index) and one
        column per embedding dimension. A document without any usable word
        yields an all-NaN row. If no document has a usable word the frame has
        zero columns.
    """
    doc_means = []
    for doc in X:
        weighted = []
        for word in doc:
            try:
                weighted.append(tfidf[word] * w2v.wv[word])
            except KeyError:
                # Word absent from the weight table or the embedding
                # vocabulary: leave it out of the average. Any other error
                # is a real problem and is allowed to propagate.
                continue
        if weighted:
            doc_means.append(np.asarray(weighted, dtype=float).mean(axis=0))
        else:
            doc_means.append(None)

    # Embedding width is taken from the first document that produced a vector.
    dim = next((len(vec) for vec in doc_means if vec is not None), 0)

    # Fill a preallocated NaN matrix once instead of growing DataFrames with
    # pd.concat inside the loops, which was quadratic in the number of rows.
    matrix = np.full((len(doc_means), dim), np.nan)
    for row, vec in enumerate(doc_means):
        if vec is not None:
            matrix[row] = vec
    return pd.DataFrame(matrix)

In [None]:
# Build one embedding row per tokenised comment. The Series is passed
# directly: X.iloc is an indexer object that is only iterable through the
# legacy __getitem__/IndexError fallback, which is slow and accidental.
X_emb = embedding_transform(X, tfidf_dict, emb)
X_emb.head()

In [None]:
# Row labels of the comments whose embedding contains at least one NaN.
has_nan = X_emb.isna().any(axis=1)
indexes_na = X_emb.index[has_nan].tolist()
indexes_na

In [None]:
# Remove the rows without a usable embedding from both the feature matrix and
# the metadata frame so that the two stay aligned row for row.
# Reassignment replaces inplace=True (no hidden mutation of the frame shown
# by an earlier cell), and drop=True stops reset_index from adding a stray
# "index" column to bigdata.
X_emb = X_emb.drop(index=indexes_na)
bigdata = bigdata.reset_index(drop=True).drop(index=indexes_na)

In [None]:
# Reuse the saved snapshot of bigdata when it exists; otherwise create it from
# the frame built above so the notebook also runs on a fresh checkout.
# SECURITY NOTE: pickle.load can execute arbitrary code; only load snapshot
# files that you created yourself.
try:
    with open("bigdata.pickle", "rb") as snapshot:
        bigdata = pickle.load(snapshot)
except FileNotFoundError:
    with open("bigdata.pickle", "wb") as snapshot:
        pickle.dump(bigdata, snapshot)

# The labels in bigdata are paired with the rows of X_emb further down, so a
# snapshot with a different number of rows cannot belong to this run.
# (Equal length does not prove the rows match; delete the snapshot whenever
# the sampling or the embedding step changes.)
if len(bigdata) != len(X_emb):
    raise ValueError(
        "bigdata.pickle has %d rows but X_emb has %d; delete the stale snapshot and re-run"
        % (len(bigdata), len(X_emb))
    )
bigdata.head()

In [None]:
# Compress the document embeddings to 50 dimensions with PCA, then embed them
# with t-SNE. Both steps are seeded so the layout is reproducible.
# NOTE(review): t-SNE is asked for 3 components, but only the first two
# columns of tsne_data are used when emb_data is built. Confirm whether a
# 2-component embedding was intended.
pca = PCA(n_components=50, random_state=42)
pca_data = pca.fit_transform(X_emb)
tsne = TSNE(perplexity=40, n_components=3, random_state=42)
tsne_data = tsne.fit_transform(pca_data)

In [None]:
# Pair the first two t-SNE coordinates with each comment's text and subreddit.
# The frame takes its index from the bigdata columns, so tsne_data must have
# exactly one row per row of bigdata.
emb_data = pd.DataFrame({"Attr1":tsne_data[:,0],
                        "Attr2":tsne_data[:,1],
                        'body':bigdata.body,
                        "subreddit":bigdata.subreddit})

# Persist the projection; the context manager closes the file even if
# pickling fails.
with open("emb_data_pca_tsne.pickle", "wb") as out_file:
    pickle.dump(emb_data, out_file)

In [None]:
# Data shown in the plots below. To hide one community, filter here, e.g.
# emb_data[emb_data.subreddit != "brasil"].
to_plot = emb_data
plt.figure(figsize=(15,10))
# x and y are passed by keyword: seaborn >= 0.12 rejects positional x/y.
sns.scatterplot(x="Attr1", y="Attr2", hue="subreddit", data=to_plot)

In [None]:
import plotly
import plotly.express as px

# Interactive version of the scatter plot: colour and marker symbol both
# encode the subreddit, and hovering a point shows the comment text.
fig = px.scatter(
    emb_data,
    x="Attr1",
    y="Attr2",
    color="subreddit",
    symbol="subreddit",
    hover_data=["body"],
)
# Write the figure to a standalone HTML file.
plotly.offline.plot(fig, filename="fig.html")
#fig.show(renderer="browser")

In [None]:
# Number of plotted comments per subreddit.
sns.histplot(to_plot["subreddit"])

In [None]:
# Per-subreddit density of each t-SNE coordinate. Both calls draw on the same
# axes. The column is passed as x= because seaborn >= 0.12 treats the first
# positional argument as `data`.
sns.kdeplot(x="Attr1", hue="subreddit", data=to_plot)
sns.kdeplot(x="Attr2", hue="subreddit", data=to_plot)

In [None]:
# Distribution of the first t-SNE coordinate for each subreddit.
sns.boxplot(data=to_plot, x="subreddit", y="Attr1")

In [None]:
# Distribution of the second t-SNE coordinate for each subreddit.
sns.boxplot(data=to_plot, x="subreddit", y="Attr2")
