In [None]:
import pandas as pd
import numpy as np
import pickle
import pathlib
import glob
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
from statistics import mean

# Read in the document-term matrices (one per site, plus the combined one).
#
# NOTE(review): pd.read_pickle unpickles arbitrary Python objects; only load
# pickle files that were produced by a trusted step of this project.

def _load_dtm(pickle_name):
    """Load a pickled document-term matrix and return it transposed.

    The later cells sort each column and read the row labels as words, so the
    transposed frame is presumably terms-by-documents -- TODO confirm against
    the notebook that wrote the pickles.
    """
    return pd.read_pickle(pickle_name).transpose()


datahoroscope = _load_dtm('horoscopedtm.pkl')
dataganeshaspeaks = _load_dtm('ganeshaspeaksdtm.pkl')
dataastrology = _load_dtm('astrologydtm.pkl')
dataastrostyle = _load_dtm('astrostyledtm.pkl')
dataall = _load_dtm('alldtm.pkl')

# Sign names, used as dictionary keys and file names by the later cells.
horoscopes = ['aries', 'taurus', 'gemini', 'cancer', 'leo', 'virgo', 'libra', 'scorpio', 'sagittarius', 'capricorn', 'aquarius', 'pisces']


In [None]:
# Find the top 30 words for every column of each document-term matrix

def _top_words(dtm, n=30):
    """Return ``{column: [(row label, value), ...]}`` with the ``n`` largest values.

    Parameters
    ----------
    dtm : pandas.DataFrame
        Frame whose columns are ranked independently.
    n : int, optional
        Number of entries kept per column (default 30).

    Returns
    -------
    dict
        One list per column, ordered from the largest value downwards.
    """
    ranked = {}
    for column in dtm.columns:
        top = dtm[column].sort_values(ascending=False).head(n)
        ranked[column] = list(zip(top.index, top.values))
    return ranked


top_dict1 = _top_words(datahoroscope)
top_dict2 = _top_words(dataganeshaspeaks)
top_dict3 = _top_words(dataastrology)
top_dict4 = _top_words(dataastrostyle)
top_dict5 = _top_words(dataall)
                   

In [None]:
# Print the top 15 words of every column of the combined matrix
for horoscope, top_words in top_dict5.items():
    print(horoscope)
    # A slice end of 15 yields 15 items (indices 0..14).
    print(', '.join(word for word, _ in top_words[:15]))
    print('---')

In [None]:
# Pool the top words of every column of the combined matrix into one flat
# list (duplicates are kept on purpose so they can be counted afterwards).
words = []
for horoscope in dataall.columns:
    words.extend(word for word, _ in top_dict5[horoscope])

In [None]:
# Show how many columns list each word among their top words, most frequent
# first. Kept as a bare expression so the notebook displays the result.
Counter(words).most_common()

In [None]:
# Extra stop words, added to scikit-learn's English list in the next cell.
# The list below is hand-picked. An automatic alternative is kept (disabled)
# for reference: it would exclude every word that appears as a top word in
# more than 4 columns.
# NOTE(review): the original intent was "more than half of the horoscopes";
# with 12 signs that would be `count > 6`, not `count > 4` -- confirm the
# threshold before re-enabling the line.
#add_stop_words = [word for word, count in Counter(words).most_common() if count > 4]

add_stop_words = ['ganesha','youre']
# Bare expression so the notebook displays the final list.
add_stop_words

In [None]:
# NOTE(review): pd.read_pickle unpickles arbitrary Python objects; only load
# pickle files that were produced by a trusted step of this project.
allcorpus_clean = pd.read_pickle('allcorpus_clean.pkl')

# Add new stop words (kept as a frozenset; the word-cloud cell reuses it)
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recent scikit-learn releases validate constructor parameters and accept only
# 'english', a list or None for ``stop_words``, so hand the vectorizer a list.
# Sorting just makes the pickled vectorizer deterministic.
cv = CountVectorizer(stop_words=sorted(stop_words))
data_cv = cv.fit_transform(allcorpus_clean.interpretation)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on old installations.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

data_stop = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_stop.index = allcorpus_clean.index

# Persist the fitted vectorizer and the new document-term matrix; the context
# manager guarantees the file handle is flushed and closed.
with open("cv_stop.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)
data_stop.to_pickle("dtm_stop.pkl")

In [None]:
# One WordCloud generator reused for every sign; random_state is fixed so the
# drawing is repeatable between runs.
wc = WordCloud(stopwords=stop_words, background_color="white", colormap="Dark2",
               max_font_size=150, random_state=42)

plt.rcParams['figure.figsize'] = [16, 6]

# Create subplots for each horoscopes (Daily)
# NOTE(review): the 3x4 grid assumes at most 12 columns in ``dataall``.
for index, horoscope in enumerate(dataall.columns):
    wc.generate(allcorpus_clean.interpretation[horoscope])

    plt.subplot(3, 4, index+1)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    # Title the panel with the key whose text was actually drawn, so labels
    # stay correct even if the column order differs from ``horoscopes``.
    # Fall back to the positional label when the key is not a sign name.
    if isinstance(horoscope, str) and horoscope in horoscopes:
        panel_title = horoscope
    else:
        panel_title = horoscopes[index]
    plt.title(panel_title)

plt.show()

In [None]:
# Read ALL data from folders
#
# Layout read by this cell: Data/<folder>/<site>/<sign>.txt, where every file
# is a pickle.
# NOTE(review): pickle.load can execute arbitrary code; only run this on files
# written by this project's own scraper.

# Sorted so the entry order does not depend on the file system.
path_list = sorted(glob.glob("Data/*"))

# Keep only the folder name. Path.name works with both "/" and "\\"
# separators, and stray files directly under Data/ are skipped.
names = [pathlib.Path(path).name for path in path_list
         if pathlib.Path(path).is_dir()]

data_ganeshaspeaks = {}
data_horoscope = {}
data_astrology = {}
data_astrostyle = {}

# (site folder, destination dict, index of the text inside the pickled object)
# NOTE(review): ganeshaspeaks.com entries are read from index 1 while the other
# sites use index 0 -- presumably the scraper stores them differently; confirm.
site_targets = [
    ("ganeshaspeaks.com", data_ganeshaspeaks, 1),
    ("horoscope.com", data_horoscope, 0),
    ("astrology.com", data_astrology, 0),
    ("astrostyle.com", data_astrostyle, 0),
]

for c in horoscopes:
    for site, target, field in site_targets:
        entries = []
        for name in names:
            entry_path = pathlib.Path("Data") / name / site / (c + ".txt")
            with open(entry_path, "rb") as file:
                entries.append(pickle.load(file)[field])
        # One list per sign, in the same order as ``names``.
        target[c] = entries


In [None]:
def min_max_words(dic, signs=None):
    """Return the maximum, minimum and average word counts for a site.

    Parameters
    ----------
    dic : dict
        Maps a sign name to a list of text strings.
    signs : iterable of str, optional
        Keys of ``dic`` to include. Defaults to the notebook-level
        ``horoscopes`` list.

    Returns
    -------
    tuple
        ``(max_length, min_length, average_length)``. Words are
        whitespace-separated tokens. The minimum and the average ignore
        texts that contain no words at all.

    Raises
    ------
    ValueError
        If none of the texts contains a word.
    """
    if signs is None:
        signs = horoscopes

    # Walk over every stored text instead of relying on an external counter,
    # so the function works for any number of entries per sign.
    lengths = [len(entry.split()) for sign in signs for entry in dic[sign]]

    non_empty = [count for count in lengths if count > 0]
    if not non_empty:
        raise ValueError("no non-empty texts to compute word statistics from")

    return max(lengths), min(non_empty), mean(non_empty)

# Word-count statistics per site. min_max_words returns its values in
# (max, min, average) order, hence the order of the names in each group.
(
    (max_ganesha, min_ganesha, avr_ganesha),
    (max_horoscope, min_horoscope, avr_horoscope),
    (max_astrology, min_astrology, avr_astrology),
    (max_astrostyle, min_astrostyle, avr_astrostyle),
) = map(min_max_words,
        (data_ganeshaspeaks, data_horoscope, data_astrology, data_astrostyle))

In [None]:
def min_max_sentence(dic, signs=None):
    """Return the maximum, minimum and average sentence counts for a site.

    Parameters
    ----------
    dic : dict
        Maps a sign name to a list of text strings.
    signs : iterable of str, optional
        Keys of ``dic`` to include. Defaults to the notebook-level
        ``horoscopes`` list.

    Returns
    -------
    tuple
        ``(max_length, min_length, average_length)``. A sentence is a
        non-blank fragment between full stops. The minimum and the average
        ignore texts without any sentence.

    Raises
    ------
    ValueError
        If none of the texts contains a sentence.
    """
    if signs is None:
        signs = horoscopes

    lengths = []
    for sign in signs:
        # Iterate over every stored text rather than a fixed number of entries.
        for entry in dic[sign]:
            # "A. B." splits into ['A', ' B', '']; blank fragments (after the
            # final full stop, or inside "...") are not sentences.
            fragments = [part for part in entry.split('.') if part.strip()]
            lengths.append(len(fragments))

    non_empty = [count for count in lengths if count > 0]
    if not non_empty:
        raise ValueError("no non-empty texts to compute sentence statistics from")

    return max(lengths), min(non_empty), mean(non_empty)

# Sentence-count statistics per site. min_max_sentence returns its values in
# (max, min, average) order, hence the order of the names in each group.
(
    (max_ganesha_sent, min_ganesha_sent, avr_ganesha_sent),
    (max_horoscope_sent, min_horoscope_sent, avr_horoscope_sent),
    (max_astrology_sent, min_astrology_sent, avr_astrology_sent),
    (max_astrostyle_sent, min_astrostyle_sent, avr_astrostyle_sent),
) = map(min_max_sentence,
        (data_ganeshaspeaks, data_horoscope, data_astrology, data_astrostyle))

In [None]:
sites = ['ganeshaspeaks.com', 'horoscope.com', 'astrology.com', 'astrostyle.com']

# PLOT 1: min / average / max sentence counts per site

plt.rcParams['figure.figsize'] = [6, 4]


# data to plot: one row per statistic, one column per site (same order as ``sites``)
data_sentence = [[min_ganesha_sent, min_horoscope_sent, min_astrology_sent, min_astrostyle_sent],
                 [avr_ganesha_sent, avr_horoscope_sent, avr_astrology_sent, avr_astrostyle_sent],
                 [max_ganesha_sent, max_horoscope_sent, max_astrology_sent, max_astrostyle_sent]]

n_groups = len(sites)

# create plot
fig, ax = plt.subplots()
index = np.arange(n_groups)
bar_width = 0.22

# Three bars per site, shifted by one bar width each.
ax.bar(index, data_sentence[0], bar_width, alpha=0.4, color='red', label="min")
ax.bar(index + bar_width, data_sentence[1], bar_width, alpha=0.4, color='blue', label="average")
ax.bar(index + bar_width*2, data_sentence[2], bar_width, alpha=0.5, color='orange', label="max")

for s in ['top','right']:
    ax.spines[s].set_visible(False)

# The on/off flag is passed positionally: its keyword was renamed from ``b``
# to ``visible`` in matplotlib, and the old spelling no longer works.
ax.grid(True, color='grey', linestyle='-.', linewidth=0.5, alpha=0.2)

ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')

ax.set_ylabel('Sentence Counts')
ax.set_title('Statistics')
# Centre each label under the middle ("average") bar of its group.
ax.set_xticks(index + bar_width)
ax.set_xticklabels(sites)
ax.legend()

fig.tight_layout()
fig.savefig("Statistics_Sentence.jpg", dpi=200)
plt.show()

In [None]:
# PLOT 2: min / average / max word counts per site

# data to plot: one row per statistic, one column per site (same order as ``sites``)
data_words = [[min_ganesha, min_horoscope, min_astrology, min_astrostyle],
              [avr_ganesha, avr_horoscope, avr_astrology, avr_astrostyle],
              [max_ganesha, max_horoscope, max_astrology, max_astrostyle]]

n_groups = len(sites)

# create plot
fig, ax = plt.subplots()
index = np.arange(n_groups)
bar_width = 0.22

# Three bars per site, shifted by one bar width each.
ax.bar(index, data_words[0], bar_width, alpha=0.4, color='red', label="min")
ax.bar(index + bar_width, data_words[1], bar_width, alpha=0.4, color='blue', label="average")
ax.bar(index + bar_width*2, data_words[2], bar_width, alpha=0.5, color='orange', label="max")

for s in ['top','right']:
    ax.spines[s].set_visible(False)

# The on/off flag is passed positionally: its keyword was renamed from ``b``
# to ``visible`` in matplotlib, and the old spelling no longer works.
ax.grid(True, color='grey', linestyle='-.', linewidth=0.5, alpha=0.2)

ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')

ax.set_ylabel('Word Counts')
ax.set_title('Statistics')
# Centre each label under the middle ("average") bar of its group.
ax.set_xticks(index + bar_width)
ax.set_xticklabels(sites)
ax.legend()

fig.tight_layout()
fig.savefig("Statistics_Words.jpg", dpi=200)
plt.show()
    
    
