In [None]:
# Read in the corpus, including punctuation!
import glob
import os
import pickle
import random
import re
from collections import Counter
from statistics import mean

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from textblob import TextBlob


# Read ALL data from folders

# glob order depends on the OS/filesystem; sort it so the corpus is assembled
# in the same order on every run.
path_list = sorted(glob.glob("Data/*"))

# Keep only the sub-folder names. os.path.basename splits on the platform's
# own separator, whereas stripping the literal 'Data\\' prefix only worked on
# Windows. Plain files directly under Data/ are skipped because they cannot
# contain the per-site folders opened below.
names = [os.path.basename(path) for path in path_list if os.path.isdir(path)]

data_ganeshaspeaks = {}
data_horoscope = {}
data_astrology = {}
data_astrostyle = {}

horoscopes = ['aries', 'taurus', 'gemini', 'cancer', 'leo', 'virgo', 'libra', 'scorpio', 'sagittarius', 'capricorn', 'aquarius', 'pisces']

# site folder -> (dict collecting its texts, index of the text inside the
# unpickled object). ganeshaspeaks.com keeps the text at index 1, the other
# sites at index 0.
sources = {
    "ganeshaspeaks.com": (data_ganeshaspeaks, 1),
    "horoscope.com": (data_horoscope, 0),
    "astrology.com": (data_astrology, 0),
    "astrostyle.com": (data_astrostyle, 0),
}

# After this loop every data_* dict maps a sign to a list holding one entry
# per folder in `names`.
for c in horoscopes:
    for site, (target, text_index) in sources.items():
        target[c] = []
        for name in names:
            file_path = os.path.join("Data", name, site, c + ".txt")
            # NOTE(review): pickle.load can execute arbitrary code; only load
            # files that were produced by a trusted process.
            with open(file_path, "rb") as file:
                target[c].append(pickle.load(file)[text_index])


In [None]:
# Convert every data_* dict to the form {horoscope sign: one string containing all of its texts}

def listToString(s):
    """Concatenate every element of ``s`` into a single string.

    Each element is first passed through ``''.join``, so an element may be a
    plain string or an iterable of strings. No separator is inserted between
    elements. An empty ``s`` yields ``""``.
    """
    return "".join("".join(piece) for piece in s)

# Replace each sign's list of texts by one concatenated string, source by source.
for h in horoscopes:
    for source in (data_ganeshaspeaks, data_horoscope, data_astrology, data_astrostyle):
        source[h] = listToString(source[h])


In [None]:
# creates a table with frequencies
def generateTable(data):
    """Count how often each word is directly followed by each other word.

    ``data`` is split on whitespace. The result is a nested dict
    ``{word: {next_word: count}}``; a word only gets an entry if at least one
    word follows it somewhere in ``data``.
    """
    words = data.split()
    table = {}
    # Pair every word with its immediate successor.
    for current, following in zip(words, words[1:]):
        followers = table.setdefault(current, {})
        followers[following] = followers.get(following, 0) + 1
    return table


In [None]:
# converts frequencies to probabilities
def convertFrequencies(T):
    """Normalise the follower counts of every word into probabilities.

    ``T`` is a nested dict ``{word: {next_word: count}}``. Each inner dict is
    rewritten in place so that its values are divided by their sum. The same
    ``T`` object is returned.
    """
    for followers in T.values():
        total = float(sum(followers.values()))
        # Snapshot the items so the values can be overwritten while looping.
        for word, count in list(followers.items()):
            followers[word] = count / total
    return T


In [None]:
# creates the model by using the generateTable() and convertFrequencies() functions
def MarkovChain(text):
    """Build the word-transition model for ``text``.

    The word-pair counts from generateTable() are passed through
    convertFrequencies(), and that result is returned.
    """
    return convertFrequencies(generateTable(text))

In [None]:
def sample_next(word, model):
    """Draw a random successor of ``word`` according to ``model``.

    ``model[word]`` maps candidate words to the probabilities handed to
    ``np.random.choice``. When ``word`` has no entry in ``model`` the string
    ``" "`` is returned instead.
    """
    followers = model.get(word)
    if followers is None:
        return " "

    candidates = list(followers)
    weights = list(followers.values())
    return np.random.choice(candidates, p=weights)

In [None]:
# generates a text of up to maxLen words
def generateText(model, maxLen=15):
    """Generate a random text of at most ``maxLen`` words from ``model``.

    The first word is chosen uniformly from the keys of ``model`` and
    capitalised. Every later word is drawn with sample_next() from the
    followers of the previous word.

    Generation stops early when the current word has no followers in
    ``model``. sample_next() would only return its ``" "`` placeholder from
    that point on, so continuing would just pad the text with blanks.

    Parameters:
        model: nested dict ``{word: {next_word: probability}}``.
        maxLen: upper bound on the number of words in the result.

    Returns:
        The generated words joined by single spaces.

    Raises:
        IndexError: if ``model`` is empty (raised by ``random.choice``).
    """
    ctx = random.choice(list(model.keys()))
    words = [ctx.capitalize()]

    for _ in range(maxLen - 1):
        # Dead end: nothing follows `ctx`, so there is nothing left to sample.
        if not model.get(ctx):
            break
        ctx = sample_next(ctx, model)
        words.append(str(ctx))

    return ' '.join(words)

In [None]:
texts = {}
models = {}
stopWords = ['Ganesha', 'Jan', 'Dec']

# (old, new) pairs applied in order: line breaks and tabs become blanks,
# while quotes, brackets, dashes and colons are removed.
replacements = [
    ('\n', ' '),
    ('\t', ' '),
    ('“', ''),
    ('”', ''),
    ('(', ''),
    (')', ''),
    ('-', ''),
    ('—', ''),
    (':', ''),
]

for h in horoscopes:
    cleaned = data_ganeshaspeaks[h] + data_horoscope[h] + data_astrology[h] + data_astrostyle[h]
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)

    # Drop every run of digits.
    cleaned = re.sub(r'[0-9]+', '', cleaned)

    # Surround sentence punctuation with blanks so split() returns each mark
    # as a token of its own.
    for spaced in ['.',',','!','?']:
        cleaned = cleaned.replace(spaced, ' {0} '.format(spaced))

    # Remove the stop words and collapse all whitespace to single blanks.
    texts[h] = ' '.join(word for word in cleaned.split() if word not in stopWords)

    models[h] = MarkovChain(texts[h])

In [None]:
#Counter(texts['taurus'].split()).most_common()

In [None]:
# generates multiple texts for each of the horoscopes
def mult_generation(models, number):
    """Generate ``number`` texts for every sign in ``horoscopes``.

    Parameters:
        models: dict mapping each sign to the model passed to generateText().
        number: how many texts to generate per sign.

    Returns:
        dict mapping each sign to a list of ``number`` generated texts.
    """
    final_texts = {}

    for h in horoscopes:
        # One length between 50 and 150 (inclusive) is drawn per sign, so all
        # texts of the same sign share this maxLen.
        maxLen = random.randint(50, 150)
        # Use the `number` argument, not the notebook-level `testNumber`.
        final_texts[h] = [generateText(models[h], maxLen) for _ in range(number)]

    return final_texts

In [None]:
# Returns sentiment score list
def sentiment_analysis(dic, testNumber):
    """Compute the TextBlob polarity of the generated texts of every sign.

    ``dic`` maps each sign in ``horoscopes`` to a sequence of texts; the first
    ``testNumber`` texts of each sign are scored. The result is a list with
    one inner list of polarities per sign, in the order of ``horoscopes``.
    """
    return [
        [TextBlob(dic[h][j]).sentiment.polarity for j in range(0, testNumber)]
        for h in horoscopes
    ]

In [None]:
# generate testNumber interpretations for every horoscope and calculate the sentiment score of each of them
# Seed both generators used during generation (random.choice / random.randint
# and np.random.choice) so that Restart & Run All reproduces the same texts
# and therefore the same sentiment scores.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Number of texts generated (and scored) per sign.
testNumber = 250
final_texts_list = mult_generation(models, testNumber)
sentimentScores = sentiment_analysis(final_texts_list, testNumber)

In [None]:
# calculate average sentiment scores of every horoscope
# One mean per inner score list; iterating the list itself avoids hard-coding
# the number of signs.
avr_sentimentScores = [mean(scores) for scores in sentimentScores]
avr_sentimentScores

In [None]:
# plot of average scores
# Draw the average score of every sign as a line, one x position per sign.
fig, ax = plt.subplots()
ax.plot(avr_sentimentScores, alpha=0.9, color='orange')

# Axis labels and title
ax.set_xlabel('Horoscopes')
ax.set_ylabel('Sentiment Score')
ax.set_title('Sentiment Scores of Generated Interpretations')

# Take the tick labels from `horoscopes`, the list the scores were computed
# over, instead of a second hard-coded copy of the sign names.
ax.set_xticks(np.arange(len(horoscopes)))
ax.set_xticklabels(horoscopes, rotation=45)

plt.show()

In [None]:
# Show one generated interpretation for Aquarius (maxLen=50).
sample_text = generateText(models['aquarius'], maxLen=50)
print(sample_text)