# Purpose
Based on the latest Twitter trends, compose a haiku that follows the standard rules (three lines of 5, 7 and 5 syllables).  Then post that haiku as a tweet.

# 0.0 Imports

In [1]:
#system
import os
import numpy as np
from datetime import datetime, timedelta

#spark
from pyspark import SparkContext, RDD
from pyspark.sql import SparkSession, functions as F, Window
from pyspark.sql.types import FloatType, IntegerType
from operator import add

#twitter
import tweepy as tw

#npl
import nltk
from nltk.corpus import stopwords, cmudict
import re

# 1.0 Setup

### 1.1 Twitter

Setup Twitter Keys 

In [2]:
# Read the four Twitter credentials from environment variables (nothing is hard-coded).
# os.environ[...] raises KeyError as soon as one of them is missing.
consumer_key, consumer_secret_key, access_token, access_token_secret = (
    os.environ[variableName]
    for variableName in (
        'twitter_consumer_key',
        'twitter_consumer_secret_key',
        'twitter_access_token',
        'twitter_access_token_secret',
    )
)

Authorize

In [3]:
#create the OAuth handler from the application (consumer) credentials
auth = tw.OAuthHandler(
    consumer_key=consumer_key,
    consumer_secret=consumer_secret_key,
)

#attach the account-level access token so requests are made on behalf of the account
auth.set_access_token(key=access_token, secret=access_token_secret)

Activate twitterAPI

In [4]:
#make the API client from the authorized handler; every later Twitter call goes through it.
#wait_on_rate_limit=True: presumably tweepy waits out a rate limit instead of failing -- confirm for the installed version
twitterAPI = tw.API(auth,wait_on_rate_limit=True)

### 1.2 Pyspark

Activate Pyspark

In [5]:
#Spark context running on the local machine, registered under the application name "nugget"
sc = SparkContext(master="local", appName="nugget")

Activate builder

In [6]:
#SparkSession used later to build DataFrames (scCreate.createDataFrame);
#getOrCreate() returns an already running session or creates a new one
scCreate = SparkSession.builder.getOrCreate()

# 2.0 Isolate Global Trend 
The tweet has to be based on an American trend in order for the tweet to gain traction. <br>
I am going to focus on America initially, because I speak English and that will make things more tractable.

Identify what is the number one trend in America

In [7]:
#WOEID (Where On Earth ID) passed to the trends endpoint: 23424977 is the United States
usaWoeid = 23424977

#indicate the trend of interest, ranked from top (0) to bottom
trendRank = 0

#fetch the trend listing for that place and pick out the name of the chosen trend
placeTrends = twitterAPI.trends_place(usaWoeid)
rankedTrends = placeTrends[0]['trends']
trend = rankedTrends[trendRank]['name']

#display the trend for the record
trend

'Melo'

Get the text of tweets related to the trend (retweets are eliminated because they add no additional knowledge) (last 24 hours)

In [8]:
#cursor over a search for tweets that mention the trend
trendCursor = tw.Cursor(
    #endpoint the cursor pages through
    twitterAPI.search,

    #search the trend, excluding retweets
    q=trend + " -filter:retweets",

    #english tweets only
    lang="en",

    #start of range of time (left disabled: only the most recent tweets are wanted)
    #since="2020-12-03",

    #result type (recency matters more than popularity here)
    result_type="recent",

#collect at most this many tweets
).items(1000)

Extract the text of the tweets themselves

In [9]:
#walk the cursor once and keep only the text of every tweet it yields
trendText = list(map(lambda tweet: tweet.text, trendCursor))

Head of trend

In [10]:
#preview the first five tweets
trendText[:5]

['Favorite Basketball Names:\nDino Radja\nFab Melo\nFrancisco Elson \nChauncey Billups\nGod Shammgod\nDionte Christmas\nSpud… https://t.co/OIobVKRJY7',
 'Funny thing is that year y’all hated Kd so bad mfs was rooting for Houston and telling me if cp didn’t get hurt they knock golden state whew',
 '@realDonaldTrump #Cristmas #shopping for me\n\nhttps://t.co/qPiM6JtZFm\n\nMelo Supreme Court #JoeBobSavesChristmas Luka… https://t.co/9xWYHJR3XI',
 '@BlueMarbleProYT @TheHoopCentral Are you new to watching melo hoop? He does this all the time his team don’t care',
 'Ppl telling me harden style doesn’t win like cp3 didn’t get hurt and they was a game away from the finals 😂']

# 3.0 Get words, weights, and syllables
With Spark, get all unique words and their counts (used as weights in a bag of words) <br>
With nltk find the number of syllables

### 3.1 Get the words and the weights out of the trend text

Function to intelligently join together the text of all tweets

Parallelize

Get the count of each unique word. Filter out one-letter words (no insight).  Filter out stop words.  Put the result into a data frame

In [11]:
#Word lists used to filter the word counts. They are materialised as plain Python lists
#up front (no Spark broadcast variable is created); per the original note, going through
#the NLTK stop word object inside the Spark lambdas triggers a pickling error.

#english stop words plus the URL fragments left behind once punctuation is stripped
sw = [*stopwords.words('english'), 'http', 'https', 'co']

#obscene words list
obsceneWord = [
    'fuck',
    'fucking',
    'fucked',
    'fucks',
    'ass',
    'motherfucker',
    'shit',
]

Find the unique words and how many times each one occurs

In [12]:
#everything that must not be counted (stop words + obscene words), as one set so each
#membership test is a constant-time lookup instead of a list scan
excludedWords = frozenset(sw).union(obsceneWord)

#generate dataframe of (word, count), most frequent word first
ttpDF = scCreate.createDataFrame(
    (
        #parallelize trend text (one record per tweet)
        sc.parallelize(trendText)

        #per tweet: lower the case, replace every run of non-alphabetic characters with a
        #single space, then split on spaces. Words never span two tweets, so tokenising each
        #tweet on its own yields the same words as first gluing all tweets into one string,
        #without funnelling the whole data set through a single key
        .flatMap(lambda tweet: re.sub('[^a-z]+', " ", tweet.lower()).split(" "))

        #eliminate empty strings and words of length 1, stop words and obscene words
        .filter(lambda word: len(word) > 1 and word not in excludedWords)

        #add on counter
        .map(lambda word: (word, 1))

        #get sum of occurrences of each word
        .reduceByKey(add)

        #sort by count (descending order)
        .sortBy(lambda wordCount: wordCount[1], ascending=False)
    ),

    #add on words and counts as column names
    schema=['word', 'count'],
)

Find the syllables for words (for now only real words are included)

In [13]:
#dictionary with the syllables: the CMU pronouncing dictionary loaded through NLTK.
#As used by SyllableCount, d[word][0] is the first pronunciation of a word, a sequence of
#phoneme strings whose stress digits (0, 1, 2) are counted to get the syllables
d = cmudict.dict()

def SyllableCount(word,cmuDict = d):
    '''For a given word, find the number of syllables.

    Args:
        word (string) = word to look up; it is lower-cased before the lookup. None (for
            example a null handed over by Spark) is treated as an unknown word.
        cmuDict (dict) = pronouncing dictionary mapping a word to its list of pronunciations,
            each pronunciation being a sequence of phoneme strings. Defaults to the CMU
            dictionary loaded in d.

    Returns:
        int : number of stress digits (0, 1 or 2) in the first listed pronunciation, which is
        the syllable count in CMU notation; -1 when the word cannot be found.
    '''

    #a missing value has no pronunciation: report it as unknown instead of raising
    if word is None:
        return -1

    #clean word and look it up (single dictionary access, no separate membership test)
    pronunciations = cmuDict.get(word.lower())

    #unknown word (or an entry without any pronunciation): return -1 to indicate an error
    if not pronunciations:
        return -1

    #count the stress digits over all phonemes of the first pronunciation
    return sum(character in "012"
               for phoneme in pronunciations[0]
               for character in phoneme)
    
#wrap SyllableCount as a Spark column function that produces integers
udfSyllableCount = F.udf(f=SyllableCount, returnType=IntegerType())

Keep all the cases where the syllables can be found.  Words with a negative syllable count (-1, not found) are excluded

In [14]:
ttpDF2 = (
    ttpDF
    #attach the syllable count of every word
    .withColumn("syllable", udfSyllableCount(F.col("word")))
    #keep only words with a known syllable count (SyllableCount flags unknown words with -1)
    .where(F.col("syllable") != -1)
)

For the haiku, randomly rank the words that are going to be used.  The rank is weighted by the count (np.log)

In [15]:
#random, count-weighted score used to order the candidate words
def GetWeightRank(x):
    """Return a random score scaled by the natural log of x.

    A uniform draw from [0, 1) is multiplied by ln(x); the log dampens the advantage of
    very, very common words so they are not oversampled.

    Args:
        x: occurrence count of a word.

    Returns:
        float : the random draw multiplied by ln(x).
    """
    randomDraw = np.random.random()
    return float(randomDraw * np.log(x))

#make spark function. GetWeightRank draws a fresh random number on every call, so the UDF is
#flagged as non-deterministic: Spark treats UDFs as deterministic by default and may
#otherwise drop or repeat invocations during optimization, which would make the value that
#is sorted on differ from the value that is kept
udfGetWeightRank = F.udf(GetWeightRank,FloatType()).asNondeterministic()

Apply function to rank, sort, and index

In [16]:
#window that walks the words from the highest rank to the lowest
rankOrder = Window.orderBy(F.desc("rank"))

ttpDF2 = (
    ttpDF2

    #generate column with rank based on the weighted count
    .withColumn("rank", udfGetWeightRank(F.col("count")))

    #number the words 0, 1, 2, ... directly in rank order, rather than sorting first and
    #trusting monotonically_increasing_id() to reproduce the order of that earlier sort
    .withColumn("index", F.row_number().over(rankOrder) - 1)

    #return the rows in that order: the haiku builder tries the words front to back
    .orderBy("index")
)

# 4.0 Get the @ to send the tweet to

In [17]:
#most frequently mentioned twitter handle, as a list holding at most one (handle, count) pair
topMention = (
    #parallelize trend text (one record per tweet)
    sc.parallelize(trendText)

    #pull every handle out of each tweet: "@" followed by letters, digits or underscores.
    #Matching the handle itself (instead of keeping any space-separated token containing
    #"@") drops attached punctuation and line breaks, and ignores a bare "@" as well as an
    #"@" in the middle of a word such as an e-mail address
    .flatMap(lambda tweet: re.findall(r'(?<![A-Za-z0-9_])@[A-Za-z0-9_]+', tweet))

    #add on counter
    .map(lambda handle: (handle, 1))

    #get sum of occurrences of each handle
    .reduceByKey(add)

    #sort (descending order)
    .sortBy(lambda handleCount: handleCount[1], ascending=False)

    #only interested in the most common one
    .take(1)
)

#take(1) gives an empty list when no tweet mentions anybody; fall back to an empty address
#so the haiku can still be built, instead of failing with an IndexError
haikuAddress = topMention[0][0] if topMention else ""

haikuAddress

'@trailblazers'

# 5.0 Put it all together 

Develop Function To Make Haiku

In [18]:
def HaikuBuilder(df,trend,address,form=[5,7,5],verbose=True):
    """Build the haiku status for twitter from an ordered list of candidate words.

    Words are tried in the order df.collect() returns them. Each word is placed on the first
    line that still has room for all of its syllables; a word that fits on no line is
    skipped. Building stops as soon as every line holds exactly its target syllable count.
    The status includes the trend that seeded the tweet and the address it is sent to.

    Args:
        df (pyspark.sql.dataframe.DataFrame) = DataFrame with words from trends, in the order they should be
            tried. Only collect() is called on it; every row must provide row['word']
            (string) and row['syllable'] (int)
        trend (string) = original trend seeding the haiku
        address (string) = most common profile seen, ie the one who will appreciate the haiku the most
        form (list) = syllable target of each line, one entry per line of the poem (any number
            of lines is supported). Defaults [5,7,5]. The list is only read, never modified
        verbose (boolean) = True to print the progress after each word that is tried, False prints nothing

    Returns:
        string : the status for twitter (includes note on trend, address, and haiku)

    Raises:
        IndexError : if the words run out before every line reaches its syllable target
    """

    #one line of text per entry in form
    lineCount = len(form)

    #hold text of the haiku (every added word is prefixed with a space)
    lineDict = {i: "" for i in range(lineCount)}

    #hold syllable counts for each line
    syllableDict = {i: 0 for i in range(lineCount)}

    def isComplete():
        #True once every line holds exactly its target number of syllables
        return all(syllableDict[i] == form[i] for i in range(lineCount))

    #collect the df
    rows = df.collect()

    #iterate forward through the words until all lines are fully populated
    for indexHolder, newWord in enumerate(rows):

        #nothing left to fill: the remaining words are not needed
        if isComplete():
            break

        #try the lines from first to last
        for i in range(lineCount):

            #the word fits if it does not push the line past its target
            if newWord['syllable'] + syllableDict[i] <= form[i]:

                #add the word to the line and update the syllable count
                lineDict[i] = lineDict[i] + " " + newWord['word']
                syllableDict[i] = syllableDict[i] + newWord['syllable']

                #the word is placed, do not offer it to the later lines
                break

        #if verbose, print the progress
        if verbose:
            print(indexHolder, newWord['word'],len(rows),lineDict,syllableDict)

    #every word was tried: fail with a clear message if the poem is still incomplete
    #(IndexError is kept as the exception type raised when words run out)
    if not isComplete():
        raise IndexError(
            f"ran out of words after trying {len(rows)} candidates: "
            f"syllables per line {list(syllableDict.values())} do not match form {list(form)}"
        )

    #add on end of lines, also capital first letter
    finishedLines = []
    for i in range(lineCount):

        #drop the leading space left by the accumulation above (safe for an empty line)
        text = lineDict[i][1:]

        #every line ends with a full stop; all lines but the last also end with a line break
        suffix = "." if i == lineCount - 1 else ".\n"

        #apply upper case to first letter, and then suffix
        finishedLines.append(text[:1].upper() + text[1:] + suffix)

    #combine and return into one long string (as well as indicate trend)
    return f"Haiku for {trend} {address} :\n "+ " ".join(finishedLines)

#build the status for the current trend, addressed to the most common handle found above
haiku = HaikuBuilder(df=ttpDF2, trend=trend, address=haikuAddress)

#display the finished status
haiku

0 melo 1364 {0: ' melo', 1: '', 2: ''} {0: 2, 1: 0, 2: 0}
1 like 1364 {0: ' melo like', 1: '', 2: ''} {0: 3, 1: 0, 2: 0}
2 trailblazers 1364 {0: ' melo like', 1: ' trailblazers', 2: ''} {0: 3, 1: 3, 2: 0}
3 good 1364 {0: ' melo like good', 1: ' trailblazers', 2: ''} {0: 4, 1: 3, 2: 0}
4 go 1364 {0: ' melo like good go', 1: ' trailblazers', 2: ''} {0: 5, 1: 3, 2: 0}
5 ball 1364 {0: ' melo like good go', 1: ' trailblazers ball', 2: ''} {0: 5, 1: 4, 2: 0}
6 know 1364 {0: ' melo like good go', 1: ' trailblazers ball know', 2: ''} {0: 5, 1: 5, 2: 0}
7 bench 1364 {0: ' melo like good go', 1: ' trailblazers ball know bench', 2: ''} {0: 5, 1: 6, 2: 0}
8 think 1364 {0: ' melo like good go', 1: ' trailblazers ball know bench think', 2: ''} {0: 5, 1: 7, 2: 0}
9 league 1364 {0: ' melo like good go', 1: ' trailblazers ball know bench think', 2: ' league'} {0: 5, 1: 7, 2: 1}
10 already 1364 {0: ' melo like good go', 1: ' trailblazers ball know bench think', 2: ' league already'} {0: 5, 1: 7, 2: 4}
1

'Haiku for Melo @trailblazers :\n Melo like good go.\n Trailblazers ball know bench think.\n League already trent.'

Update Status

In [19]:
#manual switch: set to False to run the notebook without publishing the haiku
publishHaiku = True

if publishHaiku:
    #post the haiku as a status update
    twitterAPI.update_status(haiku)
    print('Haiku Uploaded to World')

Haiku Uploaded to World


Stop spark context

In [20]:
#stop the Spark context created at the top of the notebook now that all Spark work is done
sc.stop()

# End 