## Load the relevant pieces

In [1]:
import pandas as pd
from festivalTools import *

  from collections import Sequence


In [2]:
allText_Lemmatized = pd.read_pickle("Data/allTextSep27_lemmatized.pkl")
allText_Lemmatized.drop_duplicates(subset=['Event Name'], inplace=True, keep='last')
allText_Lemmatized.drop_duplicates(subset=['Summary'], inplace=True, keep='last')

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

## Now create some word bags

In [4]:
stopwords = nltk.corpus.stopwords.words('english')
newStopWords = ['january', 'february', 'march',
                'april', 'may', 'june', 'july',
                'aug', 'august', 'sept', 'september', '2018']
stopwords.extend(newStopWords)

In [5]:
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=10000,
                                stop_words=stopwords)
tf = tf_vectorizer.fit_transform(allText_Lemmatized['Text'])

In [6]:
tf_vectorizer.get_feature_names()

['00',
 '000',
 '00am',
 '00pm',
 '01',
 '02',
 '020',
 '023',
 '03',
 '04',
 '05',
 '06',
 '07',
 '08',
 '09',
 '10',
 '100',
 '1000',
 '100th',
 '101',
 '102',
 '104',
 '105',
 '106',
 '108',
 '109',
 '10am',
 '10k',
 '10pm',
 '10th',
 '11',
 '110',
 '111',
 '11124',
 '11201',
 '115',
 '11am',
 '11pm',
 '11th',
 '12',
 '120',
 '1200',
 '121',
 '123',
 '125',
 '12pm',
 '12th',
 '13',
 '130',
 '13th',
 '14',
 '140',
 '149',
 '14th',
 '15',
 '150',
 '1500',
 '155',
 '15pm',
 '15th',
 '16',
 '160',
 '16th',
 '17',
 '170',
 '175',
 '17th',
 '18',
 '180',
 '1800s',
 '1851',
 '189',
 '18th',
 '19',
 '190',
 '1900',
 '1914',
 '1918',
 '1920s',
 '1930s',
 '1937',
 '1940s',
 '1942',
 '1945',
 '1948',
 '1950',
 '1950s',
 '1953',
 '1954',
 '1955',
 '1956',
 '1958',
 '1959',
 '1960',
 '1960s',
 '1961',
 '1962',
 '1963',
 '1964',
 '1965',
 '1966',
 '1967',
 '1968',
 '1969',
 '1970',
 '1970s',
 '1971',
 '1972',
 '1973',
 '1974',
 '1975',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1980s',
 '1981

In [7]:
tf_vectorizer_short = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=100,
                                stop_words='english')
tf_short = tf_vectorizer_short.fit_transform(allText_Lemmatized['Text'])

In [8]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=10000,
                                   stop_words=stopwords)
tfidf = tfidf_vectorizer.fit_transform(allText_Lemmatized['Text'])

In [9]:
tfidf_vectorizer.get_feature_names()

['00',
 '000',
 '00am',
 '00pm',
 '01',
 '02',
 '020',
 '023',
 '03',
 '04',
 '05',
 '06',
 '07',
 '08',
 '09',
 '10',
 '100',
 '1000',
 '100th',
 '101',
 '102',
 '104',
 '105',
 '106',
 '108',
 '109',
 '10am',
 '10k',
 '10pm',
 '10th',
 '11',
 '110',
 '111',
 '11124',
 '11201',
 '115',
 '11am',
 '11pm',
 '11th',
 '12',
 '120',
 '1200',
 '121',
 '123',
 '125',
 '12pm',
 '12th',
 '13',
 '130',
 '13th',
 '14',
 '140',
 '149',
 '14th',
 '15',
 '150',
 '1500',
 '155',
 '15pm',
 '15th',
 '16',
 '160',
 '16th',
 '17',
 '170',
 '175',
 '17th',
 '18',
 '180',
 '1800s',
 '1851',
 '189',
 '18th',
 '19',
 '190',
 '1900',
 '1914',
 '1918',
 '1920s',
 '1930s',
 '1937',
 '1940s',
 '1942',
 '1945',
 '1948',
 '1950',
 '1950s',
 '1953',
 '1954',
 '1955',
 '1956',
 '1958',
 '1959',
 '1960',
 '1960s',
 '1961',
 '1962',
 '1963',
 '1964',
 '1965',
 '1966',
 '1967',
 '1968',
 '1969',
 '1970',
 '1970s',
 '1971',
 '1972',
 '1973',
 '1974',
 '1975',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1980s',
 '1981

## Test some training options

In [21]:
lda = LatentDirichletAllocation(n_components=25, max_iter=200,
                                learning_method='batch')
clusters = lda.fit_transform(tf)


In [22]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [23]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, 10)



Topics in LDA model:
Topic #0: harvest beer vega say year la route brewing 91 brewery
Topic #1: new york comedy show park music brooklyn one time year
Topic #2: art seattle new japanese work artist oct book go film
Topic #3: comedy show laugh perform michael comedian also late comic one
Topic #4: com 30 10 free day ticket event music 11 show
Topic #5: global citizen mandela world year poverty nelson take end new
Topic #6: show tribeca new season tv say premiere episode like year
Topic #7: book say science author child research people work social parent
Topic #8: download band ticket say year metal tour best rock christmas
Topic #9: year one seafood say take get race day new last
Topic #10: music cole event year say world album new artist north
Topic #11: film award director new year feature work one documentary producer
Topic #12: art community city center include program cultural chicago new support
Topic #13: film toronto international premiere ap star photo director london file
Top

#### Now save all relevant clustering and event information into a clustering DataFrame

In [24]:
#first load some relevant dfs
cities = pd.read_excel('Data/worldcities.xlsx')
cities1M = cities.query('population > 1000000')
cities50K = cities.query('population > 50000')

In [25]:
def getCityLatLng(cityName):
    cities = cities50K[cities50K['city'] == cityName]

    if cities.shape[0]>0: #we found cities
        cities = cities.sort_values(by=['population'])
        coords = cities.iloc[-1][['lat', 'lng']]
        return coords.values
    else:
        return [-1, -1]


In [26]:
#finally, loop to aggregate data and create DF
listOfDicts = []
for index, (event, city, summary) in enumerate(zip(allText_Lemmatized['Event Name'], 
                                                   allText_Lemmatized['City'], allText_Lemmatized['Summary'])):
    if type(city) == str:
        coords = getCityLatLng(city)
    else:
        coords = [-1, -1]
    clusterScores = clusters[index]
    eventDict = {"Event Name": event, "City": city, "Summary": summary, "Latitude": coords[0], 
                 "Longitude": coords[1], "Cluster Scores": clusterScores}
    listOfDicts.append(eventDict)
    
clusterDF = pd.DataFrame(listOfDicts)

In [27]:
clusterDF

Unnamed: 0,City,Cluster Scores,Event Name,Latitude,Longitude,Summary
0,Bethlehem,"[7.092198582543587e-05, 7.092198582498029e-05,...",Peanut Butter Festival,-28.2196,28.3000,Peanut Butter and Jelly sandwiches were though...
1,Durban,"[0.007889808024866163, 1.968310206090022e-06, ...",Lesbian Film Festival,-29.8650,30.9800,A lesbian film festival is returning to Paduca...
2,Springs,"[2.6525198942467058e-05, 2.6525198942190437e-0...",Blue Springs Fall Fun Festival,-26.2696,28.4300,Cronin’s Bar & Grill in Lenexa — basic burger....
3,Bintulu,"[7.920792080624784e-05, 7.9207920804607e-05, 7...",Borneo International Kite Festival,3.1664,113.0360,Click to share on WhatsApp (Opens in new windo...
4,Los Angeles,"[2.740364194871672e-07, 2.740364194907998e-07,...",LA Film Festival,34.1140,-118.4068,Film Independent moved the festival from downt...
5,Kazan,"[0.00032000000004251913, 0.0003200000000256569...",Kazan International Festival,55.7499,49.1263,"International-award winning film Halda, direct..."
6,Bangkok,"[0.008213891676664047, 0.07217341252831533, 0....",Irish Film Festival,13.7500,100.5166,Actor Colin Farrell has become a patron for th...
7,San Francisco,"[3.755868545322674e-05, 0.18880201724027, 0.24...",Annual Iranian Film Festival,37.7561,-122.4429,Although they're best known for big-picture fe...
8,Sydney,"[2.651816494786987e-06, 2.6518164948073017e-06...",Palestinian Film Festival,-33.9200,151.1852,EVEREST CARNIVAL The Everest Carnival is comin...
9,Melbourne,"[0.012150609853995843, 0.010077465201042718, 3...",Melbourne Fringe Festival,-37.8200,144.9750,From the Spring Racing Carnival to Melbourne F...


### Add dates to DF

In [28]:
events = pd.read_pickle("Data/allEventsSep27.pkl")
prevEvents1 = pd.read_pickle("Data/allEvents.pkl")
prevEvents2 = pd.read_pickle("Data/allEventsSep23.pkl")
finalEvents = pd.concat([events, prevEvents1, prevEvents2])

dates = []
for index in range(len(clusterDF)):
    eventName = clusterDF.iloc[index]['Event Name']
    try:
        day = finalEvents[finalEvents['Event Name'] == eventName].iloc[0]['Day']
        month = finalEvents[finalEvents['Event Name'] == eventName].iloc[0]['Month']
    except:
        dates.append("Unkown")
        continue
        
    if day < 10:
        date = "Early " + month 
    elif day < 20:
        date = "Mid " + month
    else:
        date = "Late " + month
    dates.append(date)
clusterDF['Date'] = dates


In [29]:
clusterDF

Unnamed: 0,City,Cluster Scores,Event Name,Latitude,Longitude,Summary,Date
0,Bethlehem,"[7.092198582543587e-05, 7.092198582498029e-05,...",Peanut Butter Festival,-28.2196,28.3000,Peanut Butter and Jelly sandwiches were though...,Mid September
1,Durban,"[0.007889808024866163, 1.968310206090022e-06, ...",Lesbian Film Festival,-29.8650,30.9800,A lesbian film festival is returning to Paduca...,Late September
2,Springs,"[2.6525198942467058e-05, 2.6525198942190437e-0...",Blue Springs Fall Fun Festival,-26.2696,28.4300,Cronin’s Bar & Grill in Lenexa — basic burger....,Mid September
3,Bintulu,"[7.920792080624784e-05, 7.9207920804607e-05, 7...",Borneo International Kite Festival,3.1664,113.0360,Click to share on WhatsApp (Opens in new windo...,Late September
4,Los Angeles,"[2.740364194871672e-07, 2.740364194907998e-07,...",LA Film Festival,34.1140,-118.4068,Film Independent moved the festival from downt...,Late September
5,Kazan,"[0.00032000000004251913, 0.0003200000000256569...",Kazan International Festival,55.7499,49.1263,"International-award winning film Halda, direct...",Mid September
6,Bangkok,"[0.008213891676664047, 0.07217341252831533, 0....",Irish Film Festival,13.7500,100.5166,Actor Colin Farrell has become a patron for th...,Late September
7,San Francisco,"[3.755868545322674e-05, 0.18880201724027, 0.24...",Annual Iranian Film Festival,37.7561,-122.4429,Although they're best known for big-picture fe...,Late September
8,Sydney,"[2.651816494786987e-06, 2.6518164948073017e-06...",Palestinian Film Festival,-33.9200,151.1852,EVEREST CARNIVAL The Everest Carnival is comin...,Late September
9,Melbourne,"[0.012150609853995843, 0.010077465201042718, 3...",Melbourne Fringe Festival,-37.8200,144.9750,From the Spring Racing Carnival to Melbourne F...,Late September


In [30]:
print(clusterDF[clusterDF['Event Name'] == 'Cinemagic Film and Television Festival']['Cluster Scores'].values)

[array([1.39372822e-04, 1.39372822e-04, 1.39372822e-04, 1.85980925e-02,
       1.39372822e-04, 1.39372822e-04, 1.39372822e-04, 3.85336840e-02,
       1.39372822e-04, 1.01839286e-01, 1.39372822e-04, 1.39372822e-04,
       1.39372822e-04, 3.64251127e-02, 1.39372822e-04, 1.39372822e-04,
       1.39372822e-04, 1.39372822e-04, 1.39372822e-04, 8.01816368e-01,
       1.39372822e-04, 1.39372822e-04, 1.39372822e-04, 1.39372822e-04,
       1.39372822e-04])]


### Save the model for use later

In [None]:
pickle.dump(tf_vectorizer, open('/Users/orimichaelweiner/Documents/GitHub/GlobalFestivalFinder/Data/tf_vectorizer_final.pkl', 'wb'))
pickle.dump(lda, open('/Users/orimichaelweiner/Documents/GitHub/GlobalFestivalFinder/Data/lda_final.pkl', 'wb'))
pickle.dump(clusters, open('/Users/orimichaelweiner/Documents/GitHub/GlobalFestivalFinder/Data/clusters_final.pkl', 'wb'))
clusterDF.to_pickle("/Users/orimichaelweiner/Documents/GitHub/GlobalFestivalFinder/Data/eventScores_final.pkl")
