# Classifying Events by Type

A similar application is the Facebook Event Classifier, which attempts to categorize events (e.g. parties, concerts, games).

In [88]:
import pandas as pd;
import numpy as np;
import seaborn as sns;
import matplotlib.pyplot as plt;
import string
from functools import reduce
from tqdm import tqdm
from pylab import *;

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

import nltk, re, pprint
from nltk.corpus   import stopwords
from nltk          import word_tokenize
from nltk.tokenize import RegexpTokenizer

from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression

from wordcloud import WordCloud, STOPWORDS

from scipy import sparse

from nltk.corpus import stopwords
from time import time

eng_stopwords = set(stopwords.words("english"))

%matplotlib inline

In [79]:
events = pd.read_csv('raw_data/cleaned_up.csv')

## Feature Engineering

In [80]:
# Convert Days to Simple Binary (Lose Time of Day Information)

days = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday2', 'Monday2'];

events[days] = ((events[days] == '0') == False).astype(int);

In [81]:
# Convert Contact Email, URL to Binary

events['Contact Email']  = pd.isnull(events['Contact Email']).values.astype(int)
events['URL']            = pd.isnull(events['URL']).values.astype(int)
events['Located at Art'] = pd.isnull(events['Located at Art']).values.astype(int)

In [82]:
# Convert Location to Binary

#events['Location']       = pd.isnull(events['Location']).values.astype(int)

In [83]:
# Convert Hosted by Camp to Binary

#events['Hosted by Camp']       = pd.isnull(events['Hosted by Camp']).values.astype(int)

In [84]:
events.head()

Unnamed: 0,Description,Title,Hosted by Camp,Location,Type,Contact Email,URL,Located at Art,Year,Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday2,Monday2
0,“Check Your Baggage” on PLEASair’s giant mocke...,Check Your Baggage at PLEASair,Pleasair,8:30 & D.N.A.,Other,1,1,1,2009,0,1,1,1,1,1,1,0,0
1,Get your picture taken as Fred or Wilma Flints...,Prehistoric Photo-OP,Camp Bedrock,,Kid-friendly,0,1,1,2009,0,1,1,1,1,1,1,0,0
2,"Please come see us at The Lost Boys camp, wher...",Citezen Art Project,The Lost Boys,,Kid-friendly,0,1,1,2009,0,1,1,1,1,1,1,1,0
3,Help us create our Garden of Hedon and evolve ...,Garden of Hedon,Celestial Bodies,,Gathering/Party,1,0,1,2009,0,1,1,1,1,1,1,1,0
4,Get a secret mission! Evolve and win! Help oth...,The Evolution Game,,Center Camp,Game,1,1,1,2009,0,1,1,1,1,1,1,1,0


In [104]:
#######################
# FEATURE ENGINEERING #
#######################

def engineer_feature(series, func, normalize=True):
    feature = series.apply(func)
       
    if normalize:
        feature = pd.Series(z_normalize(feature.values.reshape(-1,1)).reshape(-1,))
    feature.name = func.__name__ 
    return feature

def engineer_features(series, funclist, normalize=True):
    features = pd.DataFrame()
    for func in funclist:
        print(str(func))
        feature = engineer_feature(series, func, normalize)
        features[feature.name] = feature
    return features

##################
### Normalizer ###
##################

scaler = StandardScaler()
def z_normalize(data):
    scaler.fit(data)
    return scaler.transform(data)
    
def count_words(x, words):
    count = 0
    for word in words:
        count += len(re.findall(word, str(x)))
    return count
    
################
### Features ###
################

def uppercase_freq(x):
    return len(re.findall(r'[A-Z]', x))/len(x)

def sentence_count(x):
    return len(re.findall("\n", str(x)))+1

def word_count(x):
    return len(str(x).split())

def unique_word_count(x):
    return len(set(str(x).split()))

def count_letters(x):
    return len(str(x))

def count_punctuations(x):
    return len([c for c in str(x) if c in string.punctuation])

def count_words_title(x):
    return len([w for w in str(x).split() if w.istitle()])

def count_stopwords(x):
    return len([w for w in str(x).lower().split() if w in eng_stopwords])

def mean_word_len(x):
    words = [len(w) for w in str(x).split()]

    if len(words) == 0:
        return 0
    else:
        return np.mean(words)

##################################
### Category-Specific Features ###
##################################

def count_kids_words(x):
    return count_words(x, ['kid', 'scout'])

def count_party_words(x):
    return count_words(x, ['party', 'dance', 'music', 'celebrate'])

def count_adult_words(x):
    return count_words(x, ['adult', 'massage', 'sensual', 'erotic', 'sex', 'bdsm', 'pleasure'])

def count_game_words(x):
    return count_words(x, ['game', 'play', 'prize', 'race', 'tournament'])

def count_ritual_words(x):
    return count_words(x, ['ceremony', 'ritual', 'temple', 'sacred'])
    
def count_care_words(x):
    return count_words(x, ['heal', 'massage', 'help', 'body'])

def count_class_words(x):
    return count_words(x, ['learn', 'workshop', 'practice', 'class'])

def count_performance_words(x):
    return count_words(x, ['perform', 'stage', 'live', 'show', 'audience'])

def count_food_words(x):
    return count_words(x, ['coffee', 'pickle', 'food', 'serv', 'fresh', 'bacon', 'cheese', 'delicious', 'pancake', 'tast'])

def count_fire_words(x):
    return count_words(x, ['fire', 'burn', 'spin', 'fuel', 'flame', 'light', 'flow'])

def count_parade_words(x):
    return count_words(x, ['parade', 'march', 'tour'])

############################
### Sentimental Features ###
############################

sia = SIA();

def sentiment_compound(x):
    polarity = sia.polarity_scores(x)
    return polarity['compound']       

def sentiment_negative(x):
    polarity = sia.polarity_scores(x)
    return polarity['neg']       

def sentiment_neutral(x):
    polarity = sia.polarity_scores(x)
    return polarity['neu']       

def sentiment_positive(x):
    polarity = sia.polarity_scores(x)
    return polarity['pos']       
        
########################
### Derived Features ###
########################

def unique_word_ratio(x):
    wc = word_count(x)
    
    if wc == 0:
        return 0
    else:
        return unique_word_count(x)/wc

def percent_ratio(x):
    wc = word_count(x)
    
    if wc == 0:
        return 0
    else:
        return count_punctuations(x)/wc

def words_per_sentence(x):
    sc = sentence_count(x)
    
    if sc == 0:
        return 0
    else:
        return word_count(x)/sc

In [105]:
feature_functions = [uppercase_freq, sentence_count, word_count, unique_word_count, count_letters, count_punctuations, 
                     count_words_title, count_stopwords, mean_word_len, count_kids_words, count_party_words, 
                     count_adult_words, count_game_words, count_ritual_words, count_care_words,
                     count_class_words, count_performance_words, count_food_words, count_fire_words, count_parade_words,
                     unique_word_ratio, percent_ratio, words_per_sentence,
                     sentiment_compound, sentiment_negative, sentiment_positive, sentiment_neutral]

features = [f.__name__ for f in feature_functions]

F_train = engineer_features(events['Description'].fillna(''), feature_functions, normalize=False)

X_handFeatures = F_train[features].as_matrix()

<function uppercase_freq at 0x000001F3186570D0>
<function sentence_count at 0x000001F3186579D8>
<function word_count at 0x000001F318657BF8>
<function unique_word_count at 0x000001F318657B70>
<function count_letters at 0x000001F318657EA0>
<function count_punctuations at 0x000001F318657D08>
<function count_words_title at 0x000001F318657D90>
<function count_stopwords at 0x000001F318657F28>
<function mean_word_len at 0x000001F318657C80>
<function count_kids_words at 0x000001F318657510>
<function count_party_words at 0x000001F318657158>
<function count_adult_words at 0x000001F3186572F0>
<function count_game_words at 0x000001F318657488>
<function count_ritual_words at 0x000001F318657E18>
<function count_care_words at 0x000001F3186B9EA0>
<function count_class_words at 0x000001F3186B9510>
<function count_performance_words at 0x000001F3186B9F28>
<function count_food_words at 0x000001F3186B9D90>
<function count_fire_words at 0x000001F3186B9048>
<function count_parade_words at 0x000001F3186B9950>

In [106]:
# run the CountVectorizer on the event descriptions and event titles

count_vect_desc  = CountVectorizer(stop_words='english', min_df=25,  ngram_range=(1, 3), analyzer='word')
count_vect_title = CountVectorizer(stop_words='english', min_df=25,  ngram_range=(1, 3), analyzer='word')
count_vect_camp  = CountVectorizer(stop_words='english', min_df=100, ngram_range=(3, 5), analyzer='char')
count_vect_loca  = CountVectorizer(stop_words='english', min_df=50,  ngram_range=(3, 5), analyzer='char')

X        = count_vect_desc.fit_transform(events['Description'].values);
X_titles = count_vect_title.fit_transform(events['Title'].values);
X_camp   = count_vect_camp.fit_transform([str(event) for event in events['Hosted by Camp'].fillna('').values]);
X_loca   = count_vect_camp.fit_transform([str(event) for event in events['Location'].fillna('').values]);

iX_desc  = X.shape[1]
iX_title = X_titles.shape[1]
iX_camp  = X_camp.shape[1]
iX_loca  = X_loca.shape[1]

print(X.shape)
print(X_titles.shape)
print(X_camp.shape)
print(X_loca.shape)

(20165, 3975)
(20165, 511)
(20165, 61)
(20165, 478)


In [107]:
basic_features = ['Contact Email', 'URL', 'Located at Art']

print(shape(X));
print(shape(X_titles));
print(shape(X_camp));
print(shape(events[days].values));
print(shape(events[basic_features].values));

desc_length = shape(X)[1]

print(desc_length)

(20165, 3975)
(20165, 511)
(20165, 61)
(20165, 9)
(20165, 3)
3975


In [108]:
enc = OneHotEncoder()
le  = LabelEncoder()

camps = events['Hosted by Camp'].fillna('')

events['Hosted by Camp'] = le.fit_transform(camps)

camps = enc.fit_transform(events[['Hosted by Camp']])

camps.toarray().shape

(20165, 2137)

In [109]:
#X = sparse.csr_matrix(hstack((X.toarray(), X_titles.toarray(), X_camp.toarray(), X_handFeatures, 
#                              events[days].values, events[basic_features].values)))

X = sparse.csr_matrix(hstack((X.toarray(), X_titles.toarray(), X_handFeatures, camps.toarray(), 
                              events[days].values, events[basic_features].values)))

print(shape(X))

(20165, 6662)


In [110]:
le = LabelEncoder()

le.fit(events['Type'].values)

labels = le.classes_;

print(le.classes_)

y = le.transform(events['Type'].values) 

['Adult-oriented' 'Care/Support' 'Class/Workshop' 'Fire' 'Food' 'Game'
 'Gathering/Party' 'Kid-friendly' 'Other' 'Parade' 'Performance'
 'Ritual/Ceremony']


In [111]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, stratify=y)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(10082, 6662)
(10083, 6662)
(10082,)
(10083,)


In [None]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(X_train, y_train)

pred = clf.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error, mean_squared_error;

#print("Confusion Matrix")
#print(confusion_matrix(y_test, pred));
#print("\n")

print("Classification Report")
print(classification_report(y_test, pred, target_names=labels));
print("\n")

In [113]:
from sklearn.naive_bayes import MultinomialNB

clf = LogisticRegression().fit(X_train, y_train)

pred = clf.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error, mean_squared_error;

#print("Confusion Matrix")
#print(confusion_matrix(y_test, pred));
#print("\n")

print("Classification Report")
print(classification_report(y_test, pred, target_names=labels));
print("\n")

Classification Report
                 precision    recall  f1-score   support

 Adult-oriented       0.73      0.49      0.59       562
   Care/Support       0.58      0.42      0.49       372
 Class/Workshop       0.75      0.86      0.80      3210
           Fire       0.65      0.51      0.57        81
           Food       0.63      0.57      0.60       419
           Game       0.67      0.55      0.60       474
Gathering/Party       0.66      0.80      0.73      2641
   Kid-friendly       0.51      0.21      0.30       181
          Other       0.28      0.20      0.24       711
         Parade       0.65      0.46      0.54       140
    Performance       0.64      0.56      0.59       779
Ritual/Ceremony       0.53      0.36      0.43       513

    avg / total       0.65      0.67      0.65     10083





In [17]:
import lightgbm as lgb;

clf = lgb.LGBMClassifier().fit(X_train.toarray(), y_train)

pred = clf.predict(X_test.toarray())

#print("Confusion Matrix")
#print(confusion_matrix(y_test, pred));
#print("\n")

print("Classification Report")
print(classification_report(y_test, pred, target_names=labels));
print("\n")

  if diff:


Classification Report
                 precision    recall  f1-score   support

 Adult-oriented       0.65      0.41      0.50       562
   Care/Support       0.48      0.20      0.28       372
 Class/Workshop       0.70      0.85      0.77      3210
           Fire       0.24      0.10      0.14        81
           Food       0.56      0.35      0.43       419
           Game       0.56      0.41      0.47       474
Gathering/Party       0.55      0.82      0.66      2641
   Kid-friendly       0.28      0.10      0.15       181
          Other       0.35      0.10      0.15       711
         Parade       0.68      0.39      0.49       140
    Performance       0.63      0.45      0.53       779
Ritual/Ceremony       0.52      0.26      0.34       513

    avg / total       0.59      0.61      0.58     10083



