In [1]:
import pandas as pd
import csv
import numpy as np
import re
import string as string_library
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score, cross_val_predict
import sys
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold
from nltk.stem.porter import PorterStemmer
import math
import warnings
warnings.filterwarnings('ignore')
sys.setrecursionlimit(100000)

# Build the training set and then Classify

In [2]:
videos = []
with open("sentence_clusters_fastcluster_nonbinary_all.tsv", 'rt') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    for row in reader:
        videos.append([row[0], row[1], row[2], row[3], row[4]])
        
videos_en_new = pd.DataFrame(videos, columns=['id', 'description', 'channelTitle', 'sentence', 'clusterNum'])
del videos

In [3]:
num_coupon_codes = 0
for i, row in videos_en_new.iterrows():
    if row['clusterNum'] == str(68):
        num_coupon_codes += 1
        videos_en_new.at[i, 'isCoupon'] = 1
    else:
        videos_en_new.at[i, 'isCoupon'] = 0

In [4]:
print videos_en_new.shape

(77706, 6)


In [5]:
print num_coupon_codes

177


In [6]:
# There are few coupon codes compared to the size of the dataset
# Need to shrink the negative sample size in our training dataset
sample_set = videos_en_new.sample(n=1000)

In [7]:
sample_set.shape

(1000, 6)

In [8]:
positives = []
for i, row in videos_en_new.iterrows():
    if row['isCoupon'] == 1:
        positives.append([row.id, row.description, row.channelTitle, row.sentence, row.clusterNum, row.isCoupon])
        
positiveDF = pd.DataFrame(positives, columns=['id', 'description', 'channelTitle', 'sentence', 'clusterNum', 'isCoupon'])
del positives
print(positiveDF.shape)

(177, 6)


In [9]:
training_set = sample_set.append(positiveDF, ignore_index=True)
training_set.shape

(1177, 6)

In [10]:
y = training_set.isCoupon

In [11]:
stemmer = PorterStemmer()

def tokenize(line):
    if (line is None):
        line = ''
    printable = set(string_library.printable)
    line = ''.join(filter(lambda x: x in printable, line)) 
    stopwords = nltk.corpus.stopwords.words('english')

    tokenizer = nltk.RegexpTokenizer(r'[a-zA-Z]*\'[a-zA-Z]*|\w+')
    
    tokens = []
    
    line = re.sub(r'(http[s]?://|www.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]*|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))*', '', line).lower()
    tokens.extend(tokenizer.tokenize(line))
    
    tokens_ = [f.strip(string_library.punctuation) for f in tokens]
    tokens_ = [f for f in tokens_ if f != '' and f not in stopwords and len(f) != 1]
    tokens_ = [f for f in tokens_ if not (f.isdigit() or f[0] == '-' and f[1:].isdigit())]
    tokens_ = [stemmer.stem(f) for f in tokens_]

    return tokens_


In [12]:
countVec = CountVectorizer(tokenizer=tokenize, min_df=4).fit(training_set['sentence'])

In [13]:
len(countVec.get_feature_names())

348

In [14]:
classifier = SVC(kernel="linear", C=0.05)

In [15]:
kf = KFold(n_splits=3, shuffle=True)
kf.get_n_splits(training_set)
round_num = 0
for train_index, test_index in kf.split(training_set):
    round_num += 1
    print 'Round', round_num
    X_train, X_test = training_set.loc[train_index], training_set.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]
    lineVec = countVec.transform(X_train['sentence'])
    testVec = countVec.transform(X_test['sentence'])
    classifier.fit(lineVec, X_train['isCoupon'])
    predictions = classifier.predict(testVec)
    print 'Cross-validated scores:', cross_val_score(classifier, lineVec, y_train, cv=5)
    print 'r2 score: ', metrics.r2_score(y_test, predictions)
    print 'F1 Score:', metrics.f1_score(y_test, predictions)
    print '_______________________________________________________________'
    #plt.scatter(y_test, predictions, alpha=0.007)

Round 1
Cross-validated scores: [0.99363057 0.97452229 1.         1.         0.98717949]
r2 score:  0.8803410128894753
F1 Score: 0.9464285714285715
_______________________________________________________________
Round 2
Cross-validated scores: [1.         0.97468354 1.         0.99358974 0.98076923]
r2 score:  0.9089050009295407
F1 Score: 0.9618320610687022
_______________________________________________________________
Round 3
Cross-validated scores: [0.99363057 0.96178344 1.         0.99363057 0.98089172]
r2 score:  0.9570458031996494
F1 Score: 0.9814814814814815
_______________________________________________________________


# Check How the weights will work in the Browser Extension

In [16]:
weights = classifier.coef_.toarray()[0]

In [17]:
features = countVec.get_feature_names()

def get_vector(sentence):
    vector = [0] * len(features)
    tokens = tokenize(sentence)
    for token in tokens:
        if token in features:
            vector[features.index(token)] += 1
    return vector

In [18]:
false_positives = 0
false_negatives = 0
for i, row in videos_en_new.iterrows():
    tot = 0
    vector = get_vector(row['sentence'])
    for j in range(len(weights)):
        tot += weights[j] * vector[j]
    if tot > 1:
        if row['isCoupon'] == 0:
            #print "False +: ", row['sentence']
            false_positives += 1
    else:
        if row['isCoupon'] == 1:
            false_negatives += 1
            #print "False -: ", row['sentence']

In [19]:
total_vids = videos_en_new.shape[0]
print "Total Inputs: ", total_vids
print "False Positives: ", false_positives # some of these are coupon codes
print "False Positive Percentage: %1.2f" % (false_positives/(total_vids*1.0) * 100), "%"
print "False Negatives: ", false_negatives # could be a result of imperfections in the cluster

Total Inputs:  77706
False Positives:  116
False Positive Percentage: 0.15 %
False Negatives:  10


# Export features and Weights

In [20]:
features = countVec.get_feature_names()
features_string = "const CouponFeatures = ["
for item in features:
    temp = item
    if item[0:2] == "b'":
        temp = item[2:]
    features_string += '"' + temp + '",'
features_string = features_string[0:-1] + "];"
weights_string = "const CouponWeights = ["
for item in weights:
    if item == 0:
        weights_string += "0, "
    else:
        weights_string += '%1.20f' % item + ", "
weights_string = weights_string[0:-2] + "];"
#print weights_string
#print features_string
print len(features), len(weights)

348 348
