In [2]:
import gzip  # for compressing pickled data
import pickle
import random

import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [3]:
## Load the training data from its pickle file (path is relative to the notebook).
# NOTE(review): pickle.load can execute arbitrary code -- only open files from a trusted source.
with open("./Data/sentence_score_conso10.pk", 'rb') as pickleFile:
    data = pickle.load(pickleFile)

In [4]:
# Show the first two rows as a quick sanity check of the loaded table.
data.head(2)

Unnamed: 0,Website,Sentence,Data Retention,Data Security,Do Not Track,First Party Collection/Use,International and Specific Audiences,Not_used,Policy Change,Third Party Sharing/Collection,"User Access, Edit and Deletion",User Choice/Control
0,playstation.com,"Privacy Policy Last Revised: April, 2011",,,,,,0.0,,,,
1,playstation.com,"Sony Computer Entertainment America LLC (""SCEA...",,,,,,0.0,,,,


In [5]:

def getXY(scoretable, category, trainportion, random_state=None):
    """Build a per-class (stratified) train/validation split for one category.

    A row is a positive example (label 1) when neither its "Sentence" nor its
    `category` cell is missing, and a negative example (label 0) when its
    `category` cell is missing. Positives and negatives are shuffled and cut
    separately so both splits keep roughly the same class balance, then each
    split is shuffled once more so the classes are interleaved.

    Parameters
    ----------
    scoretable : pandas.DataFrame
        Must contain a "Sentence" column and a column named `category`.
    category : str
        Name of the column that defines the labels.
    trainportion : float
        Fraction of each class assigned to training; the remainder of that
        class goes to validation.
    random_state : optional
        Forwarded to every `DataFrame.sample` call. Pass an int to get the
        same split on every call; the default (None) reshuffles each time.

    Returns
    -------
    tuple of four lists
        (trainTexts, valTexts, trainLabels, valLabels).
    """
    categoryFrame = scoretable[["Sentence", category]]

    # `assign` returns a new frame, so the caller's table is never modified.
    positives = categoryFrame.dropna().assign(**{category: 1})
    negatives = categoryFrame.loc[pd.isnull(categoryFrame[category])].assign(**{category: 0})

    def _shuffle_and_cut(frame):
        # Shuffle one class, then cut it at a single index so that every row
        # lands in exactly one split (the earlier `cut + 1` start for the
        # validation slice silently discarded one row per class).
        shuffled = frame.sample(frac=1, random_state=random_state)
        cut = int(shuffled.shape[0] * trainportion)
        return shuffled.iloc[:cut], shuffled.iloc[cut:]

    trainPos, valPos = _shuffle_and_cut(positives)
    trainNeg, valNeg = _shuffle_and_cut(negatives)

    # Re-shuffle so positives and negatives are mixed within each split.
    trainData = pd.concat([trainPos, trainNeg], axis=0).sample(frac=1, random_state=random_state)
    valData = pd.concat([valPos, valNeg], axis=0).sample(frac=1, random_state=random_state)

    trainTexts = trainData["Sentence"].tolist()
    trainLabels = trainData[category].tolist()

    valTexts = valData["Sentence"].tolist()
    valLabels = valData[category].tolist()

    return trainTexts, valTexts, trainLabels, valLabels


def validate(vectorizer, classifier, traintexts, valTexts, trainlabels, valLabels):
    """Fit `classifier` on the training texts and score it on the validation texts.

    The vectorizer is only used through `transform` (it is not fitted here).
    The classifier is fitted in place, and that same object is what ends up
    under the "model" key of the result.

    Returns a dict with the keys "model", "validation_labels", "predictions"
    and "f1_score" (F1 computed with average="weighted").
    """
    # Turn both text lists into feature matrices.
    trainFeatures = vectorizer.transform(traintexts)
    valFeatures = vectorizer.transform(valTexts)

    # Train, then predict the held-out samples.
    classifier.fit(trainFeatures, trainlabels)
    predicted = classifier.predict(valFeatures)

    return {
        "model": classifier,
        "validation_labels": valLabels,
        "predictions": predicted,
        "f1_score": f1_score(valLabels, predicted, average="weighted"),
    }
    

### Create vectorizers

In [6]:
%%time

trainportion = 0.8

# get list of categories
categories = list(data)
categories.remove("Not_used")
categories.remove("Website")
categories.remove("Sentence")

vectorizers = {}

for category in categories:
    ## initialize a vectorizer using unigrams and bigrams
    vectorizer=TfidfVectorizer(max_df=0.2, stop_words="english", ngram_range = (1,2))

    # generate the data
    traintexts, valTexts, trainlabels, valLabels = getXY(data, category, trainportion)

    # fit a vectorizer
    vectorizers[category] = vectorizer.fit(traintexts)


CPU times: user 3.89 s, sys: 73.5 ms, total: 3.96 s
Wall time: 3.96 s


### Write vectorizers to disk
Note: the vectorizers take only a few seconds to regenerate but consume a lot of disk space, so consider skipping this step.

In [7]:
# Persist the dict of fitted vectorizers as a gzip-compressed pickle.
with gzip.open('./Data/vectorizers.pk.gz', 'wb') as vectorizerFile:
    pickle.dump(vectorizers, vectorizerFile)

### Train and test classifiers

In [16]:
%%time

trainportion = 0.8

# get list of categories
categories = list(data)
categories.remove("Not_used")
categories.remove("Website")
categories.remove("Sentence")

# TODO: Run on full dataset
#categories = ['Data Retention','Data Security', 'Do Not Track']

## initialize classifiers
classifiers = {}
classifiers['RandomForest, n=500'] = RandomForestClassifier(n_estimators=500)
classifiers['AdaBoost, n=500'] = AdaBoostClassifier(n_estimators=500)

allResults = {}

for cname in classifiers:
    allResults[cname] = {}

    for category in categories:
        print("{} : {}".format(cname,category))
        
        vectorizer = vectorizers[category]
        classifier = classifiers[cname]
        
        # generate the data
        traintexts, valTexts, trainlabels, valLabels = getXY(data, category, trainportion)

        results = validate(vectorizer, classifier, traintexts, valTexts, trainlabels, valLabels)
        allResults[cname][category] = results
        
print()

RandomForest, n=500 : Data Retention
RandomForest, n=500 : Data Security
RandomForest, n=500 : Do Not Track
RandomForest, n=500 : First Party Collection/Use
RandomForest, n=500 : International and Specific Audiences
RandomForest, n=500 : Policy Change
RandomForest, n=500 : Third Party Sharing/Collection
RandomForest, n=500 : User Access, Edit and Deletion
RandomForest, n=500 : User Choice/Control
AdaBoost, n=500 : Data Retention
AdaBoost, n=500 : Data Security
AdaBoost, n=500 : Do Not Track
AdaBoost, n=500 : First Party Collection/Use
AdaBoost, n=500 : International and Specific Audiences
AdaBoost, n=500 : Policy Change
AdaBoost, n=500 : Third Party Sharing/Collection
AdaBoost, n=500 : User Access, Edit and Deletion
AdaBoost, n=500 : User Choice/Control

CPU times: user 1h 29min 4s, sys: 12.4 s, total: 1h 29min 16s
Wall time: 1h 29min 27s


### Print test results

In [17]:
# One row per classifier: its name followed by the F1 score of every category.
rows = [
    [cname] + [allResults[cname][category]["f1_score"] for category in categories]
    for cname in classifiers
]

# Column headers line up with the row layout above.
cols = ['Classifier'] + list(categories)

# Build the summary table and display it rounded to four decimals.
df = pd.DataFrame(rows, columns=cols)
df.round(4)

Unnamed: 0,Classifier,Data Retention,Data Security,Do Not Track,First Party Collection/Use,International and Specific Audiences,Policy Change,Third Party Sharing/Collection,"User Access, Edit and Deletion",User Choice/Control
0,"RandomForest, n=500",0.9718,0.941,0.9942,0.7877,0.9549,0.9698,0.8394,0.9537,0.8765
1,"AdaBoost, n=500",0.9695,0.9448,0.9943,0.7845,0.9586,0.9763,0.8338,0.9618,0.8793


### Write classifiers to disk

In [18]:
# For now a single algorithm supplies the model for every category.
cname = "RandomForest, n=500"

# category name -> model object stored by the training loop
models = {category: allResults[cname][category]["model"] for category in categories}

with gzip.open('./Data/classifiers_rforest.pk.gz', 'wb') as modelFile:
    pickle.dump(models, modelFile)

## output file was 18 MB when this cell was last run

In [19]:
# For now a single algorithm supplies the model for every category.
cname = "AdaBoost, n=500"

# category name -> model object stored by the training loop
models = {category: allResults[cname][category]["model"] for category in categories}

with gzip.open('./Data/classifiers_adaboost.pk.gz', 'wb') as modelFile:
    pickle.dump(models, modelFile)

# output file was 99 KB when this cell was last run