In [1]:
import pickle
import random
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfTransformer
import gzip # for compressing pickled data

In [2]:
## read in the training data
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source
with open("./Data/sentence_score_conso10.pk", mode='rb') as pickle_file:
    data = pickle.load(pickle_file)

In [3]:
# preview the first two rows of the loaded table
data.head(n=2)

Unnamed: 0,Website,Sentence,Data Retention,Data Security,Do Not Track,First Party Collection/Use,International and Specific Audiences,Not_used,Policy Change,Third Party Sharing/Collection,"User Access, Edit and Deletion",User Choice/Control
0,playstation.com,"Privacy Policy Last Revised: April, 2011",,,,,,0.0,,,,
1,playstation.com,"Sony Computer Entertainment America LLC (""SCEA...",,,,,,0.0,,,,


In [4]:

def _shuffle_and_split(frame, trainportion, random_state=None):
    """Shuffle ``frame`` and cut it into a (train, validation) pair of frames."""
    shuffled = frame.sample(frac=1, random_state=random_state)
    cut = int(shuffled.shape[0] * trainportion)
    # The validation part starts exactly at ``cut`` so that no row is dropped
    # between the two parts.
    return shuffled.iloc[:cut], shuffled.iloc[cut:]


def getXY(scoretable, category, trainportion, random_state=None):
    """Build a train/validation split of sentences for one category.

    A row is labelled 1 when both its "Sentence" and ``category`` cells are
    non-null, and 0 when its ``category`` cell is null. Positive and negative
    rows are shuffled and split separately (equal-portion sampling), then the
    two training parts and the two validation parts are concatenated and
    shuffled again.

    Parameters
    ----------
    scoretable : pandas.DataFrame
        Table with a "Sentence" column and a column named ``category``.
    category : str
        Name of the column whose null / non-null cells define the labels.
    trainportion : float
        Fraction of each group used for training; the cut index of a group
        is ``int(group_size * trainportion)`` and the rest is validation.
    random_state : optional
        Forwarded to every ``DataFrame.sample`` call. The default ``None``
        keeps the shuffles unseeded; pass an int for a reproducible split.

    Returns
    -------
    tuple of four lists
        ``(trainTexts, valTexts, trainLabels, valLabels)``.
    """
    categoryFrame = scoretable[["Sentence", category]]

    # positive group: rows where nothing is missing
    positives = categoryFrame.dropna().copy()
    positives[category] = 1

    # negative group: rows where the category cell is missing
    negatives = categoryFrame.loc[pd.isnull(categoryFrame[category])].copy()
    negatives[category] = 0

    ## equal portion sampling, sample from both positive and negative group
    trainPos, valPos = _shuffle_and_split(positives, trainportion, random_state)
    trainNeg, valNeg = _shuffle_and_split(negatives, trainportion, random_state)

    # mix positives and negatives so neither split is ordered by label
    trainData = pd.concat([trainPos, trainNeg], axis=0).sample(frac=1, random_state=random_state)
    valData = pd.concat([valPos, valNeg], axis=0).sample(frac=1, random_state=random_state)

    trainTexts = trainData["Sentence"].tolist()
    trainLabels = trainData[category].tolist()

    valTexts = valData["Sentence"].tolist()
    valLabels = valData[category].tolist()

    return trainTexts, valTexts, trainLabels, valLabels

def modelAndValidate(datatable, category, vectorizer, classifier, trainportion):
    """Fit ``vectorizer`` and ``classifier`` for one category and score the result.

    The sentences of ``datatable`` are split by ``getXY``; the vectorizer is
    fitted on the training texts only and then applied to both splits; the
    classifier is trained on the training features and evaluated on the
    validation features with a weighted F1 score.

    Returns a dict with the keys "vectorizer", "model", "validation_labels",
    "predictions" and "f1_score".
    """
    # split the sentences into a training and a validation portion
    fit_texts, held_texts, fit_labels, held_labels = getXY(datatable, category, trainportion)

    # learn the vocabulary from the training texts, then featurize both splits
    vectorizer.fit(fit_texts)
    fit_features = vectorizer.transform(fit_texts)
    held_features = vectorizer.transform(held_texts)

    # train on the training features and predict the held-out sentences
    classifier.fit(fit_features, fit_labels)
    held_predictions = classifier.predict(held_features)
    weighted_f1 = f1_score(held_labels, held_predictions, average="weighted")

    return {
        "vectorizer": vectorizer,
        "model": classifier,
        "validation_labels": held_labels,
        "predictions": held_predictions,
        "f1_score": weighted_f1,
    }


### Train and test classifiers

In [5]:
%%time

# fraction of each category's sentences used for training; the rest is validation
trainportion = 0.8

# get list of categories: every column of the table except the three removed below
categories = list(data)
categories.remove("Not_used")
categories.remove("Website")
categories.remove("Sentence")

# TODO: Run on full dataset
#categories = ['Do Not Track']

# per-category result dicts as returned by modelAndValidate
allResults = {}

for category in categories:
    print(category)

    # initialize a fresh vectorizer and classifier for this category
    vectorizer = TfidfVectorizer(max_df=0.2, stop_words="english", ngram_range = (1,2)) 
    classifier = RandomForestClassifier(n_estimators=500)

    # modelAndValidate draws its own train/validation split, so no separate
    # getXY call is needed here; pass the configured trainportion through
    results = modelAndValidate(data, category, vectorizer, classifier, trainportion)

    allResults[category] = results
        
print()

Data Retention
Data Security
Do Not Track
First Party Collection/Use
International and Specific Audiences
Policy Change
Third Party Sharing/Collection
User Access, Edit and Deletion
User Choice/Control

CPU times: user 8min 22s, sys: 1.17 s, total: 8min 23s
Wall time: 8min 24s


### Print test results

In [6]:
# collect the validation F1 score of every category, in category order
f1_scores = [allResults[name]["f1_score"] for name in categories]

# a single-row table: one column per category
rows = [list(f1_scores)]
cols = list(categories)

# create a pandas dataframe from results
df = pd.DataFrame(rows, columns=cols)
df.round(4)

Unnamed: 0,Data Retention,Data Security,Do Not Track,First Party Collection/Use,International and Specific Audiences,Policy Change,Third Party Sharing/Collection,"User Access, Edit and Deletion",User Choice/Control
0,0.9734,0.9379,0.9961,0.7901,0.9522,0.9745,0.8394,0.9561,0.8812


### Write vectorizers and classifiers to disk

In [7]:
# split the per-category results into two lookup tables keyed by category name
vectorizers = {name: allResults[name]["vectorizer"] for name in categories}
models = {name: allResults[name]["model"] for name in categories}

# write each table as a gzip-compressed pickle, vectorizers first
for target_path, payload in (
    ('./Data/vectorizers.pk.gz', vectorizers),
    ('./Data/classifiers_rforest.pk.gz', models),
):
    with gzip.open(target_path, 'wb') as f:
        pickle.dump(payload, f)

## output file is 18 MB