In [1]:
import pickle
import random

import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [2]:
## read in the training data
# NOTE: unpickling can execute arbitrary code stored in the file, so only
# load pickles that come from a trusted source.
DATA_PATH = "./Data/sentence_score_conso10.pk"
with open(DATA_PATH, 'rb') as f:
    data = pickle.load(f)

In [3]:
# Preview the first five rows (five is the default row count of head()).
data.head()

Unnamed: 0,Website,Sentence,Data Retention,Data Security,Do Not Track,First Party Collection/Use,International and Specific Audiences,Not_used,Policy Change,Third Party Sharing/Collection,"User Access, Edit and Deletion",User Choice/Control
0,playstation.com,"Privacy Policy Last Revised: April, 2011",,,,,,0.0,,,,
1,playstation.com,"Sony Computer Entertainment America LLC (""SCEA...",,,,,,0.0,,,,
2,playstation.com,This privacy policy is intended to provide you...,,,,,,0.0,,,,
3,playstation.com,"If you have any questions, complaints or comme...",,,,,,0.0,,,,
4,playstation.com,This Privacy Statement and the certification s...,,,,,,0.0,,,,


In [4]:
# Look at the "Data Security" scores next to their sentence text.
security_columns = ["Sentence", "Data Security"]
datasecurity = data[security_columns]
# Rows without a "Data Security" value. The selection is neither stored nor
# the cell's last expression, so it is evaluated and then discarded.
datasecurity.loc[datasecurity["Data Security"].isna()]
# Shape of the first 423 rows; this is the value the cell displays.
datasecurity.head(423).shape

(423, 2)

In [5]:
# Quick check of random.shuffle: it reorders the list in place.
a = list(range(1, 5))
random.shuffle(a)
print(a)

[2, 4, 3, 1]


In [6]:

def getXY(scoretable, category, trainportion, random_state=None):
    """Build a stratified train/validation split for one category.

    A row is a positive example (label 1) when both its "Sentence" and its
    ``category`` value are present, and a negative example (label 0) when its
    ``category`` value is missing. Positives and negatives are shuffled and
    split separately so both sets keep the same class balance, then each set
    is shuffled again so the classes are interleaved.

    Parameters
    ----------
    scoretable : pandas.DataFrame
        Table containing a "Sentence" column and a ``category`` column.
    category : str
        Name of the column whose presence/absence defines the label.
    trainportion : float
        Fraction of each class used for training; the rest is validation.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for every shuffle. The default
        ``None`` keeps the shuffles non-deterministic; pass an int to get a
        reproducible split.

    Returns
    -------
    tuple of four lists
        ``(trainTexts, valTexts, trainLabels, valLabels)``.
    """
    categoryFrame = scoretable[["Sentence", category]]

    # assign() returns a new frame, so the caller's table is never modified
    # and the labels come out as plain ints.
    positives = categoryFrame.dropna().assign(**{category: 1})
    negatives = categoryFrame.loc[categoryFrame[category].isna()].assign(**{category: 0})

    def _split(frame):
        # Shuffle one class and cut it at the requested train fraction.
        shuffled = frame.sample(frac=1, random_state=random_state)
        cut = int(shuffled.shape[0] * trainportion)
        # The validation part starts exactly at `cut`: starting at `cut + 1`
        # would silently drop one sample of this class from the split.
        return shuffled.iloc[:cut], shuffled.iloc[cut:]

    trainPos, valPos = _split(positives)
    trainNeg, valNeg = _split(negatives)

    # Re-shuffle after concatenation so positives and negatives are mixed.
    trainData = pd.concat([trainPos, trainNeg], axis=0).sample(frac=1, random_state=random_state)
    valData = pd.concat([valPos, valNeg], axis=0).sample(frac=1, random_state=random_state)

    trainTexts = trainData["Sentence"].tolist()
    trainLabels = trainData[category].tolist()

    valTexts = valData["Sentence"].tolist()
    valLabels = valData[category].tolist()

    return trainTexts, valTexts, trainLabels, valLabels

def modelAndValidate(datatable, category, vectorizer, classifier, trainportion):
    """Train ``classifier`` on one category and score it on held-out data.

    The texts and labels come from ``getXY(datatable, category, trainportion)``.
    ``vectorizer`` is fitted on the training texts only and then used to
    transform both the training and the validation texts. Both ``vectorizer``
    and ``classifier`` are fitted in place, and the returned "model" entry is
    the very same ``classifier`` object that was passed in.

    Returns a dict with the keys "model", "validation_labels", "predictions"
    and "f1_score" (weighted-average F1 on the validation set).
    """
    # split the table into training and validation texts/labels
    trainTexts, heldOutTexts, trainTargets, heldOutTargets = getXY(datatable, category, trainportion)

    # learn the vocabulary from the training texts only, then featurize both sets
    vectorizer.fit(trainTexts)
    trainFeatures = vectorizer.transform(trainTexts)
    heldOutFeatures = vectorizer.transform(heldOutTexts)

    # train, then predict the validation samples
    classifier.fit(trainFeatures, trainTargets)
    predicted = classifier.predict(heldOutFeatures)
    weightedF1 = f1_score(heldOutTargets, predicted, average="weighted")

    return {
        "model": classifier,
        "validation_labels": heldOutTargets,
        "predictions": predicted,
        "f1_score": weightedF1,
    }
    

In [7]:
%%time

## pick the category and set up one vectorizer / classifier pair
category = "Data Security"
classifierFinal = RandomForestClassifier(n_estimators=500)
vectorizerFinal = TfidfVectorizer(stop_words="english", max_df=0.2)

## fit on 80% of each class and print the weighted F1 score
results = modelAndValidate(
    datatable=data,
    category=category,
    vectorizer=vectorizerFinal,
    classifier=classifierFinal,
    trainportion=0.8,
)
print(results["f1_score"])

0.946380533612
CPU times: user 16.3 s, sys: 112 ms, total: 16.4 s
Wall time: 16.4 s


### Test models

In [8]:
%%time

# get list of categories: every column that is not metadata or the text itself
NON_CATEGORY_COLUMNS = ("Not_used", "Website", "Sentence")
categories = [col for col in data.columns if col not in NON_CATEGORY_COLUMNS]

# TODO: Remove this and run on full dataset
categories = ['Data Retention','Data Security', 'Do Not Track']

## initialize vectorizers (unfitted templates, each defined exactly once)
vectorizers = {
    'Tf-idf, unigrams': TfidfVectorizer(max_df=0.2, stop_words="english"),
    'Tf-idf, uni+bigrams': TfidfVectorizer(max_df=0.2, stop_words="english", ngram_range=(1, 2)),
}

## initialize classifiers (unfitted templates)
classifiers = {
    'RandomForest, n=500': RandomForestClassifier(n_estimators=500),
    'AdaBoost, n=500': AdaBoostClassifier(n_estimators=500),
}

# allResults[vectorizer name][classifier name][category] -> dict from modelAndValidate
allResults = {}

for vname, vectorizer in vectorizers.items():
    allResults[vname] = {}

    for cname, classifier in classifiers.items():
        allResults[vname][cname] = {}

        for category in categories:
            print("{} {} {}".format(vname,cname,category))
            # modelAndValidate fits the estimators it receives in place and
            # returns the classifier itself as results["model"]. Passing fresh
            # clones gives each run its own fitted model; reusing one object
            # would leave every stored "model" pointing at the last fit only.
            results = modelAndValidate(data, category, clone(vectorizer), clone(classifier), 0.8)
            allResults[vname][cname][category] = results

Tf-idf, unigrams RandomForest, n=500 Data Retention
Tf-idf, unigrams RandomForest, n=500 Data Security
Tf-idf, unigrams RandomForest, n=500 Do Not Track
Tf-idf, unigrams AdaBoost, n=500 Data Retention
Tf-idf, unigrams AdaBoost, n=500 Data Security
Tf-idf, unigrams AdaBoost, n=500 Do Not Track
Tf-idf, uni+bigrams RandomForest, n=500 Data Retention
Tf-idf, uni+bigrams RandomForest, n=500 Data Security
Tf-idf, uni+bigrams RandomForest, n=500 Do Not Track
Tf-idf, uni+bigrams AdaBoost, n=500 Data Retention
Tf-idf, uni+bigrams AdaBoost, n=500 Data Security
Tf-idf, uni+bigrams AdaBoost, n=500 Do Not Track
CPU times: user 2min 39s, sys: 591 ms, total: 2min 39s
Wall time: 2min 39s


### Print results

In [10]:
# one table row per vectorizer/classifier combination:
# [vectorizer name, classifier name, f1 score for each category...]
rows = []
for vname in vectorizers:
    for cname in classifiers:
        perCategory = allResults[vname][cname]
        rows.append([vname, cname] + [perCategory[category]["f1_score"] for category in categories])

# create a pandas dataframe from results, with one score column per category
cols = ['Vectorizer', 'Classifier'] + list(categories)
df = pd.DataFrame(rows, columns=cols)
df.round(4)

Unnamed: 0,Vectorizer,Classifier,Data Retention,Data Security,Do Not Track
0,"Tf-idf, unigrams","RandomForest, n=500",0.9715,0.9495,0.9942
1,"Tf-idf, unigrams","AdaBoost, n=500",0.9723,0.9411,0.9957
2,"Tf-idf, uni+bigrams","RandomForest, n=500",0.9731,0.9499,0.9939
3,"Tf-idf, uni+bigrams","AdaBoost, n=500",0.9764,0.9422,0.9929
