In [1]:
import csv
import json
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Relative path of the input CSV; it is loaded with pd.read_csv in the next cell.
filepath = 'OPP-115/two_columns.csv'

In [3]:
# Load the raw CSV. Later cells read its "Category" and "Segements" columns
# (the misspelling is the column name this notebook reads from the file).
df1 = pd.read_csv(filepath)

In [4]:
# Flatten the annotation JSON of every row into [text, category, attribute]
# records, one record per attribute that has a usable "selectedText".

# Placeholder strings treated as "no text selected"; attributes whose
# selectedText equals one of these are skipped.
_NO_SELECTION = ('null', 'Not selected', '', ' ')

res = []
# Walk the two needed columns directly rather than DataFrame.iterrows(),
# which materialises a full Series per row and is considerably slower.
for Category, raw_segments in zip(df1["Category"], df1["Segements"]):
    # Each "Segements" cell is a JSON string that decodes to a dict whose
    # values are the per-attribute annotation dicts.
    Segements = json.loads(raw_segments)
    for value in Segements.values():
        if "selectedText" not in value:
            continue
        selected_text = value['selectedText']
        # Tuple (not set) membership: compares by equality and never needs
        # the value to be hashable.
        if selected_text in _NO_SELECTION:
            continue
        res.append([selected_text, Category, value['value']])

In [5]:
# Turn the extracted records into a frame; each record is
# [selected text, category, attribute value].
record_columns = ['text', 'category', 'attribute']
df = pd.DataFrame(res, columns=record_columns)
df

Unnamed: 0,text,category,attribute
0,Sci-News.com is committed to protecting and re...,Other,Introductory/Generic
1,nformation that you provide by filling in form...,First Party Collection/Use,Explicit
2,Sci-News.com may collect and process,First Party Collection/Use,Unspecified
3,including,First Party Collection/Use,Unspecified
4,"other purposes, for example when you report a ...",First Party Collection/Use,Basic service/feature
...,...,...,...
77759,information@mohegansun.com,Other,Privacy contact information
77760,If you have a question regarding any of the ab...,Other,Privacy contact information
77761,"1 Mohegan Sun Boulevard Uncasville, CT 06382 F...",Other,Privacy contact information
77762,"1 Mohegan Sun Boulevard Uncasville, CT 06382 F...",Other,Privacy contact information


In [6]:
# Number of extracted rows per category (shows how imbalanced the labels are).
df['category'].value_counts()

First Party Collection/Use              37281
Third Party Sharing/Collection          25013
User Choice/Control                      6145
Other                                    3444
User Access, Edit and Deletion           1691
Policy Change                            1216
Data Security                            1006
Data Retention                            942
International and Specific Audiences      936
Do Not Track                               90
Name: category, dtype: int64

In [7]:
# Keep only the text and its category; the attribute column is not used below.
df_cate = df.loc[:, ['text', 'category']]

In [8]:
def concat_func(x):
    """Collapse one group of rows into its distinct categories.

    Args:
        x: DataFrame holding the rows of a single group; must have a
            'category' column.

    Returns:
        A one-entry pd.Series keyed 'category' whose value is the array of
        unique categories in the group, in order of first appearance.
    """
    distinct_categories = x['category'].unique()
    return pd.Series({'category': distinct_categories})
# Group by text and aggregate: collect the distinct categories of every text,
# so a text annotated under several categories becomes one multi-label row.
# SeriesGroupBy.unique() does this natively; it avoids calling a Python
# function once per group through groupby().apply(), which is slow on this
# many groups and raises a DeprecationWarning on recent pandas versions.
df_cate1 = df_cate.groupby('text')['category'].unique().reset_index()

In [9]:
# Show the grouped frame: one row per distinct text with its array of categories.
df_cate1

Unnamed: 0,text,category
0,""" You are not required to provide information ...",[First Party Collection/Use]
1,"""Account"" on the homepage to sign in to your S...","[User Access, Edit and Deletion]"
2,"""Automatic information"" is information automat...",[First Party Collection/Use]
3,"""Automatic information"" is information automat...",[First Party Collection/Use]
4,"""Change Your Communication Preferences.",[User Choice/Control]
...,...,...
31321,zip code and state,[First Party Collection/Use]
31322,"zip code,",[First Party Collection/Use]
31323,"zip code, age, income",[First Party Collection/Use]
31324,zip code/postal code.,[First Party Collection/Use]


In [10]:
# Binarizer that will encode each text's category collection as a multi-hot vector.
mlb = MultiLabelBinarizer()

In [11]:
# Fit on the per-text category arrays and encode them as a 0/1 indicator
# matrix with one column per class found in the data.
labels = mlb.fit_transform(df_cate1['category'])

In [12]:
# One generic column name ("label0", "label1", ...) per column of the
# indicator matrix.
num_labels = labels.shape[1]
columns = [f"label{idx}" for idx in range(num_labels)]

In [13]:
# Show the generated label column names.
columns

['label0',
 'label1',
 'label2',
 'label3',
 'label4',
 'label5',
 'label6',
 'label7',
 'label8',
 'label9']

In [14]:
# Attach the indicator matrix to df_cate1 as new columns named by `columns`.
# Note: this modifies df_cate1 in place.
df_cate1[columns] = labels

In [15]:
# Final modelling frame: the text followed by its label indicator columns
# (the raw 'category' arrays are dropped).
df_cate2 = df_cate1[["text", *columns]]

In [16]:
# Show the text + multi-hot label frame that gets split and exported below.
df_cate2

Unnamed: 0,text,label0,label1,label2,label3,label4,label5,label6,label7,label8,label9
0,""" You are not required to provide information ...",0,0,0,1,0,0,0,0,0,0
1,"""Account"" on the homepage to sign in to your S...",0,0,0,0,0,0,0,0,1,0
2,"""Automatic information"" is information automat...",0,0,0,1,0,0,0,0,0,0
3,"""Automatic information"" is information automat...",0,0,0,1,0,0,0,0,0,0
4,"""Change Your Communication Preferences.",0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
31321,zip code and state,0,0,0,1,0,0,0,0,0,0
31322,"zip code,",0,0,0,1,0,0,0,0,0,0
31323,"zip code, age, income",0,0,0,1,0,0,0,0,0,0
31324,zip code/postal code.,0,0,0,1,0,0,0,0,0,0


In [17]:
# Two-stage split with a fixed seed: hold out 30% of the rows, then cut that
# hold-out in half to obtain the dev and test sets.
SPLIT_SEED = 2022
train, temp = train_test_split(df_cate2, random_state=SPLIT_SEED, test_size=0.3)
dev, test = train_test_split(temp, random_state=SPLIT_SEED, test_size=0.5)

In [18]:
# Persist the three splits as CSV files under OPP-115/data/.
# Create the output directory first: to_csv fails if the parent folder is
# missing, e.g. on a fresh checkout.
os.makedirs('OPP-115/data', exist_ok=True)
# index=False is the documented way to leave the row index out of the file.
train.to_csv('OPP-115/data/train.csv', index=False)
dev.to_csv('OPP-115/data/dev.csv', index=False)
test.to_csv('OPP-115/data/test.csv', index=False)

In [19]:
# Class names as learned by the binarizer; position i in this list is the
# class encoded by column i of the indicator matrix.
class_ = [name for name in mlb.classes_]
class_

['Data Retention',
 'Data Security',
 'Do Not Track',
 'First Party Collection/Use',
 'International and Specific Audiences',
 'Other',
 'Policy Change',
 'Third Party Sharing/Collection',
 'User Access, Edit and Deletion',
 'User Choice/Control']

In [20]:
# Write the class names to class.txt, one name per line, in list order.
with open('OPP-115/data/class.txt', 'w', encoding='utf-8') as f:
    f.writelines(name + '\n' for name in class_)

In [21]:
# df_cate1.loc[16090, 'category']